71 lines
1.5 KiB
Go
71 lines
1.5 KiB
Go
|
package typeset
|
||
|
|
||
|
import "unicode"
|
||
|
|
||
|
// TODO: perhaps implement the Unicode Line Breaking Algorithm (UAX #14)
// instead of the ad-hoc rules below: https://unicode.org/reports/tr14/
|
||
|
|
||
|
func parseString (text string) ([]runeLayout, []token) {
|
||
|
// TODO find an optimal size for both of these to minimize allocs. will
|
||
|
// require some testing.
|
||
|
runes := make([]runeLayout, 0, len(text) * 2 / 3)
|
||
|
tokens := make([]token, 0, len(text) / 4)
|
||
|
|
||
|
var index int
|
||
|
var startingIndex int
|
||
|
var run rune
|
||
|
var lastRune rune
|
||
|
|
||
|
var tok token
|
||
|
tokenBoundary := func () {
|
||
|
if startingIndex != index {
|
||
|
tok.runes = runes[startingIndex:index]
|
||
|
startingIndex = index
|
||
|
tokens = append(tokens, tok)
|
||
|
}
|
||
|
tok = token { }
|
||
|
}
|
||
|
mustBeInToken := func (kind tokenKind) {
|
||
|
if tok.kind != kind {
|
||
|
tokenBoundary()
|
||
|
tok.kind = kind
|
||
|
}
|
||
|
}
|
||
|
|
||
|
for index, run = range text {
|
||
|
runes = append(runes, runeLayout {
|
||
|
run: run,
|
||
|
})
|
||
|
|
||
|
switch {
|
||
|
case run == '\r':
|
||
|
tokenBoundary()
|
||
|
// we don't know the token type yet. if next rune is a
|
||
|
// \n then this is a CRLF line break. if not, this is
|
||
|
// just a word.
|
||
|
|
||
|
case run == '\n':
|
||
|
if lastRune == '\r' {
|
||
|
// continue the \r to make a CRLF line break
|
||
|
tok.kind = tokenKindLineBreak
|
||
|
} else {
|
||
|
tokenBoundary()
|
||
|
tok.kind = tokenKindLineBreak
|
||
|
}
|
||
|
|
||
|
case run == '\t':
|
||
|
mustBeInToken(tokenKindTab)
|
||
|
|
||
|
case unicode.IsSpace(run):
|
||
|
mustBeInToken(tokenKindSpace)
|
||
|
|
||
|
default:
|
||
|
mustBeInToken(tokenKindWord)
|
||
|
}
|
||
|
lastRune = run
|
||
|
}
|
||
|
index ++ // make index equal to len([]rune(text))
|
||
|
|
||
|
tokenBoundary()
|
||
|
return runes, tokens
|
||
|
}
|