typeset/parse.go

79 lines
1.7 KiB
Go

package typeset
import "unicode"
// TODO perhaps follow https://unicode.org/reports/tr14/

// parseString converts text into a slice of runeLayout (one per rune, with
// only the run field filled in here) and a slice of tokens that partition
// those runes in order. Each token's runes field is a sub-slice of the
// returned rune slice, so the two results share memory.
//
// Tokens are formed as follows:
//   - a "\n", or a "\r" immediately followed by "\n", is a line break token
//     of its own;
//   - a run of consecutive tabs is one tab token;
//   - a run of any other whitespace (per unicode.IsSpace) is one space token;
//   - a run of anything else is one word token.
//
// A "\r" that is not followed by "\n" starts a token whose kind is the zero
// value of tokenKind, which is presumably the word kind (verify against the
// tokenKind declaration).
//
// An empty text yields an empty rune slice and no tokens.
func parseString(text string) ([]runeLayout, []token) {
	// Build the complete rune slice before producing any token. Tokens hold
	// sub-slices of it, so if it were still growing while tokens were being
	// created, a reallocation would leave the earlier tokens referencing the
	// old backing array.
	runes := make([]runeLayout, 0, len(text)*2/3)
	for _, run := range text {
		runes = append(runes, runeLayout{run: run})
	}

	// Rough initial capacity only; append grows the slice when needed.
	tokens := make([]token, 0, len(runes)/5)

	var (
		index         int   // position of the rune currently being examined
		startingIndex int   // position of the first rune of the pending token
		lastRune      rune  // rune examined on the previous iteration
		tok           token // pending token; its runes are assigned on flush
	)

	// tokenBoundary emits the pending token if it spans at least one rune,
	// then resets the pending token to its zero value, starting at index.
	tokenBoundary := func() {
		if startingIndex != index {
			tok.runes = runes[startingIndex:index]
			startingIndex = index
			tokens = append(tokens, tok)
		}
		tok = token{}
	}

	// mustBeInToken ensures the pending token has the given kind, ending
	// the previous token first when the kind changes.
	mustBeInToken := func(kind tokenKind) {
		if tok.kind != kind {
			tokenBoundary()
			tok.kind = kind
		}
	}

	// parse tokens
	for index = range runes {
		run := runes[index].run
		switch {
		case run == '\r':
			tokenBoundary()
			// we don't know the token type yet. if the next rune is a
			// \n then this is a CRLF line break. if not, the token
			// keeps its zero-value kind.
		case run == '\n':
			if lastRune != '\r' {
				// a bare \n always begins a new token
				tokenBoundary()
			}
			// otherwise this continues the \r to make a CRLF line break
			tok.kind = tokenKindLineBreak
		case run == '\t':
			mustBeInToken(tokenKindTab)
		case unicode.IsSpace(run):
			mustBeInToken(tokenKindSpace)
		default:
			mustBeInToken(tokenKindWord)
		}
		lastRune = run
	}

	// Flush the final token. The end position is set explicitly rather than
	// by incrementing index: when runes is empty the loop never ran, index is
	// still 0, and an increment would make the flush slice past the end.
	index = len(runes)
	tokenBoundary()
	return runes, tokens
}