78 lines
1.7 KiB
Go
78 lines
1.7 KiB
Go
package typeset
|
|
|
|
import "unicode"
|
|
|
|
// TODO: consider following the Unicode Line Breaking Algorithm (UAX #14):
// https://unicode.org/reports/tr14/
|
|
|
|
func parseString (text string) ([]runeLayout, []token) {
|
|
// TODO find an optimal size for both of these to minimize allocs. will
|
|
// require some testing.
|
|
runes := make([]runeLayout, 0, len(text) * 2 / 3)
|
|
tokens := make([]token, 0, len(text) / 4)
|
|
|
|
var index int
|
|
var startingIndex int
|
|
var runl runeLayout
|
|
var lastRune rune
|
|
|
|
var tok token
|
|
tokenBoundary := func () {
|
|
if startingIndex != index {
|
|
tok.runes = runes[startingIndex:index]
|
|
startingIndex = index
|
|
tokens = append(tokens, tok)
|
|
}
|
|
tok = token { }
|
|
}
|
|
mustBeInToken := func (kind tokenKind) {
|
|
if tok.kind != kind {
|
|
tokenBoundary()
|
|
tok.kind = kind
|
|
}
|
|
}
|
|
|
|
// build the rune slice
|
|
// we need to do this before parsing into tokens, because otherwise
|
|
// a realloc will occur in the middle of it and the tokens at the start
|
|
// will be referencing old memory
|
|
for _, run := range text {
|
|
runes = append(runes, runeLayout {
|
|
run: run,
|
|
})
|
|
}
|
|
|
|
// parse tokens
|
|
for index, runl = range runes {
|
|
switch {
|
|
case runl.run == '\r':
|
|
tokenBoundary()
|
|
// we don't know the token type yet. if next rune is a
|
|
// \n then this is a CRLF line break. if not, this is
|
|
// just a word.
|
|
|
|
case runl.run == '\n':
|
|
if lastRune == '\r' {
|
|
// continue the \r to make a CRLF line break
|
|
tok.kind = tokenKindLineBreak
|
|
} else {
|
|
tokenBoundary()
|
|
tok.kind = tokenKindLineBreak
|
|
}
|
|
|
|
case runl.run == '\t':
|
|
mustBeInToken(tokenKindTab)
|
|
|
|
case unicode.IsSpace(runl.run):
|
|
mustBeInToken(tokenKindSpace)
|
|
|
|
default:
|
|
mustBeInToken(tokenKindWord)
|
|
}
|
|
lastRune = runl.run
|
|
}
|
|
index ++ // make index equal to len([]rune(text))
|
|
|
|
tokenBoundary()
|
|
return runes, tokens
|
|
}
|