package typeset

import (
	"unicode"
	"unicode/utf8"
)

// TODO perhaps follow https://unicode.org/reports/tr14/

// parseString decodes text into one runeLayout per rune and groups those runes
// into tokens. It returns the rune slice together with the tokens; every
// token's runes field is a sub-slice of the returned rune slice, so the two
// results share storage.
//
// Tokens are formed as follows:
//   - consecutive '\t' runes form one tokenKindTab token;
//   - consecutive runes for which unicode.IsSpace reports true (other than
//     '\t', '\n' and '\r') form one tokenKindSpace token;
//   - each "\n" or "\r\n" forms its own tokenKindLineBreak token;
//   - consecutive runes of any other sort form one tokenKindWord token;
//   - a '\r' that is not followed by '\n' starts a token whose kind is left
//     at the zero tokenKind (presumably tokenKindWord, in which case it
//     merges with a word that follows it; confirm against the tokenKind
//     declaration).
//
// An empty text yields an empty rune slice and no tokens.
func parseString(text string) ([]runeLayout, []token) {
	// Decode every rune before any token is cut. Tokens keep sub-slices of
	// runes, so runes must never be reallocated by a later append, and token
	// bounds must be rune indices rather than the byte offsets that ranging
	// over a string produces.
	runes := make([]runeLayout, 0, utf8.RuneCountInString(text))
	for _, run := range text {
		runes = append(runes, runeLayout{run: run})
	}

	// TODO find an optimal size for this to minimize allocs. will require
	// some testing.
	tokens := make([]token, 0, len(text)/4)

	var (
		start    int   // index in runes where the pending token begins
		tok      token // pending token; its runes are assigned when it is closed
		lastRune rune  // rune handled by the previous iteration
	)

	// closeToken ends the pending token just before runes[end] and begins a
	// fresh one with the zero kind. A pending token without runes is dropped.
	closeToken := func(end int) {
		if start != end {
			tok.runes = runes[start:end]
			start = end
			tokens = append(tokens, tok)
		}
		tok = token{}
	}

	// ensureKind keeps extending the pending token if it already has the
	// wanted kind; otherwise it closes it before runes[end] and starts a
	// token of that kind.
	ensureKind := func(end int, kind tokenKind) {
		if tok.kind != kind {
			closeToken(end)
			tok.kind = kind
		}
	}

	for index := range runes {
		run := runes[index].run

		switch {
		case run == '\r':
			// we don't know the token type yet. if the next rune is a
			// \n then this is a CRLF line break. if not, the token keeps
			// the zero kind.
			closeToken(index)

		case run == '\n':
			// a \n directly after a \r continues the token the \r
			// started, making a single CRLF line break.
			if lastRune != '\r' {
				closeToken(index)
			}
			tok.kind = tokenKindLineBreak

		case run == '\t':
			ensureKind(index, tokenKindTab)

		case unicode.IsSpace(run):
			ensureKind(index, tokenKindSpace)

		default:
			ensureKind(index, tokenKindWord)
		}
		lastRune = run
	}

	// Flush whatever is still pending; this is a no-op for empty input.
	closeToken(len(runes))
	return runes, tokens
}