package typeset

import "unicode"

// TODO perhaps follow https://unicode.org/reports/tr14/

// parseString converts text into a slice of rune layouts (one per rune, in
// order) and a slice of tokens covering those runes.
//
// Every token's runes field is a sub-slice of the returned rune slice, so the
// two return values share memory. Runes are grouped as follows:
//
//   - "\n" and "\r\n" each form one tokenKindLineBreak token.
//   - Consecutive tabs form one tokenKindTab token.
//   - Consecutive runes for which unicode.IsSpace reports true (other than
//     the ones handled above) form one tokenKindSpace token.
//   - Consecutive runes of any other sort form one tokenKindWord token.
//
// A '\r' always starts a new token. If it is not followed by '\n', the token
// keeps the zero-value kind until a later rune assigns one (presumably the
// zero value is tokenKindWord — confirm against the tokenKind declaration).
//
// An empty text yields an empty rune slice and an empty token slice.
func parseString(text string) ([]runeLayout, []token) {
	// Initial capacity is only an estimate; append grows the slice as needed.
	runes := make([]runeLayout, 0, len(text)*2/3)

	// Build the complete rune slice before creating any token. Tokens hold
	// sub-slices of runes, so if runes were still growing while tokens were
	// being made, a reallocation would leave earlier tokens referencing the
	// old backing array.
	for _, run := range text {
		runes = append(runes, runeLayout{run: run})
	}

	tokens := make([]token, 0, len(runes)/3)

	var (
		index         int   // position of the rune currently being examined
		startingIndex int   // position at which the pending token begins
		lastRune      rune  // rune examined on the previous iteration
		tok           token // pending token, not yet appended to tokens
	)

	// tokenBoundary ends the pending token just before index, appending it
	// to tokens if it covers at least one rune, and starts a fresh one.
	tokenBoundary := func() {
		if startingIndex != index {
			tok.runes = runes[startingIndex:index]
			startingIndex = index
			tokens = append(tokens, tok)
		}
		tok = token{}
	}

	// mustBeInToken makes sure the pending token has the given kind,
	// ending the previous token first if its kind differs.
	mustBeInToken := func(kind tokenKind) {
		if tok.kind != kind {
			tokenBoundary()
			tok.kind = kind
		}
	}

	for index = range runes {
		run := runes[index].run
		switch {
		case run == '\r':
			// The kind is not known yet: if the next rune is '\n' this
			// becomes a CRLF line break, otherwise the kind is left for
			// the following rune to decide.
			tokenBoundary()

		case run == '\n':
			// A '\n' directly after '\r' continues that token, turning
			// it into a CRLF line break. Any other '\n' starts its own
			// line break token.
			if lastRune != '\r' {
				tokenBoundary()
			}
			tok.kind = tokenKindLineBreak

		case run == '\t':
			mustBeInToken(tokenKindTab)

		case unicode.IsSpace(run):
			mustBeInToken(tokenKindSpace)

		default:
			mustBeInToken(tokenKindWord)
		}
		lastRune = run
	}

	// Move index one past the final rune so the last pending token is
	// flushed in full. This is set explicitly rather than incremented:
	// for empty input the loop never assigns index, and incrementing it
	// would make tokenBoundary slice past the end of runes and panic.
	index = len(runes)
	tokenBoundary()

	return runes, tokens
}