typeset/parse.go

package typeset

import "unicode"

// TODO: perhaps follow the Unicode Line Breaking Algorithm (UAX #14):
// https://unicode.org/reports/tr14/

// parseString converts text into a slice of runeLayouts (one per rune, with
// only the run field populated) and groups those runes into tokens. Each
// token's runes field is a sub-slice of the returned rune slice, so the two
// return values share memory.
//
// Runes are grouped as follows:
//   - '\n', or '\r' immediately followed by '\n', forms a tokenKindLineBreak
//     token. Every line break is its own token.
//   - Consecutive '\t' runes form a tokenKindTab token.
//   - Consecutive runes for which unicode.IsSpace reports true (other than
//     the ones above) form a tokenKindSpace token.
//   - All other consecutive runes form a tokenKindWord token.
//
// A '\r' always starts a new token. If it is not followed by '\n', that
// token's kind is left at the zero value of tokenKind (presumably
// tokenKindWord; confirm against the tokenKind declaration).
//
// Empty text yields an empty rune slice and no tokens.
func parseString(text string) ([]runeLayout, []token) {
	// Initial capacity guess only; append grows the slice if it is too small.
	runes := make([]runeLayout, 0, len(text)*2/3)

	// Build the complete rune slice before creating any tokens. Tokens hold
	// sub-slices of runes, so if runes were still being appended to while
	// tokens were created, a reallocation would leave the earlier tokens
	// referencing the old backing array.
	for _, run := range text {
		runes = append(runes, runeLayout{
			run: run,
		})
	}

	// Initial capacity guess for the token slice.
	tokens := make([]token, 0, len(runes)/5)

	var (
		index         int        // index of the rune currently being examined
		startingIndex int        // index of the first rune of the pending token
		runl          runeLayout // rune currently being examined
		lastRune      rune       // rune examined in the previous iteration
		tok           token      // token currently being accumulated
	)

	// tokenBoundary finishes the pending token, if it contains any runes, by
	// giving it runes[startingIndex:index] and appending it to tokens. It then
	// resets tok so the next token starts with a zero-valued kind.
	tokenBoundary := func() {
		if startingIndex != index {
			tok.runes = runes[startingIndex:index]
			startingIndex = index
			tokens = append(tokens, tok)
		}
		tok = token{}
	}

	// mustBeInToken makes sure the pending token has the given kind, starting
	// a new token of that kind if it does not.
	mustBeInToken := func(kind tokenKind) {
		if tok.kind != kind {
			tokenBoundary()
			tok.kind = kind
		}
	}

	// parse tokens
	for index, runl = range runes {
		switch {
		case runl.run == '\r':
			tokenBoundary()
			// We don't know the token kind yet. If the next rune is
			// a \n then this is a CRLF line break; if not, the kind
			// stays at its zero value.

		case runl.run == '\n':
			if lastRune != '\r' {
				// A bare \n starts a fresh line break token.
				tokenBoundary()
			}
			// Otherwise the preceding \r already started the token,
			// and this \n completes the CRLF line break.
			tok.kind = tokenKindLineBreak

		case runl.run == '\t':
			mustBeInToken(tokenKindTab)

		case unicode.IsSpace(runl.run):
			mustBeInToken(tokenKindSpace)

		default:
			mustBeInToken(tokenKindWord)
		}
		lastRune = runl.run
	}

	// Move index one past the last rune so the final tokenBoundary flushes the
	// trailing token. Using len(runes) rather than incrementing keeps this
	// correct for empty input, where the loop never runs and index is still 0;
	// incrementing would make the flush slice runes[0:1] out of range.
	index = len(runes)

	tokenBoundary()
	return runes, tokens
}