package typeset

import (
	"unicode"
	"unicode/utf8"
)

// TODO perhaps follow the Unicode Line Breaking Algorithm (UAX #14):
// https://unicode.org/reports/tr14/

// parseString splits text into its runes and groups those runes into tokens.
//
// It returns one runeLayout per rune of text, in order, followed by the list
// of tokens. The runes field of every token is a sub-slice of the returned
// rune slice, so both share the same storage. Runs of tabs, runs of other
// whitespace, and runs of everything else each form a single token. Every
// "\n" or "\r\n" forms its own line break token. A "\r" that is not followed
// by "\n" is not a line break. An empty text yields no runes and no tokens.
func parseString (text string) ([]runeLayout, []token) {
	// tokens hold sub-slices of runes, so the backing array of runes must
	// never be reallocated while parsing. allocate the exact rune count up
	// front to guarantee that.
	runes := make([]runeLayout, 0, utf8.RuneCountInString(text))
	// TODO find an optimal size for this to minimize allocs. will require
	// some testing.
	tokens := make([]token, 0, len(text) / 4)

	var index         int  // position of the current rune within runes
	var startingIndex int  // position in runes where the current token starts
	var lastRune      rune // rune handled by the previous iteration

	var tok token
	// tokenBoundary ends the current token just before runes[index],
	// emitting it unless it is empty, and begins a fresh token.
	tokenBoundary := func () {
		if startingIndex != index {
			tok.runes = runes[startingIndex:index]
			startingIndex = index
			tokens = append(tokens, tok)
		}
		tok = token { }
	}
	// mustBeInToken makes sure the current token is of the given kind,
	// starting a new token if it is not.
	mustBeInToken := func (kind tokenKind) {
		if tok.kind != kind {
			tokenBoundary()
			tok.kind = kind
		}
	}

	// ranging over a string yields byte offsets, which do not line up with
	// positions in runes once a multi-byte character has been seen. because
	// of this, index is counted by hand instead of taken from the range.
	for _, run := range text {
		runes = append(runes, runeLayout {
			run: run,
		})

		switch {
		case run == '\r':
			tokenBoundary()
			// we don't know the token type yet. if next rune is a
			// \n then this is a CRLF line break. if not, the token
			// keeps the zero tokenKind (presumably a word, confirm
			// against the tokenKind declaration).

		case run == '\n':
			if lastRune != '\r' {
				tokenBoundary()
			}
			// either a fresh token, or the continuation of a \r
			// that turns it into a CRLF line break
			tok.kind = tokenKindLineBreak

		case run == '\t':
			mustBeInToken(tokenKindTab)

		case unicode.IsSpace(run):
			mustBeInToken(tokenKindSpace)

		default:
			mustBeInToken(tokenKindWord)
		}
		lastRune = run
		index ++
	}

	// index now equals len(runes), so this flushes the trailing token, if
	// there is one.
	tokenBoundary()
	return runes, tokens
}