typeset/parse.go

package typeset

import "unicode"

// TODO: consider implementing the Unicode Line Breaking Algorithm (UAX #14)
// instead of the ad-hoc rules below: https://unicode.org/reports/tr14/

// parseString converts text into a slice holding one runeLayout per rune, and
// a slice of tokens that partition those runes into contiguous groups. Every
// token's runes field is a sub-slice of the returned rune slice, so the two
// return values share memory.
//
// Runes are grouped as follows:
//   - "\n", or "\r" immediately followed by "\n", becomes a
//     tokenKindLineBreak token. Consecutive line breaks are separate tokens.
//   - Runs of "\t" become a tokenKindTab token.
//   - Runs of any other rune for which unicode.IsSpace is true become a
//     tokenKindSpace token.
//   - Runs of everything else become a tokenKindWord token.
//   - A "\r" that is not followed by "\n" starts a token whose kind is left at
//     the zero value of tokenKind (NOTE(review): the intent is that this reads
//     as a word, which assumes tokenKindWord is the zero value — confirm).
//
// An empty string yields two empty slices.
func parseString (text string) ([]runeLayout, []token) {
	// Build the entire rune slice before cutting any tokens out of it.
	// Tokens hold sub-slices of runes, so if runes were still being
	// appended to (and possibly reallocated) while tokens are created,
	// earlier tokens would end up referencing an old backing array.
	//
	// len([]rune(text)) gives the exact rune count, which unlike len(text)
	// is not inflated by multi-byte characters.
	runes := make([]runeLayout, 0, len([]rune(text)))
	for _, run := range text {
		runes = append(runes, runeLayout {
			run: run,
		})
	}

	// TODO find an optimal size for this to minimize allocs. will require
	// some testing.
	tokens := make([]token, 0, len(runes) / 4)

	// index and startingIndex are positions within runes (rune indices, not
	// byte offsets into text).
	var index         int
	var startingIndex int
	var lastRune      rune

	var tok token
	// tokenBoundary finishes the token in progress, which covers
	// runes[startingIndex:index], and starts a fresh one. If the token in
	// progress is empty, nothing is emitted.
	tokenBoundary := func () {
		if startingIndex != index {
			tok.runes = runes[startingIndex:index]
			startingIndex = index
			tokens = append(tokens, tok)
		}
		tok = token { }
	}
	// mustBeInToken ensures the token in progress is of the given kind,
	// starting a new token if it is not.
	mustBeInToken := func (kind tokenKind) {
		if tok.kind != kind {
			tokenBoundary()
			tok.kind = kind
		}
	}

	for index = range runes {
		run := runes[index].run

		switch {
		case run == '\r':
			tokenBoundary()
			// we don't know the token type yet. if next rune is a
			// \n then this is a CRLF line break. if not, this is
			// just a word.

		case run == '\n':
			if lastRune == '\r' {
				// continue the \r to make a CRLF line break
				tok.kind = tokenKindLineBreak
			} else {
				tokenBoundary()
				tok.kind = tokenKindLineBreak
			}

		case run == '\t':
			mustBeInToken(tokenKindTab)

		case unicode.IsSpace(run):
			mustBeInToken(tokenKindSpace)

		default:
			mustBeInToken(tokenKindWord)
		}
		lastRune = run
	}

	// Flush the final token. Setting index explicitly (rather than
	// incrementing it) keeps this correct for empty input, where the loop
	// never runs and there is nothing to flush.
	index = len(runes)
	tokenBoundary()
	return runes, tokens
}
Add parsing stage 2024-09-10 09:19:57 -06:00			`package typeset`

			`import "unicode"`

			`// TODO perhaps follow https://unicode.org/reports/tr14/`

			`func parseString (text string) ([]runeLayout, []token) {`
			`// TODO find an optimal size for both of these to minimize allocs. will`
			`// require some testing.`
			`runes := make([]runeLayout, 0, len(text) * 2 / 3)`
			`tokens := make([]token, 0, len(text) / 4)`

			`var index int`
			`var startingIndex int`
			`var run rune`
			`var lastRune rune`

			`var tok token`
			`tokenBoundary := func () {`
			`if startingIndex != index {`
			`tok.runes = runes[startingIndex:index]`
			`startingIndex = index`
			`tokens = append(tokens, tok)`
			`}`
			`tok = token { }`
			`}`
			`mustBeInToken := func (kind tokenKind) {`
			`if tok.kind != kind {`
			`tokenBoundary()`
			`tok.kind = kind`
			`}`
			`}`

			`for index, run = range text {`
			`runes = append(runes, runeLayout {`
			`run: run,`
			`})`

			`switch {`
			`case run == '\r':`
			`tokenBoundary()`
			`// we don't know the token type yet. if next rune is a`
			`// \n then this is a CRLF line break. if not, this is`
			`// just a word.`

			`case run == '\n':`
			`if lastRune == '\r' {`
			`// continue the \r to make a CRLF line break`
			`tok.kind = tokenKindLineBreak`
			`} else {`
			`tokenBoundary()`
			`tok.kind = tokenKindLineBreak`
			`}`

			`case run == '\t':`
			`mustBeInToken(tokenKindTab)`

			`case unicode.IsSpace(run):`
			`mustBeInToken(tokenKindSpace)`

			`default:`
			`mustBeInToken(tokenKindWord)`
			`}`
			`lastRune = run`
			`}`
			`index ++ // make index equal to len([]rune(text))`

			`tokenBoundary()`
			`return runes, tokens`
			`}`