diff --git a/parse.go b/parse.go
new file mode 100644
index 0000000..e7d816e
--- /dev/null
+++ b/parse.go
@@ -0,0 +1,85 @@
+package typeset
+
+import "unicode"
+import "unicode/utf8"
+
+// TODO perhaps follow https://unicode.org/reports/tr14/
+
+// parseString decodes text into one runeLayout per rune and groups those runes
+// into tokens: words, runs of spaces, runs of tabs, and line breaks (either
+// "\n" or "\r\n"). The tokens cover the runes contiguously and in order, and
+// each token's runes field is a sub-slice of the returned rune slice rather
+// than a copy. An empty string yields no runes and no tokens.
+func parseString (text string) ([]runeLayout, []token) {
+	// reserve room for every rune up front. append must never reallocate,
+	// because the tokens hold sub-slices of runes that have to keep
+	// pointing into the slice we return.
+	runes := make([]runeLayout, 0, utf8.RuneCountInString(text))
+	// TODO find an optimal size for this to minimize allocs. will require
+	// some testing.
+	tokens := make([]token, 0, len(text) / 4)
+
+	// index is the position, within runes, of the rune being examined. it
+	// is counted in runes, not bytes, so it must not be taken from the
+	// byte offset that ranging over a string provides.
+	var index int
+	var startingIndex int
+	var lastRune rune
+
+	var tok token
+	// tokenBoundary finishes the current token, giving it the runes in
+	// [startingIndex, index), and starts a fresh one. an empty token is
+	// discarded instead of being emitted.
+	tokenBoundary := func () {
+		if startingIndex != index {
+			tok.runes = runes[startingIndex:index]
+			startingIndex = index
+			tokens = append(tokens, tok)
+		}
+		tok = token { }
+	}
+	// mustBeInToken makes sure the current token is of the given kind,
+	// ending the previous token first if it is of a different one.
+	mustBeInToken := func (kind tokenKind) {
+		if tok.kind != kind {
+			tokenBoundary()
+			tok.kind = kind
+		}
+	}
+
+	for _, run := range text {
+		index = len(runes)
+		runes = append(runes, runeLayout {
+			run: run,
+		})
+
+		switch {
+		case run == '\r':
+			tokenBoundary()
+			// we don't know the token type yet. if next rune is a
+			// \n then this is a CRLF line break. if not, this is
+			// just a word.
+
+		case run == '\n':
+			if lastRune != '\r' {
+				tokenBoundary()
+			}
+			// a preceding \r stays in this token, making a CRLF line break
+			tok.kind = tokenKindLineBreak
+
+		case run == '\t':
+			mustBeInToken(tokenKindTab)
+
+		case unicode.IsSpace(run):
+			mustBeInToken(tokenKindSpace)
+
+		default:
+			mustBeInToken(tokenKindWord)
+		}
+		lastRune = run
+	}
+	index = len(runes) // one past the last rune, to close the final token
+
+	tokenBoundary()
+	return runes, tokens
+}
diff --git a/parse_test.go b/parse_test.go
new file mode 100644
index 0000000..5c9ce4c
--- /dev/null
+++ b/parse_test.go
@@ -0,0 +1,204 @@
+package typeset
+
+import "slices"
+import "testing"
+
+// rl returns a runeLayout holding run, with any other field left at its zero
+// value.
+func rl (run rune) runeLayout { return runeLayout { run: run } }
+
+// tk builds a token of the given kind whose runes are the runes of value, in
+// order.
+func tk (kind tokenKind, value string) token {
+	tok := token {
+		kind: kind,
+	}
+	runeValue := []rune(value)
+	tok.runes = make([]runeLayout, len(runeValue))
+	for index, run := range runeValue {
+		tok.runes[index] = rl(run)
+	}
+	return tok
+}
+
+// compareTokens reports whether got and correct have the same length and hold
+// tokens with equal kind, width, and runes at every position.
+func compareTokens (got, correct []token) bool {
+	return slices.EqualFunc(got, correct, func (tok, correctTok token) bool {
+		return correctTok.kind == tok.kind &&
+			correctTok.width == tok.width &&
+			slices.Equal(correctTok.runes, tok.runes)
+	})
+}
+
+// logTokens writes each token, followed by its runes, to the test log.
+func logTokens (test *testing.T, tokens []token) {
+	for _, tok := range tokens {
+		test.Logf("- %-40v | %v", tok, tok.runes)
+	}
+}
+
+// checkTokensAliasRunes fails the test unless the tokens' rune slices, taken in
+// order, are consecutive sub-slices of runes that share its memory.
+func checkTokensAliasRunes (test *testing.T, runes []runeLayout, tokens []token) {
+	test.Helper()
+	offset := 0
+	for index, tok := range tokens {
+		if len(tok.runes) == 0 {
+			test.Fatalf("token %d has no runes", index)
+		}
+		if offset + len(tok.runes) > len(runes) {
+			test.Fatalf("token %d runs past the end of runes", index)
+		}
+		if &tok.runes[0] != &runes[offset] {
+			test.Fatalf("token %d does not reference runes[%d:]", index, offset)
+		}
+		offset += len(tok.runes)
+	}
+	if offset != len(runes) {
+		test.Fatalf("tokens cover %d runes, want %d", offset, len(runes))
+	}
+}
+
+// TestParseString checks tokenization of ASCII text containing words, spaces,
+// tabs, a lone \r, and both \n and \r\n line breaks.
+func TestParseString (test *testing.T) {
+	// ---- processing ----
+	runes, tokens := parseString("hello \rworld!\nfoo\n\r\nbar\tbaz\t\tsomething")
+
+	// ---- correct data ----
+	correctRunes := []runeLayout {
+		rl('h'),
+		rl('e'),
+		rl('l'),
+		rl('l'),
+		rl('o'),
+		rl(' '),
+		rl('\r'),
+		rl('w'),
+		rl('o'),
+		rl('r'),
+		rl('l'),
+		rl('d'),
+		rl('!'),
+		rl('\n'),
+		rl('f'),
+		rl('o'),
+		rl('o'),
+		rl('\n'),
+		rl('\r'),
+		rl('\n'),
+		rl('b'),
+		rl('a'),
+		rl('r'),
+		rl('\t'),
+		rl('b'),
+		rl('a'),
+		rl('z'),
+		rl('\t'),
+		rl('\t'),
+		rl('s'),
+		rl('o'),
+		rl('m'),
+		rl('e'),
+		rl('t'),
+		rl('h'),
+		rl('i'),
+		rl('n'),
+		rl('g'),
+	}
+	correctTokens := []token {
+		tk(tokenKindWord, "hello"),
+		tk(tokenKindSpace, " "),
+		tk(tokenKindWord, "\rworld!"),
+		tk(tokenKindLineBreak, "\n"),
+		tk(tokenKindWord, "foo"),
+		tk(tokenKindLineBreak, "\n"),
+		tk(tokenKindLineBreak, "\r\n"),
+		tk(tokenKindWord, "bar"),
+		tk(tokenKindTab, "\t"),
+		tk(tokenKindWord, "baz"),
+		tk(tokenKindTab, "\t\t"),
+		tk(tokenKindWord, "something"),
+	}
+
+	// ---- testing ----
+	if len(runes) != len(correctRunes) {
+		test.Logf("len(runes) != len(correctRunes): %d, %d", len(runes), len(correctRunes))
+		test.Log(runes)
+		test.Log(correctRunes)
+		test.FailNow()
+	}
+	if !slices.Equal(runes, correctRunes) {
+		test.Log("runes != correctRunes:")
+		test.Log(runes)
+		test.Log(correctRunes)
+		test.FailNow()
+	}
+	if len(tokens) != len(correctTokens) {
+		test.Logf("len(tokens) != len(correctTokens): %d, %d", len(tokens), len(correctTokens))
+		test.Log("GOT")
+		logTokens(test, tokens)
+		test.Log("CORRECT")
+		logTokens(test, correctTokens)
+		test.FailNow()
+	}
+	if !compareTokens(tokens, correctTokens) {
+		test.Log("tokens != correctTokens:")
+		test.Log("GOT")
+		logTokens(test, tokens)
+		test.Log("CORRECT")
+		logTokens(test, correctTokens)
+		test.FailNow()
+	}
+	// the runeLayout slices in the tokens must reference the same memory
+	// as the complete runes slice
+	checkTokensAliasRunes(test, runes, tokens)
+}
+
+// TestParseStringEmpty checks that an empty string yields no runes and no
+// tokens.
+func TestParseStringEmpty (test *testing.T) {
+	runes, tokens := parseString("")
+	if len(runes) != 0 {
+		test.Fatalf("len(runes) = %d, want 0", len(runes))
+	}
+	if len(tokens) != 0 {
+		test.Fatalf("len(tokens) = %d, want 0", len(tokens))
+	}
+}
+
+// TestParseStringMultiByte checks that token boundaries are counted in runes,
+// not bytes, when the text contains multi-byte UTF-8 sequences.
+func TestParseStringMultiByte (test *testing.T) {
+	text := "héllo wörld\n日本語"
+	runes, tokens := parseString(text)
+
+	correctRunes := make([]runeLayout, 0, len(text))
+	for _, run := range text {
+		correctRunes = append(correctRunes, rl(run))
+	}
+	correctTokens := []token {
+		tk(tokenKindWord, "héllo"),
+		tk(tokenKindSpace, " "),
+		tk(tokenKindWord, "wörld"),
+		tk(tokenKindLineBreak, "\n"),
+		tk(tokenKindWord, "日本語"),
+	}
+
+	if !slices.Equal(runes, correctRunes) {
+		test.Log("runes != correctRunes:")
+		test.Log(runes)
+		test.Log(correctRunes)
+		test.FailNow()
+	}
+	if !compareTokens(tokens, correctTokens) {
+		test.Log("tokens != correctTokens:")
+		test.Log("GOT")
+		logTokens(test, tokens)
+		test.Log("CORRECT")
+		logTokens(test, correctTokens)
+		test.FailNow()
+	}
+	checkTokensAliasRunes(test, runes, tokens)
+}