Add parsing stage

This commit is contained in:
Sasha Koshka 2024-09-10 11:19:57 -04:00
parent 0c9d50ebcd
commit 569defdb36
2 changed files with 196 additions and 0 deletions

70
parse.go Normal file
View File

@ -0,0 +1,70 @@
package typeset
import "unicode"
// parseString splits text into runes and groups those runes into tokens.
//
// It returns one runeLayout per rune of text (only the run field is set here)
// and the tokens covering those runes in order. Every token's runes field is a
// sub-slice of the returned runes slice, so both views share the same memory.
//
// Grouping rules:
//   - consecutive tabs form one tokenKindTab token
//   - consecutive other whitespace (unicode.IsSpace) forms one tokenKindSpace
//     token
//   - "\n" and "\r\n" each form one tokenKindLineBreak token
//   - any other consecutive runes form one tokenKindWord token
//
// A '\r' always starts a new token whose kind is left at the zero tokenKind
// until the next rune decides what it is. NOTE(review): a '\r' followed by a
// word rune only joins that word if tokenKindWord is the zero tokenKind —
// confirm against the tokenKind declaration.
//
// An empty text yields empty runes and tokens.
//
// TODO perhaps follow https://unicode.org/reports/tr14/
func parseString(text string) ([]runeLayout, []token) {
	// The byte length is an upper bound on the rune count, so appending to
	// runes never reallocates and the sub-slices stored in tokens keep
	// pointing into the slice that is returned.
	runes := make([]runeLayout, 0, len(text))
	// TODO find an optimal size for this to minimize allocs. will require
	// some testing.
	tokens := make([]token, 0, len(text)/4)

	var index int         // rune index (not byte offset) of the current rune
	var startingIndex int // rune index where the pending token begins
	var lastRune rune
	var tok token

	// tokenBoundary emits the pending token, if it has any runes, and starts
	// a fresh one at the current rune index.
	tokenBoundary := func() {
		if startingIndex != index {
			tok.runes = runes[startingIndex:index]
			startingIndex = index
			tokens = append(tokens, tok)
		}
		tok = token{}
	}
	// mustBeInToken makes sure the pending token is of the given kind,
	// ending the previous token first if it is not.
	mustBeInToken := func(kind tokenKind) {
		if tok.kind != kind {
			tokenBoundary()
			tok.kind = kind
		}
	}

	// range over a string reports byte offsets, which do not line up with
	// positions in runes once a multi-byte rune has been seen, so the rune
	// index is counted by hand.
	for _, run := range text {
		runes = append(runes, runeLayout{run: run})
		switch {
		case run == '\r':
			tokenBoundary()
			// we don't know the token type yet. if next rune is a
			// \n then this is a CRLF line break. if not, this is
			// just a word.
		case run == '\n':
			if lastRune != '\r' {
				tokenBoundary()
			}
			// after a \r this continues the pending token, turning
			// it into a CRLF line break
			tok.kind = tokenKindLineBreak
		case run == '\t':
			mustBeInToken(tokenKindTab)
		case unicode.IsSpace(run):
			mustBeInToken(tokenKindSpace)
		default:
			mustBeInToken(tokenKindWord)
		}
		lastRune = run
		index++
	}

	// index now equals len(runes); flush whatever token is still pending.
	tokenBoundary()
	return runes, tokens
}

126
parse_test.go Normal file
View File

@ -0,0 +1,126 @@
package typeset
import "slices"
import "testing"
// rl is test shorthand for a runeLayout that carries run and leaves every
// other field at its zero value.
func rl(run rune) runeLayout {
	var layout runeLayout
	layout.run = run
	return layout
}
// tk is test shorthand for a token of the given kind whose runes are the
// runes of value, each wrapped with rl. All other token fields stay at their
// zero value.
func tk(kind tokenKind, value string) token {
	codepoints := []rune(value)
	layouts := make([]runeLayout, 0, len(codepoints))
	for _, codepoint := range codepoints {
		layouts = append(layouts, rl(codepoint))
	}
	return token{
		kind:  kind,
		runes: layouts,
	}
}
// compareTokens reports whether got and correct hold the same tokens in the
// same order. Two tokens match when their kind, width and runes are all equal.
// Slices of different lengths never match.
func compareTokens(got, correct []token) bool {
	// EqualFunc checks the lengths first, so a got that is longer than
	// correct cannot index out of range and a got that is merely a prefix of
	// correct is not reported as equal.
	return slices.EqualFunc(got, correct, func(gotTok, correctTok token) bool {
		return correctTok.kind == gotTok.kind &&
			correctTok.width == gotTok.width &&
			slices.Equal(correctTok.runes, gotTok.runes)
	})
}
// logTokens writes one log line per token to test: the token value
// left-aligned in a 40 column field, a separator, then the token's runes.
func logTokens(test *testing.T, tokens []token) {
	for index := range tokens {
		// named tok so the loop variable does not hide the token type
		tok := tokens[index]
		test.Logf("- %-40v | %v", tok, tok.runes)
	}
}
// TestParseString runs parseString over a string that mixes words, a space,
// tabs, a stray carriage return, and both LF and CRLF line breaks, then checks
// the returned runes and tokens against the expected values.
func TestParseString(test *testing.T) {
	// ---- processing ----
	const input = "hello \rworld!\nfoo\n\r\nbar\tbaz\t\tsomething"
	gotRunes, gotTokens := parseString(input)

	// ---- correct data ----
	// one bare runeLayout for every rune of the input, in order
	wantRunes := make([]runeLayout, 0, len(input))
	for _, run := range input {
		wantRunes = append(wantRunes, rl(run))
	}
	wantTokens := []token{
		tk(tokenKindWord, "hello"),
		tk(tokenKindSpace, " "),
		tk(tokenKindWord, "\rworld!"),
		tk(tokenKindLineBreak, "\n"),
		tk(tokenKindWord, "foo"),
		tk(tokenKindLineBreak, "\n"),
		tk(tokenKindLineBreak, "\r\n"),
		tk(tokenKindWord, "bar"),
		tk(tokenKindTab, "\t"),
		tk(tokenKindWord, "baz"),
		tk(tokenKindTab, "\t\t"),
		tk(tokenKindWord, "something"),
	}

	// ---- testing ----
	// dumpTokens logs both token lists so a failure can be read side by side.
	dumpTokens := func() {
		test.Log("GOT")
		logTokens(test, gotTokens)
		test.Log("CORRECT")
		logTokens(test, wantTokens)
	}

	// lengths are checked before contents so the failure message says which
	// of the two went wrong
	if len(gotRunes) != len(wantRunes) {
		test.Logf("len(runes) != len(correctRunes): %d, %d", len(gotRunes), len(wantRunes))
		test.Log(gotRunes)
		test.Log(wantRunes)
		test.FailNow()
	}
	if !slices.Equal(gotRunes, wantRunes) {
		test.Log("runes != correctRunes:")
		test.Log(gotRunes)
		test.Log(wantRunes)
		test.FailNow()
	}

	if len(gotTokens) != len(wantTokens) {
		test.Logf("len(tokens) != len(correctTokens): %d, %d", len(gotTokens), len(wantTokens))
		dumpTokens()
		test.FailNow()
	}
	if !compareTokens(gotTokens, wantTokens) {
		test.Log("tokens != correctTokens:")
		dumpTokens()
		test.FailNow()
	}

	// TODO: ensure runeLayout slices in the tokens reference the same
	// memory as the complete runes slice
}