Add parsing stage
This commit is contained in:
parent
0c9d50ebcd
commit
569defdb36
70
parse.go
Normal file
70
parse.go
Normal file
@ -0,0 +1,70 @@
|
||||
package typeset
|
||||
|
||||
import "unicode"
|
||||
|
||||
// TODO perhaps follow https://unicode.org/reports/tr14/
|
||||
|
||||
func parseString (text string) ([]runeLayout, []token) {
|
||||
// TODO find an optimal size for both of these to minimize allocs. will
|
||||
// require some testing.
|
||||
runes := make([]runeLayout, 0, len(text) * 2 / 3)
|
||||
tokens := make([]token, 0, len(text) / 4)
|
||||
|
||||
var index int
|
||||
var startingIndex int
|
||||
var run rune
|
||||
var lastRune rune
|
||||
|
||||
var tok token
|
||||
tokenBoundary := func () {
|
||||
if startingIndex != index {
|
||||
tok.runes = runes[startingIndex:index]
|
||||
startingIndex = index
|
||||
tokens = append(tokens, tok)
|
||||
}
|
||||
tok = token { }
|
||||
}
|
||||
mustBeInToken := func (kind tokenKind) {
|
||||
if tok.kind != kind {
|
||||
tokenBoundary()
|
||||
tok.kind = kind
|
||||
}
|
||||
}
|
||||
|
||||
for index, run = range text {
|
||||
runes = append(runes, runeLayout {
|
||||
run: run,
|
||||
})
|
||||
|
||||
switch {
|
||||
case run == '\r':
|
||||
tokenBoundary()
|
||||
// we don't know the token type yet. if next rune is a
|
||||
// \n then this is a CRLF line break. if not, this is
|
||||
// just a word.
|
||||
|
||||
case run == '\n':
|
||||
if lastRune == '\r' {
|
||||
// continue the \r to make a CRLF line break
|
||||
tok.kind = tokenKindLineBreak
|
||||
} else {
|
||||
tokenBoundary()
|
||||
tok.kind = tokenKindLineBreak
|
||||
}
|
||||
|
||||
case run == '\t':
|
||||
mustBeInToken(tokenKindTab)
|
||||
|
||||
case unicode.IsSpace(run):
|
||||
mustBeInToken(tokenKindSpace)
|
||||
|
||||
default:
|
||||
mustBeInToken(tokenKindWord)
|
||||
}
|
||||
lastRune = run
|
||||
}
|
||||
index ++ // make index equal to len([]rune(text))
|
||||
|
||||
tokenBoundary()
|
||||
return runes, tokens
|
||||
}
|
126
parse_test.go
Normal file
126
parse_test.go
Normal file
@ -0,0 +1,126 @@
|
||||
package typeset
|
||||
|
||||
import "slices"
|
||||
import "testing"
|
||||
|
||||
// rl is a test shorthand that builds a runeLayout holding the given rune, with
// every other field left at its zero value.
func rl (run rune) runeLayout {
	var layout runeLayout
	layout.run = run
	return layout
}
|
||||
func tk (kind tokenKind, value string) token {
|
||||
tok := token {
|
||||
kind: kind,
|
||||
}
|
||||
runeValue := []rune(value)
|
||||
tok.runes = make([]runeLayout, len(runeValue))
|
||||
for index, run := range runeValue {
|
||||
tok.runes[index] = rl(run)
|
||||
}
|
||||
return tok
|
||||
}
|
||||
func compareTokens (got, correct []token) bool {
|
||||
for index, tok := range got {
|
||||
correctTok := correct[index]
|
||||
isCorrect :=
|
||||
correctTok.kind == tok.kind &&
|
||||
correctTok.width == tok.width &&
|
||||
slices.Equal(correctTok.runes, tok.runes)
|
||||
if !isCorrect { return false }
|
||||
}
|
||||
return true
|
||||
}
|
||||
func logTokens (test *testing.T, tokens []token) {
|
||||
for _, token := range tokens {
|
||||
test.Logf("- %-40v | %v", token, token.runes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseString (test *testing.T) {
|
||||
// ---- processing ----
|
||||
runes, tokens := parseString("hello \rworld!\nfoo\n\r\nbar\tbaz\t\tsomething")
|
||||
|
||||
// ---- correct data ----
|
||||
correctRunes := []runeLayout {
|
||||
rl('h'),
|
||||
rl('e'),
|
||||
rl('l'),
|
||||
rl('l'),
|
||||
rl('o'),
|
||||
rl(' '),
|
||||
rl('\r'),
|
||||
rl('w'),
|
||||
rl('o'),
|
||||
rl('r'),
|
||||
rl('l'),
|
||||
rl('d'),
|
||||
rl('!'),
|
||||
rl('\n'),
|
||||
rl('f'),
|
||||
rl('o'),
|
||||
rl('o'),
|
||||
rl('\n'),
|
||||
rl('\r'),
|
||||
rl('\n'),
|
||||
rl('b'),
|
||||
rl('a'),
|
||||
rl('r'),
|
||||
rl('\t'),
|
||||
rl('b'),
|
||||
rl('a'),
|
||||
rl('z'),
|
||||
rl('\t'),
|
||||
rl('\t'),
|
||||
rl('s'),
|
||||
rl('o'),
|
||||
rl('m'),
|
||||
rl('e'),
|
||||
rl('t'),
|
||||
rl('h'),
|
||||
rl('i'),
|
||||
rl('n'),
|
||||
rl('g'),
|
||||
}
|
||||
correctTokens := []token {
|
||||
tk(tokenKindWord, "hello"),
|
||||
tk(tokenKindSpace, " "),
|
||||
tk(tokenKindWord, "\rworld!"),
|
||||
tk(tokenKindLineBreak, "\n"),
|
||||
tk(tokenKindWord, "foo"),
|
||||
tk(tokenKindLineBreak, "\n"),
|
||||
tk(tokenKindLineBreak, "\r\n"),
|
||||
tk(tokenKindWord, "bar"),
|
||||
tk(tokenKindTab, "\t"),
|
||||
tk(tokenKindWord, "baz"),
|
||||
tk(tokenKindTab, "\t\t"),
|
||||
tk(tokenKindWord, "something"),
|
||||
}
|
||||
|
||||
// ---- testing ----
|
||||
if len(runes) != len(correctRunes) {
|
||||
test.Logf("len(runes) != len(correctRunes): %d, %d", len(runes), len(correctRunes))
|
||||
test.Log(runes)
|
||||
test.Log(correctRunes)
|
||||
test.FailNow()
|
||||
}
|
||||
if !slices.Equal(runes, correctRunes) {
|
||||
test.Log("runes != correctRunes:")
|
||||
test.Log(runes)
|
||||
test.Log(correctRunes)
|
||||
test.FailNow()
|
||||
}
|
||||
if len(tokens) != len(correctTokens) {
|
||||
test.Logf("len(tokens) != len(correctTokens): %d, %d", len(tokens), len(correctTokens))
|
||||
test.Log("GOT")
|
||||
logTokens(test, tokens)
|
||||
test.Log("CORRECT")
|
||||
logTokens(test, correctTokens)
|
||||
test.FailNow()
|
||||
}
|
||||
if !compareTokens(tokens, correctTokens) {
|
||||
test.Log("tokens != correctTokens:")
|
||||
test.Log("GOT")
|
||||
logTokens(test, tokens)
|
||||
test.Log("CORRECT")
|
||||
logTokens(test, correctTokens)
|
||||
test.FailNow()
|
||||
}
|
||||
// TODO: ensure runeLayout slices in the tokens reference the same
|
||||
// memory as the complete runes slice
|
||||
}
|
Loading…
Reference in New Issue
Block a user