From 569defdb36e7af0c2e01e30feaa84b4cfd0cd032 Mon Sep 17 00:00:00 2001
From: Sasha Koshka <sashakoshka@tebibyte.media>
Date: Tue, 10 Sep 2024 11:19:57 -0400
Subject: [PATCH] Add parsing stage

---
 parse.go      |  70 ++++++++++++++++++++++++++++
 parse_test.go | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 196 insertions(+)
 create mode 100644 parse.go
 create mode 100644 parse_test.go

diff --git a/parse.go b/parse.go
new file mode 100644
index 0000000..e7d816e
--- /dev/null
+++ b/parse.go
@@ -0,0 +1,70 @@
+package typeset
+
+import "unicode"
+
+// TODO perhaps follow the unicode line breaking algorithm (UAX #14): https://unicode.org/reports/tr14/
+
+// parseString splits text into one runeLayout per rune, and groups those runes
+// into tokens. the runes of each token are a sub-slice of the returned rune
+// slice, and are indexed by rune rather than by byte, so multi-byte and empty
+// text are safe. a "\n" or a "\r\n" pair forms a single line break token.
+func parseString (text string) ([]runeLayout, []token) {
+	// runes gets its final capacity up front so that append never moves it,
+	// which keeps every token pointing into the slice that is returned.
+	// TODO find an optimal size for tokens to minimize allocs.
+	runes  := make([]runeLayout, 0, len([]rune(text)))
+	tokens := make([]token,      0, len(text) / 4)
+
+	var startingIndex int
+	var lastRune      rune
+
+	var tok token
+	// tokenBoundary ends the current token at the runes collected so far,
+	// saving it unless it is empty, and then starts a blank one.
+	tokenBoundary := func () {
+		if startingIndex != len(runes) {
+			tok.runes = runes[startingIndex:]
+			startingIndex = len(runes)
+			tokens = append(tokens, tok)
+		}
+		tok = token { }
+	}
+	// mustBeInToken starts a new token of the given kind, unless the
+	// current token already is of that kind.
+	mustBeInToken := func (kind tokenKind) {
+		if tok.kind != kind {
+			tokenBoundary()
+			tok.kind = kind
+		}
+	}
+
+	for _, run := range text {
+		// the current rune is appended only after the switch, so that any
+		// boundary made for it falls before it and not after it.
+		switch {
+		case run == '\r':
+			tokenBoundary()
+			// we don't know the token type yet. if next rune is a
+			// \n then this is a CRLF line break. if not, this is
+			// just a word.
+
+		case run == '\n':
+			// after a \r, this continues the same token as a CRLF line break.
+			if lastRune != '\r' { tokenBoundary() }
+			tok.kind = tokenKindLineBreak
+
+		case run == '\t':
+			mustBeInToken(tokenKindTab)
+
+		case unicode.IsSpace(run):
+			mustBeInToken(tokenKindSpace)
+
+		default:
+			mustBeInToken(tokenKindWord)
+		}
+		runes = append(runes, runeLayout { run: run })
+		lastRune = run
+	}
+	tokenBoundary()
+	return runes, tokens
+}
diff --git a/parse_test.go b/parse_test.go
new file mode 100644
index 0000000..5c9ce4c
--- /dev/null
+++ b/parse_test.go
@@ -0,0 +1,126 @@
+package typeset
+
+import "slices"
+import "testing"
+
+func rl (run rune) (layout runeLayout) { layout.run = run; return } // layout of just run
+// tk builds a token of the given kind whose runes are the runes of value, each
+// wrapped by rl. the layouts are freshly allocated, so they share no memory
+// with any other slice.
+func tk (kind tokenKind, value string) token {
+	chars   := []rune(value)
+	layouts := make([]runeLayout, 0, len(chars))
+	for _, char := range chars {
+		layouts = append(layouts, rl(char))
+	}
+	return token { kind: kind, runes: layouts }
+}
+// compareTokens reports whether got and correct hold the same tokens: they
+// must be of equal length, and the tokens at each index must match in kind,
+// width, and runes. unequal lengths are a mismatch rather than a panic.
+func compareTokens (got, correct []token) bool {
+	sameToken := func (tok, correctTok token) bool {
+		return correctTok.kind  == tok.kind &&
+			correctTok.width == tok.width &&
+			slices.Equal(correctTok.runes, tok.runes)
+	}
+	return slices.EqualFunc(got, correct, sameToken)
+}
+// logTokens logs each of tokens on a line of its own: the token itself, padded
+// to 40 columns, followed by its runes.
+func logTokens (test *testing.T, tokens []token) {
+	for _, tok := range tokens { test.Logf("- %-40v | %v", tok, tok.runes) }
+}
+
+// TestParseString feeds parseString a string that mixes words, a space, tabs,
+// a stray \r, and both \n and \r\n line breaks, then compares the returned
+// runes and tokens against hand-written lists.
+func TestParseString (test *testing.T) {
+	gotRunes, gotTokens := parseString("hello \rworld!\nfoo\n\r\nbar\tbaz\t\tsomething")
+
+	wantRunes := []runeLayout {
+		{ run: 'h' },
+		{ run: 'e' },
+		{ run: 'l' },
+		{ run: 'l' },
+		{ run: 'o' },
+		{ run: ' ' },
+		{ run: '\r' },
+		{ run: 'w' },
+		{ run: 'o' },
+		{ run: 'r' },
+		{ run: 'l' },
+		{ run: 'd' },
+		{ run: '!' },
+		{ run: '\n' },
+		{ run: 'f' },
+		{ run: 'o' },
+		{ run: 'o' },
+		{ run: '\n' },
+		{ run: '\r' },
+		{ run: '\n' },
+		{ run: 'b' },
+		{ run: 'a' },
+		{ run: 'r' },
+		{ run: '\t' },
+		{ run: 'b' },
+		{ run: 'a' },
+		{ run: 'z' },
+		{ run: '\t' },
+		{ run: '\t' },
+		{ run: 's' },
+		{ run: 'o' },
+		{ run: 'm' },
+		{ run: 'e' },
+		{ run: 't' },
+		{ run: 'h' },
+		{ run: 'i' },
+		{ run: 'n' },
+		{ run: 'g' },
+	}
+	// the input cut into tokens. the stray \r sticks to the word after it.
+	wantTokens := []token {
+		tk(tokenKindWord,      "hello"),
+		tk(tokenKindSpace,     " "),
+		tk(tokenKindWord,      "\rworld!"),
+		tk(tokenKindLineBreak, "\n"),
+		tk(tokenKindWord,      "foo"),
+		tk(tokenKindLineBreak, "\n"),
+		tk(tokenKindLineBreak, "\r\n"),
+		tk(tokenKindWord,      "bar"),
+		tk(tokenKindTab,       "\t"),
+		tk(tokenKindWord,      "baz"),
+		tk(tokenKindTab,       "\t\t"),
+		tk(tokenKindWord,      "something"),
+	}
+	// reportTokens logs both token lists and then stops the test as failed.
+	reportTokens := func () {
+		test.Log("GOT")
+		logTokens(test, gotTokens)
+		test.Log("CORRECT")
+		logTokens(test, wantTokens)
+		test.FailNow()
+	}
+
+	// each length is checked ahead of the contents it belongs to.
+	if len(gotRunes) != len(wantRunes) {
+		test.Logf("len(runes) != len(correctRunes): %d, %d", len(gotRunes), len(wantRunes))
+		test.Log(gotRunes)
+		test.Fatal(wantRunes)
+	}
+	if !slices.Equal(gotRunes, wantRunes) {
+		test.Log("runes != correctRunes:")
+		test.Log(gotRunes)
+		test.Fatal(wantRunes)
+	}
+	if len(gotTokens) != len(wantTokens) {
+		test.Logf("len(tokens) != len(correctTokens): %d, %d", len(gotTokens), len(wantTokens))
+		reportTokens()
+	}
+	if !compareTokens(gotTokens, wantTokens) {
+		test.Log("tokens != correctTokens:")
+		reportTokens()
+	}
+	// TODO: ensure runeLayout slices in the tokens reference the same
+	// memory as the complete runes slice
+}