From 127aa23a61939f0e097711d5f780a7de4f66b942 Mon Sep 17 00:00:00 2001
From: Sasha Koshka
Date: Thu, 5 Jun 2025 22:06:22 -0400
Subject: [PATCH] generate: Add PDL lexer

---
 generate/lex.go      | 230 +++++++++++++++++++++++++++++++++++++++++++
 generate/lex_test.go |  54 ++++++++++
 2 files changed, 284 insertions(+)
 create mode 100644 generate/lex.go
 create mode 100644 generate/lex_test.go

diff --git a/generate/lex.go b/generate/lex.go
new file mode 100644
index 0000000..0d9aaf8
--- /dev/null
+++ b/generate/lex.go
@@ -0,0 +1,230 @@
+package generate
+
+import "io"
+import "bufio"
+import "unicode"
+import "unicode/utf8"
+import "git.tebibyte.media/sashakoshka/goparse"
+
+const (
+	TokenMethod parse.TokenKind = iota
+	TokenKey
+	TokenIdent
+	TokenComma
+	TokenLBrace
+	TokenRBrace
+	TokenLBracket
+	TokenRBracket
+)
+
+var tokenNames = map[parse.TokenKind] string {
+	TokenMethod:   "Method",
+	TokenKey:      "Key",
+	TokenIdent:    "Ident",
+	TokenComma:    "Comma",
+	TokenLBrace:   "LBrace",
+	TokenRBrace:   "RBrace",
+	TokenLBracket: "LBracket",
+	TokenRBracket: "RBracket",
+}
+
+func Lex(fileName string, reader io.Reader) (parse.Lexer, error) {
+	lex := &lexer {
+		fileName:    fileName,
+		lineScanner: bufio.NewScanner(reader),
+	}
+	lex.nextRune()
+	return lex, nil
+}
+
+type lexer struct {
+	fileName    string
+	lineScanner *bufio.Scanner
+	rune        rune
+	line        string
+	lineFood    string
+
+	offset int
+	row    int
+	column int
+
+	eof bool
+}
+
+func (this *lexer) Next() (parse.Token, error) {
+	token, err := this.nextInternal()
+	if err == io.EOF { err = this.errUnexpectedEOF() }
+	return token, err
+}
+
+func (this *lexer) nextInternal() (token parse.Token, err error) {
+	err = this.skipWhitespace()
+	token.Position = this.pos()
+	if this.eof {
+		token.Kind = parse.EOF
+		err = nil
+		return
+	}
+	if err != nil { return }
+
+	appendRune := func () {
+		token.Value += string(this.rune)
+		err = this.nextRune()
+	}
+
+	doNumber := func () {
+		for isDigit(this.rune) {
+			appendRune()
+			if this.eof { err = nil; return }
+			if err != nil { return }
+		}
+	}
+
+	defer func () {
+		newPos := this.pos()
+		newPos.End -- // TODO figure out why tf we have to do this
+		token.Position = token.Position.Union(newPos)
+	} ()
+
+	switch {
+	// Method
+	case this.rune == 'M':
+		token.Kind = TokenMethod
+		err = this.nextRune()
+		if err != nil { return }
+		doNumber()
+		if this.eof { err = nil; return }
+	// Key
+	case isDigit(this.rune):
+		token.Kind = TokenKey
+		doNumber()
+		if this.eof { err = nil; return }
+	// Ident
+	case unicode.IsUpper(this.rune):
+		token.Kind = TokenIdent
+		for unicode.IsLetter(this.rune) || isDigit(this.rune) {
+			appendRune()
+			if this.eof { err = nil; return }
+			if err != nil { return }
+		}
+	// Comma
+	case this.rune == ',':
+		token.Kind = TokenComma
+		appendRune()
+		if this.eof { err = nil; return }
+	// LBrace
+	case this.rune == '{':
+		token.Kind = TokenLBrace
+		appendRune()
+		if this.eof { err = nil; return }
+	// RBrace
+	case this.rune == '}':
+		token.Kind = TokenRBrace
+		appendRune()
+		if this.eof { err = nil; return }
+	// LBracket
+	case this.rune == '[':
+		token.Kind = TokenLBracket
+		appendRune()
+		if this.eof { err = nil; return }
+	// RBracket
+	case this.rune == ']':
+		token.Kind = TokenRBracket
+		appendRune()
+		if this.eof { err = nil; return }
+	case unicode.IsPrint(this.rune):
+		err = parse.Errorf (
+			this.pos(), "unexpected rune '%c'",
+			this.rune)
+	default:
+		err = parse.Errorf (
+			this.pos(), "unexpected rune %U",
+			this.rune)
+	}
+
+	return
+}
+
+func (this *lexer) nextRune() error {
+	if this.lineFood == "" {
+		ok := this.lineScanner.Scan()
+		if ok {
+			this.line = this.lineScanner.Text()
+			this.lineFood = this.line
+			this.rune = '\n'
+			this.column = 0
+			this.row ++
+		} else {
+			err := this.lineScanner.Err()
+			if err == nil {
+				this.eof = true
+				return io.EOF
+			} else {
+				return err
+			}
+		}
+	} else {
+		var ch rune
+		var size int
+		for ch == 0 && this.lineFood != "" {
+			ch, size = utf8.DecodeRuneInString(this.lineFood)
+			this.lineFood = this.lineFood[size:]
+		}
+		this.rune = ch
+		this.column ++
+	}
+
+	return nil
+}
+
+func (this *lexer) skipWhitespace() error {
+	err := this.skipComment()
+	if err != nil { return err }
+	for isWhitespace(this.rune) {
+		err := this.nextRune()
+		if err != nil { return err }
+		err = this.skipComment()
+		if err != nil { return err }
+	}
+	return nil
+}
+
+func (this *lexer) skipComment() error {
+	if this.rune == ';' {
+		for this.rune != '\n' {
+			err := this.nextRune()
+			if err != nil { return err }
+		}
+	}
+	return nil
+}
+
+func (this *lexer) pos() parse.Position {
+	return parse.Position {
+		File:  this.fileName,
+		Line:  this.lineScanner.Text(),
+		Row:   this.row - 1,
+		Start: this.column - 1,
+		End:   this.column,
+	}
+}
+
+func (this *lexer) errUnexpectedEOF() error {
+	return parse.Errorf(this.pos(), "unexpected EOF")
+}
+
+func isWhitespace(char rune) bool {
+	switch char {
+	case ' ', '\t', '\r', '\n': return true
+	default: return false
+	}
+}
+
+func isDigit(char rune) bool {
+	return char >= '0' && char <= '9'
+}
+
+func isHexDigit(char rune) bool {
+	return isDigit(char) || char >= 'a' && char <= 'f' || char >= 'A' && char <= 'F'
+}
diff --git a/generate/lex_test.go b/generate/lex_test.go
new file mode 100644
index 0000000..fc4a967
--- /dev/null
+++ b/generate/lex_test.go
@@ -0,0 +1,54 @@
+package generate
+
+import "strings"
+import "testing"
+import "git.tebibyte.media/sashakoshka/goparse"
+
+func TestLex(test *testing.T) {
+	lexer, err := Lex("test.pdl", strings.NewReader(`
+	M0001 User {
+		0000 Name      String,
+		0001 Users     []User,
+		0002 Followers U32,
+	}`))
+	if err != nil { test.Fatal(parse.Format(err)) }
+
+	correctTokens := []parse.Token {
+		tok(TokenMethod, "0001"),
+		tok(TokenIdent, "User"),
+		tok(TokenLBrace, "{"),
+		tok(TokenKey, "0000"),
+		tok(TokenIdent, "Name"),
+		tok(TokenIdent, "String"),
+		tok(TokenComma, ","),
+		tok(TokenKey, "0001"),
+		tok(TokenIdent, "Users"),
+		tok(TokenLBracket, "["),
+		tok(TokenRBracket, "]"),
+		tok(TokenIdent, "User"),
+		tok(TokenComma, ","),
+		tok(TokenKey, "0002"),
+		tok(TokenIdent, "Followers"),
+		tok(TokenIdent, "U32"),
+		tok(TokenComma, ","),
+		tok(TokenRBrace, "}"),
+		tok(parse.EOF, ""),
+	}
+
+	for index, correct := range correctTokens {
+		got, err := lexer.Next()
+		if err != nil { test.Fatal(parse.Format(err)) }
+		if got.Kind != correct.Kind || got.Value != correct.Value {
+			test.Logf("token %d mismatch", index)
+			test.Log("GOT:", tokenNames[got.Kind], got.Value)
+			test.Fatal("CORRECT:", tokenNames[correct.Kind], correct.Value)
+		}
+	}
+}
+
+func tok(kind parse.TokenKind, value string) parse.Token {
+	return parse.Token {
+		Kind:  kind,
+		Value: value,
+	}
+}