pass-test-tokenize-all #1

Merged
sashakoshka merged 20 commits from pass-test-tokenize-all into main 2022-08-11 03:07:36 -06:00
7 changed files with 491 additions and 50 deletions

View File

@@ -58,7 +58,8 @@ func (err Error) Error () (formattedMessage string) {
// print an arrow with a tail spanning the width of the mistake
columnCountdown := err.Location.column
for columnCountdown > 0 {
for columnCountdown > 1 {
// TODO: for tabs, print out a tab instead.
formattedMessage += " "
columnCountdown --
}
@@ -66,9 +67,9 @@ func (err Error) Error () (formattedMessage string) {
// TODO: for tabs, print out 8 of these instead.
formattedMessage += "-"
}
formattedMessage += "-\n"
formattedMessage += "^\n"
}
formattedMessage += err.message + "-\n"
formattedMessage += err.message + "\n"
return
}

View File

@@ -1,7 +1,9 @@
package lexer
import "io"
import "fmt"
import "github.com/sashakoshka/arf/file"
import "github.com/sashakoshka/arf/types"
// LexingOperation holds information about an ongoing lexing operation.
type LexingOperation struct {
@@ -31,22 +33,62 @@ func (lexer *LexingOperation) tokenize () (err error) {
if err != nil { return }
for {
fmt.Println(string(lexer.char))
lowercase := lexer.char >= 'a' && lexer.char <= 'z'
uppercase := lexer.char >= 'A' && lexer.char <= 'Z'
number := lexer.char >= '0' && lexer.char <= '9'
if number {
// TODO: tokenize number begin
err = lexer.tokenizeNumberBeginning(false)
if err != nil { return }
} else if lowercase || uppercase {
// TODO: tokenize alpha begin
err = lexer.tokenizeAlphaBeginning()
if err != nil { return }
} else {
err = lexer.tokenizeSymbolBeginning()
if err != nil { return err }
if err != nil { return }
}
// TODO: skip whitespace
err = lexer.skipSpaces()
if err != nil { return }
}
if lexer.tokens[len(lexer.tokens) - 1].kind != TokenKindNewline {
lexer.addToken(Token { kind: TokenKindNewline })
}
return
}
func (lexer *LexingOperation) tokenizeAlphaBeginning () (err error) {
got := ""
for {
lowercase := lexer.char >= 'a' && lexer.char <= 'z'
uppercase := lexer.char >= 'A' && lexer.char <= 'Z'
number := lexer.char >= '0' && lexer.char <= '9'
if !lowercase && !uppercase && !number { break }
got += string(lexer.char)
lexer.nextRune()
}
token := Token { kind: TokenKindName, value: got }
if len(got) == 2 {
firstValid := got[0] == 'n' || got[0] == 'r' || got[0] == 'w'
secondValid := got[1] == 'n' || got[1] == 'r' || got[1] == 'w'
if firstValid && secondValid {
token.kind = TokenKindPermission
token.value = types.PermissionFrom(got)
}
}
lexer.addToken(token)
return
}
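The two-character check above is what turns a name like "rw" into a permission token. As a minimal sketch, the same rule written as a standalone predicate (the helper name isPermissionName is hypothetical and not part of this PR):

func isPermissionName (got string) (is bool) {
	// a permission name is exactly two characters long,
	// each one of 'n', 'r', or 'w'
	if len(got) != 2 { return false }
	for _, char := range got {
		if char != 'n' && char != 'r' && char != 'w' {
			return false
		}
	}
	return true
}

Under that rule "rw" and "nn" become permission tokens, while "rwx" and "rx" stay ordinary name tokens.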
@@ -55,7 +97,8 @@ func (lexer *LexingOperation) tokenizeSymbolBeginning () (err error) {
case '#':
// comment
for lexer.char != '\n' {
lexer.nextRune()
err = lexer.nextRune()
if err != nil { return }
}
case '\t':
// indent level
@@ -75,105 +118,146 @@ func (lexer *LexingOperation) tokenizeSymbolBeginning () (err error) {
lexer.addToken (Token {
kind: TokenKindIndent,
})
lexer.nextRune()
err = lexer.nextRune()
if err != nil { return }
}
case '\n':
// line break
lastLineEmpty := true
tokenIndex := len(lexer.tokens) - 1
for lexer.tokens[tokenIndex].kind != TokenKindNewline {
if lexer.tokens[tokenIndex].kind != TokenKindIndent {
lastLineEmpty = false
break
}
tokenIndex --
}
if lastLineEmpty {
lexer.tokens = lexer.tokens[:tokenIndex]
}
// TODO: if last line was blank (only whitespace), discard.
lexer.addToken (Token {
kind: TokenKindNewline,
})
lexer.nextRune()
err = lexer.nextRune()
case '"':
// TODO: tokenize string literal
lexer.nextRune()
err = lexer.tokenizeString(false)
case '\'':
// TODO: tokenize rune literal
lexer.nextRune()
err = lexer.tokenizeString(true)
case ':':
lexer.addToken (Token {
kind: TokenKindColon,
})
lexer.nextRune()
err = lexer.nextRune()
case '.':
lexer.addToken (Token {
kind: TokenKindDot,
})
lexer.nextRune()
err = lexer.nextRune()
case '[':
lexer.addToken (Token {
kind: TokenKindLBracket,
})
lexer.nextRune()
err = lexer.nextRune()
case ']':
lexer.addToken (Token {
kind: TokenKindRBracket,
})
lexer.nextRune()
err = lexer.nextRune()
case '{':
lexer.addToken (Token {
kind: TokenKindLBrace,
})
lexer.nextRune()
err = lexer.nextRune()
case '}':
lexer.addToken (Token {
kind: TokenKindRBrace,
})
lexer.nextRune()
err = lexer.nextRune()
case '+':
lexer.addToken (Token {
kind: TokenKindPlus,
})
lexer.nextRune()
err = lexer.nextRune()
if err != nil { return }
token := Token { kind: TokenKindPlus }
if lexer.char == '+' {
token.kind = TokenKindIncrement
}
lexer.addToken(token)
err = lexer.nextRune()
case '-':
// TODO: tokenize dash begin
lexer.nextRune()
err = lexer.tokenizeDashBeginning()
case '*':
lexer.addToken (Token {
kind: TokenKindAsterisk,
})
lexer.nextRune()
err = lexer.nextRune()
case '/':
lexer.addToken (Token {
kind: TokenKindSlash,
})
lexer.nextRune()
err = lexer.nextRune()
case '@':
lexer.addToken (Token {
kind: TokenKindAt,
})
lexer.nextRune()
err = lexer.nextRune()
case '!':
lexer.addToken (Token {
kind: TokenKindExclamation,
})
lexer.nextRune()
err = lexer.nextRune()
case '%':
lexer.addToken (Token {
kind: TokenKindPercent,
})
lexer.nextRune()
err = lexer.nextRune()
case '~':
lexer.addToken (Token {
kind: TokenKindTilde,
})
lexer.nextRune()
err = lexer.nextRune()
case '<':
// TODO: tokenize less than begin
lexer.nextRune()
err = lexer.nextRune()
if err != nil { return }
token := Token { kind: TokenKindLessThan }
if lexer.char == '<' {
token.kind = TokenKindLShift
}
lexer.addToken(token)
err = lexer.nextRune()
case '>':
// TODO: tokenize greater than begin
lexer.nextRune()
err = lexer.nextRune()
if err != nil { return }
token := Token { kind: TokenKindGreaterThan }
if lexer.char == '>' {
token.kind = TokenKindRShift
}
lexer.addToken(token)
err = lexer.nextRune()
case '|':
// TODO: tokenize bar begin
lexer.nextRune()
err = lexer.nextRune()
if err != nil { return }
token := Token { kind: TokenKindBinaryOr }
if lexer.char == '|' {
token.kind = TokenKindLogicalOr
}
lexer.addToken(token)
err = lexer.nextRune()
case '&':
// TODO: tokenize and begin
lexer.nextRune()
err = lexer.nextRune()
if err != nil { return }
token := Token { kind: TokenKindBinaryAnd }
if lexer.char == '&' {
token.kind = TokenKindLogicalAnd
}
lexer.addToken(token)
err = lexer.nextRune()
default:
err = file.NewError (
lexer.file.Location(), 1,
"unexpected character " +
"unexpected symbol character " +
string(lexer.char),
file.ErrorKindError)
return
@@ -182,10 +266,53 @@ func (lexer *LexingOperation) tokenizeSymbolBeginning () (err error) {
return
}
func (lexer *LexingOperation) tokenizeDashBeginning () (err error) {
err = lexer.nextRune()
if err != nil { return }
if lexer.char == '-' {
token := Token { kind: TokenKindDecrement }
err = lexer.nextRune()
if err != nil { return }
if lexer.char == '-' {
token.kind = TokenKindSeparator
lexer.nextRune()
}
lexer.addToken(token)
} else if lexer.char == '>' {
token := Token { kind: TokenKindReturnDirection }
err = lexer.nextRune()
if err != nil { return }
lexer.addToken(token)
} else if lexer.char >= '0' && lexer.char <= '9' {
lexer.tokenizeNumberBeginning(true)
} else {
token := Token { kind: TokenKindMinus }
lexer.addToken(token)
}
return
}
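To summarize the dash handling above, a hedged sketch of the forms it distinguishes; the map itself is only an illustration, the token kinds are the ones used in this PR:

var dashForms = map[string] TokenKind {
	"-":   TokenKindMinus,           // lone dash
	"--":  TokenKindDecrement,       // two dashes
	"---": TokenKindSeparator,       // three dashes
	"->":  TokenKindReturnDirection, // dash followed by '>'
}

A dash followed directly by a digit, as in "-31", is instead handed to tokenizeNumberBeginning(true) and lexes as a negative number literal.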
// addToken adds a new token to the lexer's token slice.
func (lexer *LexingOperation) addToken (token Token) {
lexer.tokens = append(lexer.tokens, token)
}
// skipSpaces skips all space characters (not tabs or newlines)
func (lexer *LexingOperation) skipSpaces () (err error) {
for lexer.char == ' ' {
err = lexer.nextRune()
if err != nil { return }
}
return
}
// nextRune advances the lexer to the next rune in the file.
func (lexer *LexingOperation) nextRune () (err error) {
lexer.char, _, err = lexer.file.ReadRune()

View File

@@ -13,10 +13,15 @@ func TestTokenizeAll (test *testing.T) {
}
tokens, err := Tokenize(file)
test.Log("resulting error:")
test.Log(err.Error())
if err == nil {
test.Log("Tokenize() should have returned an error")
// print all tokens
for index, token := range tokens {
test.Log(index, "\tgot token:", token.Describe())
}
if err != nil {
test.Log("returned error:")
test.Log(err.Error())
test.Fail()
return
}
@@ -28,10 +33,10 @@ func TestTokenizeAll (test *testing.T) {
External: types.ModeWrite,
}},
Token { kind: TokenKindReturnDirection },
Token { kind: TokenKindInt, value: -349820394 },
Token { kind: TokenKindUInt, value: 932748397 },
Token { kind: TokenKindInt, value: int64(-349820394) },
Token { kind: TokenKindUInt, value: uint64(932748397) },
Token { kind: TokenKindFloat, value: 239485.37520 },
Token { kind: TokenKindString, value: "hello world\n" },
Token { kind: TokenKindString, value: "hello world!\n" },
Token { kind: TokenKindRune, value: 'E' },
Token { kind: TokenKindName, value: "helloWorld" },
Token { kind: TokenKindColon },
@@ -40,6 +45,7 @@ func TestTokenizeAll (test *testing.T) {
Token { kind: TokenKindRBracket },
Token { kind: TokenKindLBrace },
Token { kind: TokenKindRBrace },
Token { kind: TokenKindNewline },
Token { kind: TokenKindPlus },
Token { kind: TokenKindMinus },
Token { kind: TokenKindIncrement },
@@ -58,6 +64,7 @@ func TestTokenizeAll (test *testing.T) {
Token { kind: TokenKindLogicalOr },
Token { kind: TokenKindBinaryAnd },
Token { kind: TokenKindLogicalAnd },
Token { kind: TokenKindNewline },
}
if len(tokens) != len(correct) {
@@ -70,6 +77,9 @@ func TestTokenizeAll (test *testing.T) {
for index, token := range tokens {
if !token.Equals(correct[index]) {
test.Log("token", index, "not equal")
test.Log (
"have", token.Describe(),
"want", correct[index].Describe())
test.Fail()
return
}

lexer/numbers.go (new file, 124 lines)
View File

@@ -0,0 +1,124 @@
package lexer
import "github.com/sashakoshka/arf/file"
// tokenizeNumberBeginning lexes a token that starts with a number.
func (lexer *LexingOperation) tokenizeNumberBeginning (negative bool) (err error) {
var number uint64
var fragment float64
var isFloat bool
if lexer.char == '0' {
lexer.nextRune()
if lexer.char == 'x' {
lexer.nextRune()
number, fragment, isFloat, err = lexer.tokenizeNumber(16)
} else if lexer.char == 'b' {
lexer.nextRune()
number, fragment, isFloat, err = lexer.tokenizeNumber(2)
} else if lexer.char == '.' {
number, fragment, isFloat, err = lexer.tokenizeNumber(10)
} else if lexer.char >= '0' && lexer.char <= '9' {
number, fragment, isFloat, err = lexer.tokenizeNumber(8)
} else {
return file.NewError (
lexer.file.Location(), 1,
"unexpected character in number literal",
file.ErrorKindError)
}
} else {
number, fragment, isFloat, err = lexer.tokenizeNumber(10)
}
if err != nil { return }
token := Token { }
if isFloat {
floatNumber := float64(number) + fragment
token.kind = TokenKindFloat
if negative {
token.value = floatNumber * -1
} else {
token.value = floatNumber
}
} else {
if negative {
token.kind = TokenKindInt
token.value = int64(number) * -1
} else {
token.kind = TokenKindUInt
token.value = uint64(number)
}
}
lexer.addToken(token)
return
}
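A few hand-worked inputs for the prefix handling above (illustrative, not taken from this PR's test data):

// "0x2A"  -> radix 16 -> TokenKindUInt,  value uint64(42)
// "0b101" -> radix 2  -> TokenKindUInt,  value uint64(5)
// "017"   -> radix 8  -> TokenKindUInt,  value uint64(15)
// "0.5"   -> radix 10 -> TokenKindFloat, value 0.5
// "42"    -> radix 10 -> TokenKindUInt,  value uint64(42)
// with negative == true, the integer cases become TokenKindInt with the value negated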
// runeToDigit converts a rune from 0-F to a corresponding digit, with a maximum
// radix. If the character is invalid, or the digit is too big, it will return
// false for worked.
func runeToDigit (char rune, radix uint64) (digit uint64, worked bool) {
worked = true
if char >= '0' && char <= '9' {
digit = uint64(char - '0')
} else if char >= 'A' && char <= 'F' {
digit = uint64(char - 'A' + 10)
} else if char >= 'a' && char <= 'f' {
digit = uint64(char - 'a' + 10)
} else {
worked = false
}
if digit >= radix {
worked = false
}
return
}
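Some illustrative calls, worked by hand rather than taken from the tests:

// runeToDigit('7', 10) -> 7, true
// runeToDigit('f', 16) -> 15, true
// runeToDigit('9', 8)  -> 0, false (9 is not a valid base-8 digit)
// runeToDigit('g', 16) -> 0, false (not in 0-9, A-F, or a-f)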
// tokenizeNumber reads and tokenizes a number with the specified radix.
func (lexer *LexingOperation) tokenizeNumber (
radix uint64,
) (
number uint64,
fragment float64,
isFloat bool,
err error,
) {
for {
digit, worked := runeToDigit(lexer.char, radix)
if !worked { break }
number *= radix
number += digit
err = lexer.nextRune()
if err != nil { return }
}
if lexer.char == '.' {
isFloat = true
err = lexer.nextRune()
if err != nil { return }
coef := 1 / float64(radix)
for {
digit, worked := runeToDigit(lexer.char, radix)
if !worked { break }
fragment += float64(digit) * coef
coef /= float64(radix)
err = lexer.nextRune()
if err != nil { return }
}
}
return
}
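The fractional part is accumulated with a shrinking coefficient rather than divided out at the end. Working "6.25" through the base-10 path by hand (illustrative):

// integer loop:  number = 6
// '.' consumed:  isFloat = true, coef = 1/10
// fraction loop: fragment = 2 * 0.1        = 0.2,  coef = 0.01
//                fragment = 0.2 + 5 * 0.01 = 0.25, coef = 0.001
// the caller then combines float64(6) + 0.25 = 6.25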

lexer/text.go (new file, 77 lines)
View File

@@ -0,0 +1,77 @@
package lexer
import "github.com/sashakoshka/arf/file"
var escapeSequenceMap = map[rune] rune {
'a': '\x07',
'b': '\x08',
'f': '\x0c',
'n': '\x0a',
'r': '\x0d',
't': '\x09',
'v': '\x0b',
'\'': '\'',
'"': '"',
'\\': '\\',
}
func (lexer *LexingOperation) tokenizeString (isRuneLiteral bool) (err error) {
err = lexer.nextRune()
if err != nil { return }
got := ""
for {
// TODO: add hexadecimal escape codes
if lexer.char == '\\' {
err = lexer.nextRune()
if err != nil { return }
actual, exists := escapeSequenceMap[lexer.char]
if exists {
got += string(actual)
} else {
err = file.NewError (
lexer.file.Location(), 1,
"unknown escape character " +
string(lexer.char), file.ErrorKindError)
return
}
} else {
got += string(lexer.char)
}
err = lexer.nextRune()
if err != nil { return }
if isRuneLiteral {
if lexer.char == '\'' { break }
} else {
if lexer.char == '"' { break }
}
}
err = lexer.nextRune()
if err != nil { return }
token := Token { }
if isRuneLiteral {
if len(got) > 1 {
err = file.NewError (
lexer.file.Location(), len(got) - 1,
"excess data in rune literal",
file.ErrorKindError)
return
}
token.kind = TokenKindRune
token.value = rune([]rune(got)[0])
} else {
token.kind = TokenKindString
token.value = got
}
lexer.addToken(token)
return
}
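A few hand-worked inputs for the routine above (illustrative; the real cases live in this PR's test data):

// "hi\n"  -> TokenKindString, value "hi" plus a real newline (via escapeSequenceMap['n'])
// 'E'     -> TokenKindRune,   value 'E'
// '\t'    -> TokenKindRune,   value the tab character
// 'ab'    -> error "excess data in rune literal"
// "a\qb"  -> error "unknown escape character q"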

View File

@ -1,5 +1,6 @@
package lexer
import "fmt"
import "github.com/sashakoshka/arf/file"
// TokenKind is an enum representing what role a token has.
@@ -84,3 +85,86 @@ func (token Token) Equals (testToken Token) (match bool) {
func (token Token) Location () (location file.Location) {
return token.location
}
// Describe generates a textual description of the token to be used in debug
// logs.
func (token Token) Describe () (description string) {
switch token.kind {
case TokenKindNewline:
description += "Newline"
case TokenKindIndent:
description += "Indent"
case TokenKindSeparator:
description += "Separator"
case TokenKindPermission:
description += "Permission"
case TokenKindReturnDirection:
description += "ReturnDirection"
case TokenKindInt:
description += "Int"
case TokenKindUInt:
description += "UInt"
case TokenKindFloat:
description += "Float"
case TokenKindString:
description += "String"
case TokenKindRune:
description += "Rune"
case TokenKindName:
description += "Name"
case TokenKindColon:
description += "Colon"
case TokenKindDot:
description += "Dot"
case TokenKindLBracket:
description += "LBracket"
case TokenKindRBracket:
description += "RBracket"
case TokenKindLBrace:
description += "LBrace"
case TokenKindRBrace:
description += "RBrace"
case TokenKindPlus:
description += "Plus"
case TokenKindMinus:
description += "Minus"
case TokenKindIncrement:
description += "Increment"
case TokenKindDecrement:
description += "Decrement"
case TokenKindAsterisk:
description += "Asterisk"
case TokenKindSlash:
description += "Slash"
case TokenKindAt:
description += "At"
case TokenKindExclamation:
description += "Exclamation"
case TokenKindPercent:
description += "Percent"
case TokenKindTilde:
description += "Tilde"
case TokenKindLessThan:
description += "LessThan"
case TokenKindLShift:
description += "LShift"
case TokenKindGreaterThan:
description += "GreaterThan"
case TokenKindRShift:
description += "RShift"
case TokenKindBinaryOr:
description += "BinaryOr"
case TokenKindLogicalOr:
description += "LogicalOr"
case TokenKindBinaryAnd:
description += "BinaryAnd"
case TokenKindLogicalAnd:
description += "LogicalAnd"
}
if token.value != nil {
description += fmt.Sprint(": ", token.value)
}
return
}
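A couple of illustrative outputs, assuming the kinds and values used in the test above:

// Token { kind: TokenKindInt, value: int64(-5) }.Describe() -> "Int: -5"
// Token { kind: TokenKindNewline }.Describe()               -> "Newline"
// the ": value" suffix only appears when token.value is non-nil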

View File

@@ -3,12 +3,30 @@ package types
type Mode int
const (
ModeRead = iota
ModeNone = iota
ModeRead
ModeWrite
ModeNone
)
type Permission struct {
Internal Mode
External Mode
}
func ModeFrom (char rune) (mode Mode) {
switch (char) {
case 'n': mode = ModeNone
case 'r': mode = ModeRead
case 'w': mode = ModeWrite
}
return
}
func PermissionFrom (data string) (permission Permission) {
if len(data) != 2 { return }
permission.Internal = ModeFrom(rune(data[0]))
permission.External = ModeFrom(rune(data[1]))
return
}
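With ModeNone moved to the top of the iota block, the zero value of Mode (and therefore of Permission) now means "no access", which is also what PermissionFrom falls back to for malformed input. Illustrative calls:

// PermissionFrom("rw")  -> Permission { Internal: ModeRead, External: ModeWrite }
// PermissionFrom("nr")  -> Permission { Internal: ModeNone, External: ModeRead }
// PermissionFrom("rwx") -> Permission { } (length is not 2, so both modes stay ModeNone)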