diff --git a/file/error.go b/file/error.go
index e7f72e4..6a0d502 100644
--- a/file/error.go
+++ b/file/error.go
@@ -58,7 +58,8 @@ func (err Error) Error () (formattedMessage string) {
 
 		// print an arrow with a tail spanning the width of the mistake
 		columnCountdown := err.Location.column
-		for columnCountdown > 0 {
+		for columnCountdown > 1 {
+			// TODO: for tabs, print out a tab instead.
 			formattedMessage += " "
 			columnCountdown --
 		}
@@ -66,9 +67,9 @@ func (err Error) Error () (formattedMessage string) {
 			// TODO: for tabs, print out 8 of these instead.
 			formattedMessage += "-"
 		}
-		formattedMessage += "-\n"
+		formattedMessage += "^\n"
 	}
 
-	formattedMessage += err.message + "-\n"
+	formattedMessage += err.message + "\n"
 	return
 }
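The two hunks above change the padding loop to stop at 1, since columns are 1-indexed and a mistake at column n needs n-1 leading spaces, and cap the tail of dashes with a caret instead of yet another dash. A standalone sketch of the resulting layout; the function and its parameters are hypothetical, not part of this patch:

package main

import "fmt"

// underline builds the marker line that Error prints under the offending
// source line: spaces up to the 1-indexed column, then a tail of dashes
// spanning the width of the mistake, capped with a caret.
func underline (column int, width int) (marker string) {
	for columnCountdown := column; columnCountdown > 1; columnCountdown -- {
		marker += " "
	}
	for tail := 1; tail < width; tail ++ {
		marker += "-"
	}
	marker += "^"
	return
}

func main () {
	fmt.Println("someCode withAMistake")
	fmt.Println(underline(10, 12))
}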
diff --git a/lexer/lexer.go b/lexer/lexer.go
index 040b405..742e981 100644
--- a/lexer/lexer.go
+++ b/lexer/lexer.go
@@ -1,7 +1,9 @@
 package lexer
 
 import "io"
+import "fmt"
 import "github.com/sashakoshka/arf/file"
+import "github.com/sashakoshka/arf/types"
 
 // LexingOperation holds information about an ongoing lexing operation.
 type LexingOperation struct {
@@ -31,22 +33,62 @@ func (lexer *LexingOperation) tokenize () (err error) {
 	if err != nil { return }
 
 	for {
+		fmt.Println(string(lexer.char))
+
 		lowercase := lexer.char >= 'a' && lexer.char <= 'z'
 		uppercase := lexer.char >= 'A' && lexer.char <= 'Z'
 		number    := lexer.char >= '0' && lexer.char <= '9'
 
 		if number {
-			// TODO: tokenize number begin
+			err = lexer.tokenizeNumberBeginning(false)
+			if err != nil { return }
 		} else if lowercase || uppercase {
-			// TODO: tokenize alpha begin
+			err = lexer.tokenizeAlphaBeginning()
+			if err != nil { return }
 		} else {
 			err = lexer.tokenizeSymbolBeginning()
-			if err != nil { return err }
+			if err != nil { return }
 		}
 
-		// TODO: skip whitespace
+		err = lexer.skipSpaces()
+		if err != nil { return }
 	}
 
+	// ensure the token stream ends in a newline (the length guard avoids
+	// indexing an empty token slice)
+	if len(lexer.tokens) == 0 ||
+	lexer.tokens[len(lexer.tokens) - 1].kind != TokenKindNewline {
+		lexer.addToken(Token { kind: TokenKindNewline })
+	}
+
+	return
+}
+
+// tokenizeAlphaBeginning lexes a token that starts with a letter.
+func (lexer *LexingOperation) tokenizeAlphaBeginning () (err error) {
+	got := ""
+
+	for {
+		lowercase := lexer.char >= 'a' && lexer.char <= 'z'
+		uppercase := lexer.char >= 'A' && lexer.char <= 'Z'
+		number    := lexer.char >= '0' && lexer.char <= '9'
+
+		if !lowercase && !uppercase && !number { break }
+
+		got += string(lexer.char)
+
+		err = lexer.nextRune()
+		if err != nil { return }
+	}
+
+	token := Token { kind: TokenKindName, value: got }
+
+	if len(got) == 2 {
+		firstValid  := got[0] == 'n' || got[0] == 'r' || got[0] == 'w'
+		secondValid := got[1] == 'n' || got[1] == 'r' || got[1] == 'w'
+
+		if firstValid && secondValid {
+			token.kind  = TokenKindPermission
+			token.value = types.PermissionFrom(got)
+		}
+	}
+
+	lexer.addToken(token)
+
 	return
 }
@@ -55,7 +97,8 @@ func (lexer *LexingOperation) tokenizeSymbolBeginning () (err error) {
 	case '#':
 		// comment
 		for lexer.char != '\n' {
-			lexer.nextRune()
+			err = lexer.nextRune()
+			if err != nil { return }
 		}
 	case '\t':
 		// indent level
@@ -75,105 +118,146 @@ func (lexer *LexingOperation) tokenizeSymbolBeginning () (err error) {
 		lexer.addToken (Token {
 			kind: TokenKindIndent,
 		})
-		lexer.nextRune()
+		err = lexer.nextRune()
+		if err != nil { return }
 	case '\n':
 		// line break
-		// TODO: if last line was blank, (only whitespace) discard.
+		// if the last line was blank (only whitespace), discard it
+		lastLineEmpty := true
+		tokenIndex := len(lexer.tokens) - 1
+		for tokenIndex >= 0 &&
+		lexer.tokens[tokenIndex].kind != TokenKindNewline {
+			if lexer.tokens[tokenIndex].kind != TokenKindIndent {
+				lastLineEmpty = false
+				break
+			}
+			tokenIndex --
+		}
+
+		if lastLineEmpty {
+			if tokenIndex < 0 { tokenIndex = 0 }
+			lexer.tokens = lexer.tokens[:tokenIndex]
+		}
+
 		lexer.addToken (Token {
 			kind: TokenKindNewline,
 		})
-		lexer.nextRune()
+		err = lexer.nextRune()
 	case '"':
-		// TODO: tokenize string literal
-		lexer.nextRune()
+		err = lexer.tokenizeString(false)
 	case '\'':
-		// TODO: tokenize rune literal
-		lexer.nextRune()
+		err = lexer.tokenizeString(true)
 	case ':':
 		lexer.addToken (Token {
 			kind: TokenKindColon,
 		})
-		lexer.nextRune()
+		err = lexer.nextRune()
 	case '.':
 		lexer.addToken (Token {
 			kind: TokenKindDot,
 		})
-		lexer.nextRune()
+		err = lexer.nextRune()
 	case '[':
 		lexer.addToken (Token {
 			kind: TokenKindLBracket,
 		})
-		lexer.nextRune()
+		err = lexer.nextRune()
 	case ']':
 		lexer.addToken (Token {
 			kind: TokenKindRBracket,
 		})
-		lexer.nextRune()
+		err = lexer.nextRune()
 	case '{':
 		lexer.addToken (Token {
 			kind: TokenKindLBrace,
 		})
-		lexer.nextRune()
+		err = lexer.nextRune()
 	case '}':
 		lexer.addToken (Token {
 			kind: TokenKindRBrace,
 		})
-		lexer.nextRune()
+		err = lexer.nextRune()
 	case '+':
-		lexer.addToken (Token {
-			kind: TokenKindPlus,
-		})
-		lexer.nextRune()
+		err = lexer.nextRune()
+		if err != nil { return }
+		token := Token { kind: TokenKindPlus }
+		if lexer.char == '+' {
+			token.kind = TokenKindIncrement
+			// only consume the second rune when it belongs to
+			// this token
+			err = lexer.nextRune()
+		}
+		lexer.addToken(token)
 	case '-':
-		// TODO: tokenize dash begin
-		lexer.nextRune()
+		err = lexer.tokenizeDashBeginning()
 	case '*':
 		lexer.addToken (Token {
 			kind: TokenKindAsterisk,
 		})
-		lexer.nextRune()
+		err = lexer.nextRune()
	case '/':
 		lexer.addToken (Token {
 			kind: TokenKindSlash,
 		})
-		lexer.nextRune()
+		err = lexer.nextRune()
 	case '@':
 		lexer.addToken (Token {
 			kind: TokenKindAt,
 		})
-		lexer.nextRune()
+		err = lexer.nextRune()
 	case '!':
 		lexer.addToken (Token {
 			kind: TokenKindExclamation,
 		})
-		lexer.nextRune()
+		err = lexer.nextRune()
 	case '%':
 		lexer.addToken (Token {
 			kind: TokenKindPercent,
 		})
-		lexer.nextRune()
+		err = lexer.nextRune()
 	case '~':
 		lexer.addToken (Token {
 			kind: TokenKindTilde,
 		})
-		lexer.nextRune()
 	case '<':
-		// TODO: tokenize less than begin
-		lexer.nextRune()
+		err = lexer.nextRune()
+		if err != nil { return }
+		token := Token { kind: TokenKindLessThan }
+		if lexer.char == '<' {
+			token.kind = TokenKindLShift
+			err = lexer.nextRune()
+		}
+		lexer.addToken(token)
 	case '>':
-		// TODO: tokenize greater than begin
-		lexer.nextRune()
+		err = lexer.nextRune()
+		if err != nil { return }
+		token := Token { kind: TokenKindGreaterThan }
+		if lexer.char == '>' {
+			token.kind = TokenKindRShift
+			err = lexer.nextRune()
+		}
+		lexer.addToken(token)
 	case '|':
-		// TODO: tokenize bar begin
-		lexer.nextRune()
+		err = lexer.nextRune()
+		if err != nil { return }
+		token := Token { kind: TokenKindBinaryOr }
+		if lexer.char == '|' {
+			token.kind = TokenKindLogicalOr
+			err = lexer.nextRune()
+		}
+		lexer.addToken(token)
 	case '&':
-		// TODO: tokenize and begin
-		lexer.nextRune()
+		err = lexer.nextRune()
+		if err != nil { return }
+		token := Token { kind: TokenKindBinaryAnd }
+		if lexer.char == '&' {
+			token.kind = TokenKindLogicalAnd
+			err = lexer.nextRune()
+		}
+		lexer.addToken(token)
 	default:
 		err = file.NewError (
 			lexer.file.Location(), 1,
-			"unexpected character " +
+			"unexpected symbol character " +
 			string(lexer.char),
 			file.ErrorKindError)
 		return
@@ -182,10 +266,53 @@ func (lexer *LexingOperation) tokenizeSymbolBeginning () (err error) {
 	return
 }
 
+// tokenizeDashBeginning lexes a token that starts with a dash: a minus, a
+// decrement, a separator, a return direction, or a negative number literal.
+func (lexer *LexingOperation) tokenizeDashBeginning () (err error) {
+	err = lexer.nextRune()
+	if err != nil { return }
+
+	if lexer.char == '-' {
+		token := Token { kind: TokenKindDecrement }
+
+		err = lexer.nextRune()
+		if err != nil { return }
+
+		if lexer.char == '-' {
+			token.kind = TokenKindSeparator
+			err = lexer.nextRune()
+			if err != nil { return }
+		}
+		lexer.addToken(token)
+	} else if lexer.char == '>' {
+		token := Token { kind: TokenKindReturnDirection }
+
+		err = lexer.nextRune()
+		if err != nil { return }
+
+		lexer.addToken(token)
+	} else if lexer.char >= '0' && lexer.char <= '9' {
+		err = lexer.tokenizeNumberBeginning(true)
+	} else {
+		token := Token { kind: TokenKindMinus }
+		lexer.addToken(token)
+	}
+
+	return
+}
+
+// addToken adds a new token to the lexer's token slice.
 func (lexer *LexingOperation) addToken (token Token) {
 	lexer.tokens = append(lexer.tokens, token)
 }
 
+// skipSpaces skips all space characters (not tabs or newlines).
+func (lexer *LexingOperation) skipSpaces () (err error) {
+	for lexer.char == ' ' {
+		err = lexer.nextRune()
+		if err != nil { return }
+	}
+
+	return
+}
+
 // nextRune advances the lexer to the next rune in the file.
 func (lexer *LexingOperation) nextRune () (err error) {
 	lexer.char, _, err = lexer.file.ReadRune()
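Most single-symbol cases above share one shape: add the token, advance. The doubled operators (++, <<, >>, ||, &&) extend it with one rune of lookahead: consume the first rune, inspect the next, and upgrade the token kind only when it matches, consuming the second rune only in that case. A minimal sketch of that pattern using a bufio.Reader stand-in instead of the lexer's nextRune/char machinery; all names here are hypothetical:

package main

import (
	"bufio"
	"fmt"
	"strings"
)

// lexPlus assumes the '+' itself was already consumed. It returns
// "Increment" and eats the second rune when the operator is doubled;
// otherwise it returns "Plus" and leaves the rune for the next token.
func lexPlus (reader *bufio.Reader) string {
	next, _, err := reader.ReadRune()
	if err == nil && next == '+' {
		return "Increment"
	}
	if err == nil {
		reader.UnreadRune()
	}
	return "Plus"
}

func main () {
	fmt.Println(lexPlus(bufio.NewReader(strings.NewReader("+ 1")))) // Increment
	fmt.Println(lexPlus(bufio.NewReader(strings.NewReader("1"))))   // Plus
}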
diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go
index 62a5791..9143521 100644
--- a/lexer/lexer_test.go
+++ b/lexer/lexer_test.go
@@ -13,10 +13,15 @@ func TestTokenizeAll (test *testing.T) {
 	}
 
 	tokens, err := Tokenize(file)
-	test.Log("resulting error:")
-	test.Log(err.Error())
-	if err == nil {
-		test.Log("Tokenize() should have returned an error")
+
+	// print all tokens
+	for index, token := range tokens {
+		test.Log(index, "\tgot token:", token.Describe())
+	}
+
+	if err != nil {
+		test.Log("returned error:")
+		test.Log(err.Error())
 		test.Fail()
 		return
 	}
@@ -28,10 +33,10 @@ func TestTokenizeAll (test *testing.T) {
 		External: types.ModeWrite,
 		}},
 		Token { kind: TokenKindReturnDirection },
-		Token { kind: TokenKindInt, value: -349820394 },
-		Token { kind: TokenKindUInt, value: 932748397 },
+		Token { kind: TokenKindInt, value: int64(-349820394) },
+		Token { kind: TokenKindUInt, value: uint64(932748397) },
 		Token { kind: TokenKindFloat, value: 239485.37520 },
-		Token { kind: TokenKindString, value: "hello world\n" },
+		Token { kind: TokenKindString, value: "hello world!\n" },
 		Token { kind: TokenKindRune, value: 'E' },
 		Token { kind: TokenKindName, value: "helloWorld" },
 		Token { kind: TokenKindColon },
@@ -40,6 +45,7 @@ func TestTokenizeAll (test *testing.T) {
 		Token { kind: TokenKindRBracket },
 		Token { kind: TokenKindLBrace },
 		Token { kind: TokenKindRBrace },
+		Token { kind: TokenKindNewline },
 		Token { kind: TokenKindPlus },
 		Token { kind: TokenKindMinus },
 		Token { kind: TokenKindIncrement },
@@ -58,6 +64,7 @@ func TestTokenizeAll (test *testing.T) {
 		Token { kind: TokenKindLogicalOr },
 		Token { kind: TokenKindBinaryAnd },
 		Token { kind: TokenKindLogicalAnd },
+		Token { kind: TokenKindNewline },
 	}
 
 	if len(tokens) != len(correct) {
@@ -70,6 +77,9 @@ func TestTokenizeAll (test *testing.T) {
 	for index, token := range tokens {
 		if !token.Equals(correct[index]) {
 			test.Log("token", index, "not equal")
+			test.Log (
+				"have", token.Describe(),
+				"want", correct[index].Describe())
 			test.Fail()
 			return
 		}
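The int64()/uint64() conversions added to the correct slice matter because token values are held in an interface: comparing interfaces compares dynamic types as well as values, so an untyped constant (stored as int) never equals the int64 the lexer actually produces. A self-contained demonstration of that pitfall:

package main

import "fmt"

func main () {
	var have interface {} = int64(-349820394) // what the lexer stores
	var want interface {} = -349820394        // untyped constant, stored as int
	fmt.Println(have == want)                 // false: int64 vs int
	fmt.Println(have == interface {}(int64(-349820394))) // true
}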
diff --git a/lexer/numbers.go b/lexer/numbers.go
new file mode 100644
index 0000000..a08074b
--- /dev/null
+++ b/lexer/numbers.go
@@ -0,0 +1,124 @@
+package lexer
+
+import "github.com/sashakoshka/arf/file"
+
+// tokenizeNumberBeginning lexes a token that starts with a number.
+func (lexer *LexingOperation) tokenizeNumberBeginning (negative bool) (err error) {
+	var number   uint64
+	var fragment float64
+	var isFloat  bool
+
+	if lexer.char == '0' {
+		err = lexer.nextRune()
+		if err != nil { return }
+
+		if lexer.char == 'x' {
+			err = lexer.nextRune()
+			if err != nil { return }
+			number, fragment, isFloat, err = lexer.tokenizeNumber(16)
+		} else if lexer.char == 'b' {
+			err = lexer.nextRune()
+			if err != nil { return }
+			number, fragment, isFloat, err = lexer.tokenizeNumber(2)
+		} else if lexer.char == '.' {
+			number, fragment, isFloat, err = lexer.tokenizeNumber(10)
+		} else if lexer.char >= '0' && lexer.char <= '9' {
+			number, fragment, isFloat, err = lexer.tokenizeNumber(8)
+		} else {
+			return file.NewError (
+				lexer.file.Location(), 1,
+				"unexpected character in number literal",
+				file.ErrorKindError)
+		}
+	} else {
+		number, fragment, isFloat, err = lexer.tokenizeNumber(10)
+	}
+
+	if err != nil { return }
+
+	token := Token { }
+
+	if isFloat {
+		floatNumber := float64(number) + fragment
+
+		token.kind = TokenKindFloat
+		if negative {
+			token.value = floatNumber * -1
+		} else {
+			token.value = floatNumber
+		}
+	} else {
+		if negative {
+			token.kind  = TokenKindInt
+			token.value = int64(number) * -1
+		} else {
+			token.kind  = TokenKindUInt
+			token.value = uint64(number)
+		}
+	}
+
+	lexer.addToken(token)
+	return
+}
+
+// runeToDigit converts a rune from 0-F to a corresponding digit, with a
+// maximum radix. If the character is invalid, or the digit is too big, it
+// will return false for worked.
+func runeToDigit (char rune, radix uint64) (digit uint64, worked bool) {
+	worked = true
+
+	if char >= '0' && char <= '9' {
+		digit = uint64(char - '0')
+	} else if char >= 'A' && char <= 'F' {
+		digit = uint64(char - 'A' + 10)
+	} else if char >= 'a' && char <= 'f' {
+		digit = uint64(char - 'a' + 10)
+	} else {
+		worked = false
+	}
+
+	if digit >= radix {
+		worked = false
+	}
+
+	return
+}
+
+// tokenizeNumber reads and tokenizes a number with the specified radix.
+func (lexer *LexingOperation) tokenizeNumber (
+	radix uint64,
+) (
+	number   uint64,
+	fragment float64,
+	isFloat  bool,
+	err      error,
+) {
+	for {
+		digit, worked := runeToDigit(lexer.char, radix)
+		if !worked { break }
+
+		number *= radix
+		number += digit
+
+		err = lexer.nextRune()
+		if err != nil { return }
+	}
+
+	if lexer.char == '.' {
+		isFloat = true
+		err = lexer.nextRune()
+		if err != nil { return }
+
+		coef := 1 / float64(radix)
+		for {
+			digit, worked := runeToDigit(lexer.char, radix)
+			if !worked { break }
+
+			fragment += float64(digit) * coef
+
+			coef /= float64(radix)
+
+			err = lexer.nextRune()
+			if err != nil { return }
+		}
+	}
+
+	return
+}
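The parsing above is plain positional accumulation: integer digits fold in as number = number*radix + digit, and fractional digits after the '.' are weighted by 1/radix, 1/radix², and so on. A runnable condensation of the same two loops, with hypothetical names and no lexer plumbing:

package main

import "fmt"

// digitValue mirrors runeToDigit: convert one rune, rejecting digits that
// are out of range for the radix.
func digitValue (char rune, radix uint64) (digit uint64, worked bool) {
	switch {
	case char >= '0' && char <= '9': digit = uint64(char - '0')
	case char >= 'A' && char <= 'F': digit = uint64(char - 'A' + 10)
	case char >= 'a' && char <= 'f': digit = uint64(char - 'a' + 10)
	default: return 0, false
	}
	return digit, digit < radix
}

// parse runs the integer loop, then the fraction loop, exactly like
// tokenizeNumber.
func parse (text string, radix uint64) (value float64) {
	runes    := []rune(text)
	number   := uint64(0)
	fragment := float64(0)
	index    := 0

	for ; index < len(runes); index ++ {
		digit, worked := digitValue(runes[index], radix)
		if !worked { break }
		number = number * radix + digit
	}

	if index < len(runes) && runes[index] == '.' {
		coef := 1 / float64(radix)
		for index ++; index < len(runes); index ++ {
			digit, worked := digitValue(runes[index], radix)
			if !worked { break }
			fragment += float64(digit) * coef
			coef /= float64(radix)
		}
	}

	return float64(number) + fragment
}

func main () {
	fmt.Println(parse("2E.8", 16)) // 46.5: (2*16 + 14) + 8/16
	fmt.Println(parse("101", 2))   // 5
}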
diff --git a/lexer/text.go b/lexer/text.go
new file mode 100644
index 0000000..e349581
--- /dev/null
+++ b/lexer/text.go
@@ -0,0 +1,77 @@
+package lexer
+
+import "github.com/sashakoshka/arf/file"
+
+var escapeSequenceMap = map[rune] rune {
+	'a':  '\x07',
+	'b':  '\x08',
+	'f':  '\x0c',
+	'n':  '\x0a',
+	'r':  '\x0d',
+	't':  '\x09',
+	'v':  '\x0b',
+	'\'': '\'',
+	'"':  '"',
+	'\\': '\\',
+}
+
+// tokenizeString tokenizes a string literal, or a rune literal if
+// isRuneLiteral is set.
+func (lexer *LexingOperation) tokenizeString (isRuneLiteral bool) (err error) {
+	err = lexer.nextRune()
+	if err != nil { return }
+
+	got := ""
+
+	for {
+		// TODO: add hexadecimal escape codes
+		if lexer.char == '\\' {
+			err = lexer.nextRune()
+			if err != nil { return }
+
+			actual, exists := escapeSequenceMap[lexer.char]
+			if exists {
+				got += string(actual)
+			} else {
+				err = file.NewError (
+					lexer.file.Location(), 1,
+					"unknown escape character " +
+					string(lexer.char),
+					file.ErrorKindError)
+				return
+			}
+		} else {
+			got += string(lexer.char)
+		}
+
+		err = lexer.nextRune()
+		if err != nil { return }
+
+		if isRuneLiteral {
+			if lexer.char == '\'' { break }
+		} else {
+			if lexer.char == '"' { break }
+		}
+	}
+
+	err = lexer.nextRune()
+	if err != nil { return }
+
+	token := Token { }
+
+	if isRuneLiteral {
+		if len(got) > 1 {
+			err = file.NewError (
+				lexer.file.Location(), len(got) - 1,
+				"excess data in rune literal",
+				file.ErrorKindError)
+			return
+		}
+
+		token.kind  = TokenKindRune
+		token.value = []rune(got)[0]
+	} else {
+		token.kind  = TokenKindString
+		token.value = got
+	}
+
+	lexer.addToken(token)
+	return
+}
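tokenizeString drives all escape handling through escapeSequenceMap: a backslash switches the loop into a map lookup, and any rune absent from the map is reported as an error rather than passed through. A trimmed-down sketch of that lookup, with hypothetical names and only a few of the escapes:

package main

import "fmt"

var escapes = map[rune] rune {
	'n':  '\x0a',
	't':  '\x09',
	'"':  '"',
	'\\': '\\',
}

// unescape decodes backslash escapes the same way the loop above does,
// failing on unknown escape characters.
func unescape (input string) (output string, err error) {
	runes := []rune(input)
	for index := 0; index < len(runes); index ++ {
		if runes[index] != '\\' {
			output += string(runes[index])
			continue
		}

		index ++
		if index >= len(runes) {
			return "", fmt.Errorf("dangling backslash")
		}

		actual, exists := escapes[runes[index]]
		if !exists {
			return "", fmt.Errorf (
				"unknown escape character %c", runes[index])
		}
		output += string(actual)
	}
	return
}

func main () {
	decoded, _ := unescape(`hello world!\n`)
	fmt.Printf("%q\n", decoded) // "hello world!\n"
}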
diff --git a/lexer/token.go b/lexer/token.go
index e803606..0ac8298 100644
--- a/lexer/token.go
+++ b/lexer/token.go
@@ -1,5 +1,6 @@
 package lexer
 
+import "fmt"
 import "github.com/sashakoshka/arf/file"
 
 // TokenKind is an enum representing what role a token has.
@@ -84,3 +85,86 @@ func (token Token) Equals (testToken Token) (match bool) {
 func (token Token) Location () (location file.Location) {
 	return token.location
 }
+
+// Describe generates a textual description of the token to be used in debug
+// logs.
+func (token Token) Describe () (description string) {
+	switch token.kind {
+	case TokenKindNewline:
+		description += "Newline"
+	case TokenKindIndent:
+		description += "Indent"
+	case TokenKindSeparator:
+		description += "Separator"
+	case TokenKindPermission:
+		description += "Permission"
+	case TokenKindReturnDirection:
+		description += "ReturnDirection"
+	case TokenKindInt:
+		description += "Int"
+	case TokenKindUInt:
+		description += "UInt"
+	case TokenKindFloat:
+		description += "Float"
+	case TokenKindString:
+		description += "String"
+	case TokenKindRune:
+		description += "Rune"
+	case TokenKindName:
+		description += "Name"
+	case TokenKindColon:
+		description += "Colon"
+	case TokenKindDot:
+		description += "Dot"
+	case TokenKindLBracket:
+		description += "LBracket"
+	case TokenKindRBracket:
+		description += "RBracket"
+	case TokenKindLBrace:
+		description += "LBrace"
+	case TokenKindRBrace:
+		description += "RBrace"
+	case TokenKindPlus:
+		description += "Plus"
+	case TokenKindMinus:
+		description += "Minus"
+	case TokenKindIncrement:
+		description += "Increment"
+	case TokenKindDecrement:
+		description += "Decrement"
+	case TokenKindAsterisk:
+		description += "Asterisk"
+	case TokenKindSlash:
+		description += "Slash"
+	case TokenKindAt:
+		description += "At"
+	case TokenKindExclamation:
+		description += "Exclamation"
+	case TokenKindPercent:
+		description += "Percent"
+	case TokenKindTilde:
+		description += "Tilde"
+	case TokenKindLessThan:
+		description += "LessThan"
+	case TokenKindLShift:
+		description += "LShift"
+	case TokenKindGreaterThan:
+		description += "GreaterThan"
+	case TokenKindRShift:
+		description += "RShift"
+	case TokenKindBinaryOr:
+		description += "BinaryOr"
+	case TokenKindLogicalOr:
+		description += "LogicalOr"
+	case TokenKindBinaryAnd:
+		description += "BinaryAnd"
+	case TokenKindLogicalAnd:
+		description += "LogicalAnd"
+	}
+
+	if token.value != nil {
+		description += fmt.Sprint(": ", token.value)
+	}
+
+	return
+}
diff --git a/types/permission.go b/types/permission.go
index ec59771..4421058 100644
--- a/types/permission.go
+++ b/types/permission.go
@@ -3,12 +3,30 @@ package types
 type Mode int
 
 const (
-	ModeRead = iota
+	ModeNone = iota
+	ModeRead
 	ModeWrite
-	ModeNone
 )
 
 type Permission struct {
 	Internal Mode
 	External Mode
 }
+
+// ModeFrom returns the Mode represented by the specified rune.
+func ModeFrom (char rune) (mode Mode) {
+	switch (char) {
+	case 'n': mode = ModeNone
+	case 'r': mode = ModeRead
+	case 'w': mode = ModeWrite
+	}
+
+	return
+}
+
+// PermissionFrom returns the Permission represented by a two-letter
+// permission string such as "rw". Malformed input yields the zero value.
+func PermissionFrom (data string) (permission Permission) {
+	if len(data) != 2 { return }
+
+	permission.Internal = ModeFrom(rune(data[0]))
+	permission.External = ModeFrom(rune(data[1]))
+	return
+}
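Reordering the Mode constants makes ModeNone the zero value, so an uninitialized Permission now means "no access", and PermissionFrom can fall back to it for malformed input. A sketch of how the new helpers compose, assuming the types package is importable at the path shown in this patch:

package main

import "fmt"
import "github.com/sashakoshka/arf/types"

func main () {
	permission := types.PermissionFrom("rw")
	fmt.Println(permission.Internal == types.ModeRead)  // true
	fmt.Println(permission.External == types.ModeWrite) // true

	// wrong length: falls back to the zero value, ModeNone/ModeNone
	fmt.Println(types.PermissionFrom("rwx") == types.Permission { }) // true
}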