This repository has been archived on 2024-02-27. You can view files and clone it, but cannot push or open issues or pull requests.
arf/lexer/lexer.go

461 lines
11 KiB
Go
Raw Normal View History

2022-10-11 22:48:55 -06:00
/*
Package lexer implements a tokenizer for the ARF language. It contains a
function called Tokenize which takes in a file from the ARF file package, and
outputs an array of tokens.
*/
2022-08-07 13:18:59 -06:00
package lexer
2022-08-09 18:45:06 -06:00
import "io"
2022-08-29 23:11:10 -06:00
import "git.tebibyte.media/arf/arf/file"
import "git.tebibyte.media/arf/arf/types"
import "git.tebibyte.media/arf/arf/infoerr"
2022-08-07 13:18:59 -06:00
2022-10-11 22:00:34 -06:00
// lexingOperation holds information about an ongoing lexing operataion.
type lexingOperation struct {
file *file.File
char rune
tokens []Token
2022-08-07 13:18:59 -06:00
}
2022-08-09 18:45:06 -06:00
// Tokenize converts a file into a slice of tokens (lexemes).
func Tokenize (file *file.File) (tokens []Token, err error) {
2022-10-11 22:00:34 -06:00
lexer := lexingOperation { file: file }
err = lexer.tokenize()
tokens = lexer.tokens
2022-08-09 18:45:06 -06:00
// if the lexing operation returned io.EOF, nothing went wrong so we
// return nil for err.
if err == io.EOF {
err = nil
}
return
}
// tokenize converts a file into a slice of tokens (lexemes). It will always
// return a non-nil error, but if nothing went wrong it will return io.EOF.
2022-10-11 22:00:34 -06:00
func (lexer *lexingOperation) tokenize () (err error) {
// check to see if the beginning of the file says :arf
var shebangCheck = []rune(":arf\n")
for index := 0; index < 5; index ++ {
err = lexer.nextRune()
if err != nil || shebangCheck[index] != lexer.char {
err = infoerr.NewError (
lexer.file.Location(1),
"not an arf file",
infoerr.ErrorKindError)
return
}
}
2022-08-09 18:45:06 -06:00
err = lexer.nextRune()
if err != nil { return }
for {
lowercase := lexer.char >= 'a' && lexer.char <= 'z'
uppercase := lexer.char >= 'A' && lexer.char <= 'Z'
number := lexer.char >= '0' && lexer.char <= '9'
if number {
2022-08-10 23:14:41 -06:00
err = lexer.tokenizeNumberBeginning(false)
if err != nil { return }
2022-08-09 18:45:06 -06:00
} else if lowercase || uppercase {
err = lexer.tokenizeAlphaBeginning()
if err != nil { return }
2022-08-11 00:00:57 -06:00
} else {
2022-08-09 20:18:12 -06:00
err = lexer.tokenizeSymbolBeginning()
if err != nil { return }
2022-08-09 18:45:06 -06:00
}
err = lexer.skipSpaces()
if err != nil { return }
2022-08-09 18:45:06 -06:00
}
2022-10-11 22:00:34 -06:00
// TODO: figure out why this is here and what its proper place is
// because it is apparently unreachable
2022-08-11 02:47:42 -06:00
if lexer.tokens[len(lexer.tokens) - 1].kind != TokenKindNewline {
token := lexer.newToken()
token.kind = TokenKindNewline
lexer.addToken(token)
2022-08-11 02:47:42 -06:00
}
2022-08-09 18:45:06 -06:00
return
2022-08-07 13:18:59 -06:00
}
2022-10-11 22:00:34 -06:00
func (lexer *lexingOperation) tokenizeAlphaBeginning () (err error) {
token := lexer.newToken()
token.kind = TokenKindName
got := ""
for {
lowercase := lexer.char >= 'a' && lexer.char <= 'z'
uppercase := lexer.char >= 'A' && lexer.char <= 'Z'
number := lexer.char >= '0' && lexer.char <= '9'
if !lowercase && !uppercase && !number { break }
got += string(lexer.char)
lexer.nextRune()
}
token.value = got
2022-08-18 09:35:48 -06:00
token.location.SetWidth(len(got))
if len(got) == 2 {
permission, isPermission := types.PermissionFrom(got)
if isPermission {
token.kind = TokenKindPermission
token.value = permission
}
}
lexer.addToken(token)
return
}
2022-10-11 22:00:34 -06:00
func (lexer *lexingOperation) tokenizeSymbolBeginning () (err error) {
2022-08-09 20:18:12 -06:00
switch lexer.char {
2022-08-09 23:03:59 -06:00
case '#':
// comment
for lexer.char != '\n' {
err = lexer.nextRune()
if err != nil { return }
2022-08-09 23:03:59 -06:00
}
2022-08-09 20:18:12 -06:00
case '\t':
2022-08-09 23:03:59 -06:00
// indent level
previousToken := lexer.tokens[len(lexer.tokens) - 1]
if !previousToken.Is(TokenKindNewline) {
err = lexer.nextRune()
infoerr.NewError (
lexer.file.Location(1),
"tab not used as indent",
infoerr.ErrorKindWarn).Print()
return
}
token := lexer.newToken()
token.kind = TokenKindIndent
// eat up tabs while increasing the indent level
indentLevel := 0
2022-08-09 20:18:12 -06:00
for lexer.char == '\t' {
indentLevel ++
err = lexer.nextRune()
if err != nil { return }
2022-08-09 20:18:12 -06:00
}
token.value = indentLevel
2022-08-18 09:35:48 -06:00
token.location.SetWidth(indentLevel)
lexer.addToken(token)
2022-08-09 23:03:59 -06:00
case '\n':
// line break
// if the last line is empty, discard it
2022-08-11 02:47:42 -06:00
lastLineEmpty := true
tokenIndex := len(lexer.tokens) - 1
for lexer.tokens[tokenIndex].kind != TokenKindNewline {
if lexer.tokens[tokenIndex].kind != TokenKindIndent {
lastLineEmpty = false
break
}
tokenIndex --
}
if lastLineEmpty {
lexer.tokens = lexer.tokens[:tokenIndex]
}
token := lexer.newToken()
token.kind = TokenKindNewline
lexer.addToken(token)
err = lexer.nextRune()
2022-08-09 20:18:12 -06:00
case '\'':
2022-10-04 14:35:00 -06:00
err = lexer.tokenizeString()
2022-08-09 20:18:12 -06:00
case ':':
token := lexer.newToken()
token.kind = TokenKindColon
lexer.addToken(token)
err = lexer.nextRune()
2022-08-09 20:18:12 -06:00
case '.':
token := lexer.newToken()
2022-08-16 18:24:27 -06:00
err = lexer.nextRune()
if err != nil { return }
token.kind = TokenKindDot
2022-08-16 18:24:27 -06:00
if lexer.char == '.' {
token.kind = TokenKindElipsis
err = lexer.nextRune()
2022-08-18 09:35:48 -06:00
token.location.SetWidth(2)
2022-08-16 18:24:27 -06:00
}
lexer.addToken(token)
2022-08-15 12:50:09 -06:00
case ',':
token := lexer.newToken()
token.kind = TokenKindComma
lexer.addToken(token)
err = lexer.nextRune()
2022-09-13 15:04:43 -06:00
case '(':
token := lexer.newToken()
token.kind = TokenKindLParen
lexer.addToken(token)
err = lexer.nextRune()
case ')':
token := lexer.newToken()
token.kind = TokenKindRParen
lexer.addToken(token)
err = lexer.nextRune()
2022-08-09 20:18:12 -06:00
case '[':
token := lexer.newToken()
token.kind = TokenKindLBracket
lexer.addToken(token)
err = lexer.nextRune()
2022-08-09 20:18:12 -06:00
case ']':
token := lexer.newToken()
token.kind = TokenKindRBracket
lexer.addToken(token)
err = lexer.nextRune()
2022-08-09 20:18:12 -06:00
case '{':
token := lexer.newToken()
token.kind = TokenKindLBrace
lexer.addToken(token)
err = lexer.nextRune()
2022-08-09 20:18:12 -06:00
case '}':
token := lexer.newToken()
token.kind = TokenKindRBrace
lexer.addToken(token)
err = lexer.nextRune()
2022-08-09 22:48:18 -06:00
case '+':
token := lexer.newToken()
err = lexer.nextRune()
if err != nil { return }
token.kind = TokenKindPlus
if lexer.char == '+' {
token.kind = TokenKindIncrement
err = lexer.nextRune()
2022-08-18 09:35:48 -06:00
token.location.SetWidth(2)
}
lexer.addToken(token)
2022-08-09 22:48:18 -06:00
case '-':
err = lexer.tokenizeDashBeginning()
2022-08-09 22:48:18 -06:00
case '*':
token := lexer.newToken()
token.kind = TokenKindAsterisk
lexer.addToken(token)
err = lexer.nextRune()
2022-08-09 22:48:18 -06:00
case '/':
token := lexer.newToken()
token.kind = TokenKindSlash
lexer.addToken(token)
err = lexer.nextRune()
2022-08-09 22:48:18 -06:00
case '@':
token := lexer.newToken()
token.kind = TokenKindAt
lexer.addToken(token)
err = lexer.nextRune()
2022-08-09 22:48:18 -06:00
case '!':
token := lexer.newToken()
2022-08-24 22:05:40 -06:00
err = lexer.nextRune()
if err != nil { return }
token.kind = TokenKindExclamation
2022-08-24 22:05:40 -06:00
if lexer.char == '=' {
token.kind = TokenKindNotEqualTo
err = lexer.nextRune()
token.location.SetWidth(2)
}
lexer.addToken(token)
2022-08-09 22:48:18 -06:00
case '%':
token := lexer.newToken()
err = lexer.nextRune()
if err != nil { return }
token.kind = TokenKindPercent
if lexer.char == '=' {
token.kind = TokenKindPercentAssignment
err = lexer.nextRune()
token.location.SetWidth(2)
}
lexer.addToken(token)
2022-08-09 22:48:18 -06:00
case '~':
token := lexer.newToken()
err = lexer.nextRune()
if err != nil { return }
token.kind = TokenKindTilde
if lexer.char == '=' {
token.kind = TokenKindTildeAssignment
err = lexer.nextRune()
token.location.SetWidth(2)
}
lexer.addToken(token)
2022-08-24 22:05:40 -06:00
case '=':
token := lexer.newToken()
err = lexer.nextRune()
if err != nil { return }
token.kind = TokenKindAssignment
if lexer.char == '=' {
token.kind = TokenKindEqualTo
err = lexer.nextRune()
token.location.SetWidth(2)
}
lexer.addToken(token)
2022-08-09 22:48:18 -06:00
case '<':
token := lexer.newToken()
err = lexer.nextRune()
if err != nil { return }
token.kind = TokenKindLessThan
if lexer.char == '<' {
token.kind = TokenKindLShift
err = lexer.nextRune()
2022-08-18 09:35:48 -06:00
token.location.SetWidth(2)
if lexer.char == '=' {
token.kind = TokenKindLShiftAssignment
err = lexer.nextRune()
token.location.SetWidth(3)
}
2022-08-24 22:05:40 -06:00
} else if lexer.char == '=' {
token.kind = TokenKindLessThanEqualTo
err = lexer.nextRune()
token.location.SetWidth(2)
}
lexer.addToken(token)
2022-08-09 22:48:18 -06:00
case '>':
token := lexer.newToken()
err = lexer.nextRune()
if err != nil { return }
token.kind = TokenKindGreaterThan
if lexer.char == '>' {
token.kind = TokenKindRShift
err = lexer.nextRune()
2022-08-18 09:35:48 -06:00
token.location.SetWidth(2)
if lexer.char == '=' {
token.kind = TokenKindRShiftAssignment
err = lexer.nextRune()
token.location.SetWidth(3)
}
2022-08-24 22:05:40 -06:00
} else if lexer.char == '=' {
token.kind = TokenKindGreaterThanEqualTo
err = lexer.nextRune()
token.location.SetWidth(2)
}
lexer.addToken(token)
2022-08-09 22:48:18 -06:00
case '|':
token := lexer.newToken()
err = lexer.nextRune()
if err != nil { return }
token.kind = TokenKindBinaryOr
if lexer.char == '|' {
token.kind = TokenKindLogicalOr
err = lexer.nextRune()
2022-08-18 09:35:48 -06:00
token.location.SetWidth(2)
} else if lexer.char == '=' {
token.kind = TokenKindBinaryOrAssignment
err = lexer.nextRune()
token.location.SetWidth(2)
}
lexer.addToken(token)
2022-08-09 22:48:18 -06:00
case '&':
token := lexer.newToken()
err = lexer.nextRune()
if err != nil { return }
token.kind = TokenKindBinaryAnd
if lexer.char == '&' {
token.kind = TokenKindLogicalAnd
err = lexer.nextRune()
2022-08-18 09:35:48 -06:00
token.location.SetWidth(2)
} else if lexer.char == '=' {
token.kind = TokenKindBinaryAndAssignment
err = lexer.nextRune()
token.location.SetWidth(2)
}
lexer.addToken(token)
2022-08-29 23:33:58 -06:00
case '^':
token := lexer.newToken()
err = lexer.nextRune()
if err != nil { return }
token.kind = TokenKindBinaryXor
if lexer.char == '=' {
token.kind = TokenKindBinaryXorAssignment
err = lexer.nextRune()
token.location.SetWidth(2)
}
lexer.addToken(token)
2022-08-09 20:18:12 -06:00
default:
err = infoerr.NewError (
lexer.file.Location(1),
"unexpected symbol character " +
2022-08-09 20:18:12 -06:00
string(lexer.char),
infoerr.ErrorKindError)
2022-08-09 20:18:12 -06:00
return
}
return
}
2022-10-11 22:00:34 -06:00
func (lexer *lexingOperation) tokenizeDashBeginning () (err error) {
token := lexer.newToken()
2022-08-11 00:10:34 -06:00
err = lexer.nextRune()
if err != nil { return }
if lexer.char == '-' {
token.kind = TokenKindDecrement
2022-08-18 09:35:48 -06:00
token.location.SetWidth(2)
2022-08-11 00:10:34 -06:00
err = lexer.nextRune()
if err != nil { return }
if lexer.char == '-' {
token.kind = TokenKindSeparator
lexer.nextRune()
2022-08-18 09:35:48 -06:00
token.location.SetWidth(3)
2022-08-11 00:10:34 -06:00
}
lexer.addToken(token)
} else if lexer.char == '>' {
token.kind = TokenKindReturnDirection
2022-08-18 09:35:48 -06:00
token.location.SetWidth(2)
err = lexer.nextRune()
2022-08-11 00:10:34 -06:00
if err != nil { return }
2022-08-11 00:10:34 -06:00
lexer.addToken(token)
} else if lexer.char >= '0' && lexer.char <= '9' {
lexer.tokenizeNumberBeginning(true)
} else {
token.kind = TokenKindMinus
2022-08-11 00:10:34 -06:00
lexer.addToken(token)
}
return
}
// newToken creates a new token from the lexer's current position in the file.
2022-10-11 22:00:34 -06:00
func (lexer *lexingOperation) newToken () (token Token) {
return Token { location: lexer.file.Location(1) }
}
// addToken adds a new token to the lexer's token slice.
2022-10-11 22:00:34 -06:00
func (lexer *lexingOperation) addToken (token Token) {
lexer.tokens = append(lexer.tokens, token)
}
// skipSpaces skips all space characters (not tabs or newlines)
2022-10-11 22:00:34 -06:00
func (lexer *lexingOperation) skipSpaces () (err error) {
for lexer.char == ' ' {
err = lexer.nextRune()
if err != nil { return }
}
return
}
2022-08-09 18:45:06 -06:00
// nextRune advances the lexer to the next rune in the file.
2022-10-11 22:00:34 -06:00
func (lexer *lexingOperation) nextRune () (err error) {
2022-08-09 18:45:06 -06:00
lexer.char, _, err = lexer.file.ReadRune()
if err != nil && err != io.EOF {
return infoerr.NewError (
lexer.file.Location(1),
err.Error(), infoerr.ErrorKindError)
2022-08-09 18:45:06 -06:00
}
2022-08-07 13:18:59 -06:00
return
}