package lexer
import "io"
import "fmt"
import "github.com/sashakoshka/arf/file"
// LexingOperation holds information about an ongoing lexing operation.
type LexingOperation struct {
	file   *file.File // the file being tokenized
	char   rune       // the rune currently under the cursor
	tokens []Token    // the tokens lexed so far
}
// Tokenize converts a file into a slice of tokens (lexemes).
func Tokenize (file *file.File) (tokens []Token, err error) {
	lexer := LexingOperation { file: file }
	err = lexer.tokenize()
	tokens = lexer.tokens
	// if the lexing operation returned io.EOF, nothing went wrong so we
	// return nil for err.
	if err == io.EOF {
		err = nil
	}
	return
}
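
// A hypothetical usage sketch (this assumes a file.Open constructor in the
// file package, which is not defined in this file):
//
//	sourceFile, err := file.Open("main.arf")
//	if err != nil { /* handle error */ }
//	tokens, err := lexer.Tokenize(sourceFile)
//	if err != nil { /* handle error */ }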
// tokenize converts a file into a slice of tokens (lexemes). It will always
// return a non-nil error, but if nothing went wrong it will return io.EOF.
func (lexer *LexingOperation) tokenize () (err error) {
	err = lexer.nextRune()
	if err != nil { return }

	for {
		lowercase := lexer.char >= 'a' && lexer.char <= 'z'
		uppercase := lexer.char >= 'A' && lexer.char <= 'Z'
		number    := lexer.char >= '0' && lexer.char <= '9'
		if number {
			// TODO: tokenize number begin
			err = lexer.nextRune()
			if err != nil { return }
		} else if lowercase || uppercase {
			// TODO: tokenize alpha begin
			err = lexer.nextRune()
			if err != nil { return }
		} else {
			err = lexer.tokenizeSymbolBeginning()
			if err != nil { return }
		}
		err = lexer.skipSpaces()
		if err != nil { return }
	}

	return
}
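
// As a worked example of the main loop above: lexing the input "+ 1]" ending
// with a newline would emit TokenKindPlus, skip the space, consume the digit
// without emitting anything (number tokenization is still a TODO stub), then
// emit TokenKindRBracket and TokenKindNewline before hitting io.EOF.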
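// tokenizeSymbolBeginning lexes a token that begins with a symbol or
// whitespace rune, appending it to the lexer's token slice.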
func (lexer *LexingOperation) tokenizeSymbolBeginning () (err error) {
	fmt.Println(string(lexer.char)) // debug output
	switch lexer.char {
	case '#':
		// comment; consume runes until the end of the line
		for lexer.char != '\n' {
			err = lexer.nextRune()
			if err != nil { return }
		}
	case '\t':
		// indent level. a tab only counts as indentation when it
		// comes directly after a newline or another indent.
		if len(lexer.tokens) > 0 {
			previousToken := lexer.tokens[len(lexer.tokens) - 1]

			if !previousToken.Is(TokenKindNewline) &&
				!previousToken.Is(TokenKindIndent) {

				file.NewError (
					lexer.file.Location(), 1,
					"tab not used as indent",
					file.ErrorKindWarn)
				break
			}
		}

		for lexer.char == '\t' {
			lexer.addToken (Token {
				kind: TokenKindIndent,
			})
			err = lexer.nextRune()
			if err != nil { return }
		}
	case '\n':
		// line break
		// TODO: if last line was blank, (only whitespace) discard.
		lexer.addToken (Token {
			kind: TokenKindNewline,
		})
		err = lexer.nextRune()
	case '"':
		// TODO: tokenize string literal
		err = lexer.nextRune()
	case '\'':
		// TODO: tokenize rune literal
		err = lexer.nextRune()
	case ':':
		lexer.addToken (Token {
			kind: TokenKindColon,
		})
		err = lexer.nextRune()
	case '.':
		lexer.addToken (Token {
			kind: TokenKindDot,
		})
		err = lexer.nextRune()
	case '[':
		lexer.addToken (Token {
			kind: TokenKindLBracket,
		})
		err = lexer.nextRune()
	case ']':
		lexer.addToken (Token {
			kind: TokenKindRBracket,
		})
		err = lexer.nextRune()
	case '{':
		lexer.addToken (Token {
			kind: TokenKindLBrace,
		})
		err = lexer.nextRune()
	case '}':
		lexer.addToken (Token {
			kind: TokenKindRBrace,
		})
		err = lexer.nextRune()
	case '+':
		lexer.addToken (Token {
			kind: TokenKindPlus,
		})
		err = lexer.nextRune()
	case '-':
		// TODO: tokenize dash begin
		err = lexer.nextRune()
	case '*':
		lexer.addToken (Token {
			kind: TokenKindAsterisk,
		})
		err = lexer.nextRune()
	case '/':
		lexer.addToken (Token {
			kind: TokenKindSlash,
		})
		err = lexer.nextRune()
	case '@':
		lexer.addToken (Token {
			kind: TokenKindAt,
		})
		err = lexer.nextRune()
	case '!':
		lexer.addToken (Token {
			kind: TokenKindExclamation,
		})
		err = lexer.nextRune()
	case '%':
		lexer.addToken (Token {
			kind: TokenKindPercent,
		})
		err = lexer.nextRune()
	case '~':
		lexer.addToken (Token {
			kind: TokenKindTilde,
		})
		err = lexer.nextRune()
	case '<':
		// TODO: tokenize less than begin
		err = lexer.nextRune()
	case '>':
		// TODO: tokenize greater than begin
		err = lexer.nextRune()
	case '|':
		// TODO: tokenize bar begin
		err = lexer.nextRune()
	case '&':
		// TODO: tokenize and begin
		err = lexer.nextRune()
	default:
		err = file.NewError (
			lexer.file.Location(), 1,
			"unexpected character " +
			string(lexer.char),
			file.ErrorKindError)
		return
	}

	return
}
// addToken adds a new token to the lexer's token slice.
func (lexer *LexingOperation) addToken (token Token) {
	lexer.tokens = append(lexer.tokens, token)
}
// skipSpaces skips all space characters (not tabs or newlines).
func (lexer *LexingOperation) skipSpaces () (err error) {
	for lexer.char == ' ' {
		err = lexer.nextRune()
		if err != nil { return }
	}

	return
}
// nextRune advances the lexer to the next rune in the file.
func (lexer *LexingOperation) nextRune () (err error) {
	lexer.char, _, err = lexer.file.ReadRune()
	if err != nil && err != io.EOF {
		return file.NewError (
			lexer.file.Location(), 1,
			err.Error(), file.ErrorKindError)
	}
	return
}