diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go index 2dca3f8..4885e1a 100644 --- a/lexer/lexer_test.go +++ b/lexer/lexer_test.go @@ -133,5 +133,7 @@ func TestTokenizeText (test *testing.T) { Token { kind: TokenKindRune, value: '"' }, Token { kind: TokenKindRune, value: '\\' }, Token { kind: TokenKindNewline }, + Token { kind: TokenKindString, value: "hello world \x40\u0040\U00000040!" }, + Token { kind: TokenKindNewline }, }, test) } diff --git a/lexer/text.go b/lexer/text.go index e349581..31f3e3a 100644 --- a/lexer/text.go +++ b/lexer/text.go @@ -1,20 +1,9 @@ package lexer +import "strconv" import "github.com/sashakoshka/arf/file" -var escapeSequenceMap = map[rune] rune { - 'a': '\x07', - 'b': '\x08', - 'f': '\x0c', - 'n': '\x0a', - 'r': '\x0d', - 't': '\x09', - 'v': '\x0b', - '\'': '\'', - '"': '"', - '\\': '\\', -} - +// tokenizeString tokenizes a string or rune literal. func (lexer *LexingOperation) tokenizeString (isRuneLiteral bool) (err error) { err = lexer.nextRune() if err != nil { return } @@ -26,23 +15,18 @@ func (lexer *LexingOperation) tokenizeString (isRuneLiteral bool) (err error) { if lexer.char == '\\' { err = lexer.nextRune() if err != nil { return } - - actual, exists := escapeSequenceMap[lexer.char] - if exists { - got += string(actual) - } else { - err = file.NewError ( - lexer.file.Location(), 1, - "unknown escape character " + - string(lexer.char), file.ErrorKindError) - return - } + + var actual rune + actual, err = lexer.getEscapeSequence() + if err != nil { return } + + got += string(actual) } else { got += string(lexer.char) + + err = lexer.nextRune() + if err != nil { return } } - - err = lexer.nextRune() - if err != nil { return } if isRuneLiteral { if lexer.char == '\'' { break } @@ -75,3 +59,95 @@ func (lexer *LexingOperation) tokenizeString (isRuneLiteral bool) (err error) { lexer.addToken(token) return } + +// escapeSequenceMap contains basic escape sequences and how they map to actual +// runes. 
+var escapeSequenceMap = map[rune] rune {
+	'a': '\x07',
+	'b': '\x08',
+	'f': '\x0c',
+	'n': '\x0a',
+	'r': '\x0d',
+	't': '\x09',
+	'v': '\x0b',
+	'\'': '\'',
+	'"': '"',
+	'\\': '\\',
+}
+
+// getEscapeSequence reads an escape sequence in a string or rune literal. It
+// expects lexer.char to be the rune after the backslash and, on success,
+// leaves it on the first rune past the sequence. Accepted are the runes in
+// escapeSequenceMap, 3 digit octal, and hex as \xHH, \uHHHH and \UHHHHHHHH.
+func (lexer *LexingOperation) getEscapeSequence () (result rune, err error) {
+	result, exists := escapeSequenceMap[lexer.char]
+	if exists {
+		err = lexer.nextRune()
+		return
+	} else if lexer.char >= '0' && lexer.char <= '7' {
+		// octal escape sequence, exactly three digits long
+		number := string(lexer.char)
+		err = lexer.nextRune()
+		if err != nil { return }
+		for len(number) < 3 {
+			if lexer.char < '0' || lexer.char > '7' { break }
+			number += string(lexer.char)
+			err = lexer.nextRune()
+			if err != nil { return }
+		}
+		if len(number) < 3 {
+			err = file.NewError (
+				lexer.file.Location(), 1,
+				"octal escape sequence too short", file.ErrorKindError)
+			return
+		}
+
+		// parse as unsigned: a signed 8 bit parse clamps everything
+		// above \177 to 127. Three octal digits can reach 511, so a
+		// value that does not fit in a byte is reported as an error.
+		parsedNumber, parseErr := strconv.ParseUint(number, 8, 8)
+		if parseErr != nil {
+			err = file.NewError (
+				lexer.file.Location(), 1,
+				"octal escape sequence out of range", file.ErrorKindError)
+			return
+		}
+		result = rune(parsedNumber)
+	} else if lexer.char == 'x' || lexer.char == 'u' || lexer.char == 'U' {
+		// hexadecimal escape sequence: x, u and U take 2, 4 and 8 digits
+		want := 2
+		if lexer.char == 'u' { want = 4 }
+		if lexer.char == 'U' { want = 8 }
+		number := ""
+		err = lexer.nextRune()
+		if err != nil { return }
+		for len(number) < want {
+			notLower := lexer.char < 'a' || lexer.char > 'f'
+			notUpper := lexer.char < 'A' || lexer.char > 'F'
+			notNum := lexer.char < '0' || lexer.char > '9'
+			if notLower && notUpper && notNum { break }
+			number += string(lexer.char)
+			err = lexer.nextRune()
+			if err != nil { return }
+		}
+		if len(number) < want {
+			err = file.NewError (
+				lexer.file.Location(), 1,
+				"hex escape sequence too short ", file.ErrorKindError)
+			return
+		}
+
+		// parse as unsigned for the same reason; want digits always fit in
+		// want * 4 bits. NOTE(review): \U above U+10FFFF is not rejected.
+		parsedNumber, _ := strconv.ParseUint(number, 16, want * 4)
+		result = rune(parsedNumber)
+	} else {
+		err = file.NewError (
+			lexer.file.Location(), 1,
+			"unknown escape character " +
+			string(lexer.char), file.ErrorKindError)
+		return
+	}
+
+	return
+}
diff --git a/tests/lexer/text b/tests/lexer/text
index 3515bfc..be5b8c6 100644
--- a/tests/lexer/text
+++ b/tests/lexer/text
@@ -1,2 +1,3 @@
 "hello world!\a\b\f\n\r\t\v\'\"\\"
 '\a' '\b' '\f' '\n' '\r' '\t' '\v' '\'' '\"' '\\'
+"hello world \x40\u0040\U00000040!"