arf/parser/parser.go

/*
Package parser implements a parser for the ARF language. It contains an abstract
syntax tree (SyntaxTree), various tree nodes, and a function called Fetch that
returns a SyntaxTree for the module located at the given path. Internally, the
parser caches parsing results so Fetch may be called frequently.

Trees returned by this package can be expected to be internally consistent and
syntactically correct, but not semantically correct. Ensuring the semantic
integrity of ARF code is the job of the analyzer package.

This package automatically invokes lexer before parsing module files.
*/
package parser

import "io"
import "os"
import "path/filepath"
import "git.tebibyte.media/arf/arf/file"
import "git.tebibyte.media/arf/arf/lexer"
import "git.tebibyte.media/arf/arf/infoerr"

// parsingOperation holds information about an ongoing parsing operation. One
// parsingOperation is created per call to Fetch that misses the cache, and it
// is reused for every source file in the module: parse replaces the token
// state for each file, while tree accumulates across all of them.
type parsingOperation struct {
	// modulePath is the path of the module directory being parsed. Fetch
	// appends a trailing slash to it if one is missing.
	modulePath string
	// token is the token the parser is currently positioned on.
	token      lexer.Token
	// tokens holds every token of the file currently being parsed, as
	// returned by the lexer.
	tokens     []lexer.Token
	// tokenIndex is the index of token within tokens.
	tokenIndex int
	// skimming is set from the skim argument of Fetch.
	// NOTE(review): presumably tells the section parsers to skip over
	// details such as initialization values - confirm against the code that
	// reads this field.
	skimming   bool

	// tree is the syntax tree that the parsed files are added to. Fetch
	// returns it once all files in the module have been parsed.
	tree SyntaxTree
}

// Fetch returns the parsed module located at the specified path as a
// SyntaxTree. If the module has not yet been parsed, it parses it first. If it
// has, it grabs it out of a cache. This function can be called frequently.
//
// modulePath must be an absolute path (beginning with '/') to the module
// directory. Fetch panics if it is empty or does not begin at the filesystem
// root. Every non-directory entry directly inside that directory whose name
// ends in .arf is opened and parsed into the same tree.
//
// If skim is true, the parser is put into skimming mode and the result is
// cached as a skimmed tree. A cached skimmed tree satisfies later requests
// that also skim, but a request with skim set to false causes the module to be
// parsed again. A cached tree that was not skimmed satisfies both kinds of
// request.
//
// If the directory cannot be read, or a file cannot be opened, or parsing a
// file fails with anything other than io.EOF, Fetch stops at once, returns that
// error along with an empty tree, and caches nothing. If parsing the last file
// ends with io.EOF, the completed tree is returned and cached, and io.EOF is
// passed on to the caller as the error.
func Fetch(modulePath string, skim bool) (tree SyntaxTree, err error) {
	// the length check makes an empty path fail with the message below
	// instead of an index out of range panic.
	if len(modulePath) == 0 || modulePath[0] != '/' {
		panic("module path did not begin at filesystem root")
	}

	// try to hit cache. a skimmed tree is not good enough for a caller that
	// asked for a full parse, so that case counts as a miss.
	cached, exists := cache[modulePath]
	if exists && (skim || !cached.skimmed) {
		return cached.tree, nil
	}

	// miss, so parse the module.
	parser := parsingOperation{
		modulePath: modulePath,
		skimming:   skim,
		tree: SyntaxTree{
			requires: make(map[string]string),
			sections: make(map[string]Section),
		},
	}

	if parser.modulePath[len(parser.modulePath)-1] != '/' {
		parser.modulePath += "/"
	}

	var moduleFiles []os.DirEntry
	moduleFiles, err = os.ReadDir(parser.modulePath)
	if err != nil {
		return SyntaxTree{}, err
	}

	for _, entry := range moduleFiles {
		if filepath.Ext(entry.Name()) != ".arf" || entry.IsDir() {
			continue
		}

		var sourceFile *file.File
		sourceFile, err = file.Open(parser.modulePath + entry.Name())
		if err != nil {
			return SyntaxTree{}, err
		}

		// parse the tokens into the module. a real failure must stop the
		// whole operation here, otherwise the next file would overwrite
		// the error and a broken tree would end up in the cache.
		//
		// NOTE(review): nextToken reports running out of tokens as
		// io.EOF, so io.EOF coming out of parse is assumed to mean the
		// end of the file was reached rather than a failure - confirm
		// against parseMeta and parseBody.
		err = parser.parse(sourceFile)
		if err != nil && err != io.EOF {
			return SyntaxTree{}, err
		}
	}

	tree = parser.tree

	// cache tree, remembering whether it was skimmed so that a later full
	// request knows it has to parse the module again.
	cache[modulePath] = cacheItem{
		tree:    tree,
		skimmed: skim,
	}

	return tree, err
}

// parse runs the lexer over a single source file, points the parser at the
// resulting tokens, and then parses the file's metadata followed by its body,
// adding the file to the syntax tree. A file that produces no tokens is
// skipped without error. Any error from the lexer, parseMeta or parseBody is
// returned as is.
func (parser *parsingOperation) parse(sourceFile *file.File) (err error) {
	fileTokens, err := lexer.Tokenize(sourceFile)
	if err != nil {
		return err
	}

	// nothing to parse
	if len(fileTokens) == 0 {
		return nil
	}

	// reset the parser so that it sits on the first token of this file
	parser.tokens = fileTokens
	parser.token = fileTokens[0]
	parser.tokenIndex = 0

	if err = parser.parseMeta(); err != nil {
		return err
	}

	return parser.parseBody()
}

// expect checks the current token against a list of allowed token kinds. If
// the token matches any of them, or if no kinds were given at all, nil is
// returned. Otherwise the result is an error located at the current token that
// names the kind that was found and lists the kinds that were allowed.
func (parser *parsingOperation) expect(allowed ...lexer.TokenKind) (err error) {
	if len(allowed) == 0 {
		return nil
	}

	for _, candidate := range allowed {
		if parser.token.Is(candidate) {
			return nil
		}
	}

	message := "unexpected " + parser.token.Kind().Describe() + " token, expected "

	// list the allowed kinds in the form of "a, b or c"
	lastIndex := len(allowed) - 1
	for index, candidate := range allowed {
		switch {
		case index == 0:
			// the first item needs no separator
		case index == lastIndex:
			message += " or "
		default:
			message += ", "
		}

		message += candidate.Describe()
	}

	return infoerr.NewError(
		parser.token.Location(),
		message,
		infoerr.ErrorKindError)
}

// nextToken moves the parser forward by one token and then behaves like
// expect, checking the new current token against the allowed kinds. If there
// are no more tokens, it returns io.EOF and leaves the current token as it
// was. The token index is advanced in either case.
func (parser *parsingOperation) nextToken(allowed ...lexer.TokenKind) (err error) {
	parser.tokenIndex++

	if parser.tokenIndex < len(parser.tokens) {
		parser.token = parser.tokens[parser.tokenIndex]
		return parser.expect(allowed...)
	}

	return io.EOF
}

// previousToken moves the parser back by one token. If the parser is already
// on the first token, it stays there.
func (parser *parsingOperation) previousToken() {
	if parser.tokenIndex > 0 {
		parser.tokenIndex--
	} else {
		// already at (or somehow before) the beginning
		parser.tokenIndex = 0
	}

	parser.token = parser.tokens[parser.tokenIndex]
}

// skipIndentLevel advances the parser, ignoring every line with an indentation
// equal to or greater than the specified indent. It stops on the first token
// of a line that either does not begin with an indent token or is indented by
// less than indent, but only if no brace, paren or bracket that was opened
// while skipping is still open. Any error from nextToken, including io.EOF
// when the tokens run out, is returned as is.
func (parser *parsingOperation) skipIndentLevel(indent int) (err error) {
	// how many of each kind of delimiter are currently open
	var braces, parens, brackets int

	for {
		if parser.token.Is(lexer.TokenKindNewline) {
			// look at whatever begins the next line
			if err = parser.nextToken(); err != nil {
				return err
			}

			// a line with no indent token at all counts as being
			// below the indent level
			belowLevel := true
			if parser.token.Is(lexer.TokenKindIndent) {
				belowLevel = parser.token.Value().(int) < indent
			}

			insideDelimiters := braces >= 1 || parens >= 1 || brackets >= 1
			if belowLevel && !insideDelimiters {
				return nil
			}
		}

		switch parser.token.Kind() {
		case lexer.TokenKindLBrace:
			braces++
		case lexer.TokenKindRBrace:
			braces--
		case lexer.TokenKindLParen:
			parens++
		case lexer.TokenKindRParen:
			parens--
		case lexer.TokenKindLBracket:
			brackets++
		case lexer.TokenKindRBracket:
			brackets--
		}

		if err = parser.nextToken(); err != nil {
			return err
		}
	}
}

// skipWhitespace advances the parser until the current token is neither an
// indent nor a newline. If the current token is already something else, the
// parser does not move. Any error from nextToken, including io.EOF when the
// tokens run out, is returned as is.
func (parser *parsingOperation) skipWhitespace() (err error) {
	for parser.token.Is(lexer.TokenKindIndent) ||
		parser.token.Is(lexer.TokenKindNewline) {

		if err = parser.nextToken(); err != nil {
			return err
		}
	}

	return nil
}
Made documentation a bit better 2022-10-11 22:48:55 -06:00			`/*`
			`Package parser implements a parser for the ARF language. It contains an abstract`
			`syntax tree (SyntaxTree), various tree nodes, and a function called Fetch that`
			`returns a SyntaxTree for the module located at the given path. Internally, the`
			`parser caches parsing results so Fetch may be called frequently.`

			`Trees returned by this package can be expected to be internally consistent and`
			`syntactically correct, but not semantically correct. Ensuring the semantic`
			`integrity of ARF code is the job of the analyzer package.`

			`This package automatically invokes lexer before parsing module files.`
			`*/`
Created base for parser The parser now handles file opening and invokes the lexer. 2022-08-12 09:11:43 -06:00			`package parser`

Added expect and nextToken methods to parser 2022-08-12 12:33:21 -06:00			`import "io"`
Created base for parser The parser now handles file opening and invokes the lexer. 2022-08-12 09:11:43 -06:00			`import "os"`
			`import "path/filepath"`
Fixed import paths 2022-08-29 23:11:10 -06:00			`import "git.tebibyte.media/arf/arf/file"`
			`import "git.tebibyte.media/arf/arf/lexer"`
			`import "git.tebibyte.media/arf/arf/infoerr"`
Created base for parser The parser now handles file opening and invokes the lexer. 2022-08-12 09:11:43 -06:00
Privated parsingOperation 2022-10-11 21:57:27 -06:00			`// parsingOperation holds information about an ongoing parsing operation.`
			`type parsingOperation struct {`
Created base for parser The parser now handles file opening and invokes the lexer. 2022-08-12 09:11:43 -06:00			`modulePath string`
			`token lexer.Token`
			`tokens []lexer.Token`
			`tokenIndex int`
Data section initialization values are now skimmed over 2022-09-05 13:04:39 -06:00			`skimming bool`
Created basic test for parser 2022-08-12 10:55:17 -06:00
The syntax tree is no longer passed by reference 2022-09-04 20:30:14 -06:00			`tree SyntaxTree`
Created base for parser The parser now handles file opening and invokes the lexer. 2022-08-12 09:11:43 -06:00			`}`

Made documentation a bit better 2022-10-11 22:48:55 -06:00			`// Fetch returns the parsed module located at the specified path as a`
			`// SyntaxTree. If the module has not yet been parsed, it parses it first. If it`
			`// has, it grabs it out of a cache. This function can be called frequently.`
Added skim boolean that does nothing 2022-09-05 11:46:10 -06:00			`func Fetch (modulePath string, skim bool) (tree SyntaxTree, err error) {`
Parse -> Fetch, now tries to hit cache 2022-09-05 11:31:38 -06:00			`if modulePath[0] != '/' {`
			`panic("module path did not begin at filesystem root")`
			`}`

			`// try to hit cache`
			`cached, exists := cache[modulePath]`
Added skim boolean that does nothing 2022-09-05 11:46:10 -06:00			`if exists && !(!skim && cached.skimmed){`
Parse -> Fetch, now tries to hit cache 2022-09-05 11:31:38 -06:00			`tree = cached.tree`
			`return`
			`}`

			`// miss, so parse the module.`
Privated parsingOperation 2022-10-11 21:57:27 -06:00			`parser := parsingOperation {`
The section kind specific maps are gone I've REPLACED IT with the unified sections map. Interfaces, baby! 2022-09-04 17:30:59 -06:00			`modulePath: modulePath,`
Data section initialization values are now skimmed over 2022-09-05 13:04:39 -06:00			`skimming: skim,`
The syntax tree is no longer passed by reference 2022-09-04 20:30:14 -06:00			`tree: SyntaxTree {`
Syntax tree now stores map of require names -> full paths 2022-09-07 15:12:46 -06:00			`requires: make(map[string] string),`
The section kind specific maps are gone I've REPLACED IT with the unified sections map. Interfaces, baby! 2022-09-04 17:30:59 -06:00			`sections: make(map[string] Section),`
			`},`
			`}`
Created basic test for parser 2022-08-12 10:55:17 -06:00
Created base for parser The parser now handles file opening and invokes the lexer. 2022-08-12 09:11:43 -06:00			`if parser.modulePath[len(parser.modulePath) - 1] != '/' {`
			`parser.modulePath += "/"`
			`}`

			`var moduleFiles []os.DirEntry`
			`moduleFiles, err = os.ReadDir(parser.modulePath)`
			`if err != nil { return }`

			`for _, entry := range moduleFiles {`
			`if filepath.Ext(entry.Name()) != ".arf" \|\| entry.IsDir() {`
			`continue`
			`}`

			`var sourceFile *file.File`
			`sourceFile, err = file.Open(parser.modulePath + entry.Name())`
			`if err != nil { return }`

Parser now understands the separation between files This needs to be done because each file has a metadata section at the top. 2022-08-12 11:02:20 -06:00			`// parse the tokens into the module`
			`err = parser.parse(sourceFile)`
Created base for parser The parser now handles file opening and invokes the lexer. 2022-08-12 09:11:43 -06:00			`}`
Parser now understands the separation between files This needs to be done because each file has a metadata section at the top. 2022-08-12 11:02:20 -06:00
			`tree = parser.tree`
Parse -> Fetch, now tries to hit cache 2022-09-05 11:31:38 -06:00
			`// cache tree`
			`cache[modulePath] = cacheItem {`
			`tree: tree,`
			`skimmed: false,`
			`}`

Parser now understands the separation between files This needs to be done because each file has a metadata section at the top. 2022-08-12 11:02:20 -06:00			`return`
			`}`

			`// parse parses a file and adds it to the syntax tree.`
Privated parsingOperation 2022-10-11 21:57:27 -06:00			`func (parser parsingOperation) parse (sourceFile file.File) (err error) {`
Parser now understands the separation between files This needs to be done because each file has a metadata section at the top. 2022-08-12 11:02:20 -06:00			`var tokens []lexer.Token`
			`tokens, err = lexer.Tokenize(sourceFile)`
			`if err != nil { return }`

Added expect and nextToken methods to parser 2022-08-12 12:33:21 -06:00			`// reset the parser`
Parser now understands the separation between files This needs to be done because each file has a metadata section at the top. 2022-08-12 11:02:20 -06:00			`if len(tokens) == 0 { return }`
			`parser.tokens = tokens`
			`parser.token = tokens[0]`
			`parser.tokenIndex = 0`

Added expect and nextToken methods to parser 2022-08-12 12:33:21 -06:00			`err = parser.parseMeta()`
			`if err != nil { return }`

Added base parse body function 2022-08-14 20:38:57 -06:00			`err = parser.parseBody()`
			`if err != nil { return }`

Added expect and nextToken methods to parser 2022-08-12 12:33:21 -06:00			`return`
			`}`

			`// expect takes in a list of allowed token kinds, and returns an error if the`
			`// current token isn't one of them. If the length of allowed is zero, this`
			`// function will not return an error.`
Privated parsingOperation 2022-10-11 21:57:27 -06:00			`func (parser *parsingOperation) expect (allowed ...lexer.TokenKind) (err error) {`
Added expect and nextToken methods to parser 2022-08-12 12:33:21 -06:00			`if len(allowed) == 0 { return }`

			`for _, kind := range allowed {`
			`if parser.token.Is(kind) { return }`
			`}`

Parser can now print out a list of expected token kinds 2022-08-12 16:09:37 -06:00			`message :=`
			`"unexpected " + parser.token.Kind().Describe() +`
			`" token, expected "`

			`for index, allowedItem := range allowed {`
			`if index > 0 {`
			`if index == len(allowed) - 1 {`
			`message += " or "`
			`} else {`
			`message += ", "`
			`}`
			`}`

			`message += allowedItem.Describe()`
			`}`

Replaced references to file.Error with infoerr.Error 2022-08-17 22:58:40 -06:00			`err = infoerr.NewError (`
Added metadata parser 2022-08-12 15:22:51 -06:00			`parser.token.Location(),`
Replaced references to file.Error with infoerr.Error 2022-08-17 22:58:40 -06:00			`message, infoerr.ErrorKindError)`
Added expect and nextToken methods to parser 2022-08-12 12:33:21 -06:00			`return`
			`}`

			`// nextToken is the same as expect, but it advances to the next token first.`
Privated parsingOperation 2022-10-11 21:57:27 -06:00			`func (parser *parsingOperation) nextToken (allowed ...lexer.TokenKind) (err error) {`
Added expect and nextToken methods to parser 2022-08-12 12:33:21 -06:00			`parser.tokenIndex ++`
			`if parser.tokenIndex >= len(parser.tokens) { return io.EOF }`
			`parser.token = parser.tokens[parser.tokenIndex]`
Added metadata parser 2022-08-12 15:22:51 -06:00
			`err = parser.expect(allowed...)`
Created base for parser The parser now handles file opening and invokes the lexer. 2022-08-12 09:11:43 -06:00			`return`
			`}`
Added previousToken method to parser 2022-08-17 10:39:26 -06:00
			`// previousToken goes back one token. If the parser is already at the beginning,`
			`// this does nothing.`
Privated parsingOperation 2022-10-11 21:57:27 -06:00			`func (parser *parsingOperation) previousToken () {`
Added previousToken method to parser 2022-08-17 10:39:26 -06:00			`parser.tokenIndex --`
			`if parser.tokenIndex < 0 { parser.tokenIndex = 0 }`
			`parser.token = parser.tokens[parser.tokenIndex]`
			`return`
			`}`
Add skipIndentLevel function 2022-09-05 12:56:35 -06:00
			`// skipIndentLevel advances the parser, ignoring every line with an indentation`
			`// equal to or greater than the specified indent.`
Privated parsingOperation 2022-10-11 21:57:27 -06:00			`func (parser *parsingOperation) skipIndentLevel (indent int) (err error) {`
Pass skim test 2022-09-29 09:15:58 -06:00			`braceLevel := 0`
			`parenLevel := 0`
			`bracketLevel := 0`

Add skipIndentLevel function 2022-09-05 12:56:35 -06:00			`for {`
Data section initialization values are now skimmed over 2022-09-05 13:04:39 -06:00			`if parser.token.Is(lexer.TokenKindNewline) {`
			`err = parser.nextToken()`
			`if err != nil { return }`
Add skipIndentLevel function 2022-09-05 12:56:35 -06:00
Pass skim test 2022-09-29 09:15:58 -06:00			`shouldBreak :=`
			`!parser.token.Is(lexer.TokenKindIndent) \|\|`
			`parser.token.Value().(int) < indent`

			`shouldBreak =`
			`shouldBreak &&`
			`braceLevel < 1 &&`
			`parenLevel < 1 &&`
			`bracketLevel < 1`

			`if shouldBreak { return }`
			`}`
Add skipIndentLevel function 2022-09-05 12:56:35 -06:00
Pass skim test 2022-09-29 09:15:58 -06:00			`switch parser.token.Kind() {`
			`case lexer.TokenKindLBrace: braceLevel ++`
			`case lexer.TokenKindRBrace: braceLevel --`
			`case lexer.TokenKindLParen: parenLevel ++`
			`case lexer.TokenKindRParen: parenLevel --`
			`case lexer.TokenKindLBracket: bracketLevel ++`
			`case lexer.TokenKindRBracket: bracketLevel --`
Add skipIndentLevel function 2022-09-05 12:56:35 -06:00			`}`
Data section initialization values are now skimmed over 2022-09-05 13:04:39 -06:00
			`err = parser.nextToken()`
			`if err != nil { return }`
Add skipIndentLevel function 2022-09-05 12:56:35 -06:00			`}`
			`}`
Fixed the hanging thing 2022-09-16 10:27:13 -06:00
			`// skipWhitespace skips over newlines and indent tokens.`
Privated parsingOperation 2022-10-11 21:57:27 -06:00			`func (parser *parsingOperation) skipWhitespace () (err error) {`
Fixed the hanging thing 2022-09-16 10:27:13 -06:00			`for {`
			`isWhitespace :=`
			`parser.token.Is(lexer.TokenKindIndent) \|\|`
			`parser.token.Is(lexer.TokenKindNewline)`

			`if !isWhitespace {`
			`break`
			`}`

			`err = parser.nextToken()`
			`if err != nil { return }`
			`}`

			`return`
			`}`