arf/parser/parser.go

/*
Package parser implements a parser for the ARF language. It contains an abstract
syntax tree (SyntaxTree), various tree nodes, and a function called Fetch that
returns a SyntaxTree for the module located at the given path. Internally, the
parser caches parsing results so Fetch may be called frequently.

Trees returned by this package can be expected to be internally consistent and
syntactically correct, but not semantically correct. Ensuring the semantic
integrity of ARF code is the job of the analyzer package.

This package automatically invokes the lexer before parsing module files.
*/
package parser

import "io"
import "os"
import "path/filepath"
import "git.tebibyte.media/arf/arf/file"
import "git.tebibyte.media/arf/arf/lexer"
import "git.tebibyte.media/arf/arf/infoerr"

// parsingOperation holds information about an ongoing parsing operation. One
// operation is created per call to Fetch that misses the cache, and is reused
// for every source file of the module so that all files are parsed into the
// same tree.
type parsingOperation struct {
	// modulePath is the path of the module directory being parsed. Fetch
	// makes sure it ends with a '/' so file names can be appended to it.
	modulePath string
	// token is the token the parser is currently positioned at.
	token      lexer.Token
	// tokens holds every token of the source file currently being parsed.
	tokens     []lexer.Token
	// tokenIndex is the index of token within tokens.
	tokenIndex int
	// skimming is set from the skim argument of Fetch.
	// NOTE(review): presumably makes the parser skip over detail it does
	// not need; the code that reads this is not visible here - confirm.
	skimming   bool

	// tree is the syntax tree being built up. Fetch returns it once all
	// files of the module have been parsed.
	tree SyntaxTree
}

// Fetch returns the parsed module located at the specified path as a
// SyntaxTree. If the module has not yet been parsed, it parses it first. If it
// has, it grabs it out of a cache. This function can be called frequently.
//
// modulePath must be an absolute path beginning with '/'; Fetch panics if it is
// empty or relative. Every non-directory entry of the module directory whose
// name ends in .arf is lexed and parsed into the same tree. The first error
// encountered while reading the directory, opening a file, or parsing is
// returned, and nothing is cached in that case.
//
// skim is passed on to the parsing operation. A cached tree that was produced
// with skim set only satisfies later calls that also set skim; a call with skim
// unset parses the module again and replaces the cached entry.
func Fetch (modulePath string, skim bool) (tree SyntaxTree, err error) {
	// checking the length first keeps an empty path from causing an index
	// out of range panic instead of the intended message
	if len(modulePath) == 0 || modulePath[0] != '/' {
		panic("module path did not begin at filesystem root")
	}

	// try to hit cache. a skimmed tree is only good enough when the caller
	// asked for a skimmed tree.
	cached, exists := cache[modulePath]
	if exists && (skim || !cached.skimmed) {
		tree = cached.tree
		return
	}

	// miss, so parse the module.
	parser := parsingOperation {
		modulePath: modulePath,
		skimming:   skim,
		tree: SyntaxTree {
			requires: make(map[string] string),
			sections: make(map[string] Section),
		},
	}

	// file names are appended directly to the module path, so it needs to
	// end with a separator
	if parser.modulePath[len(parser.modulePath) - 1] != '/' {
		parser.modulePath += "/"
	}

	var moduleFiles []os.DirEntry
	moduleFiles, err = os.ReadDir(parser.modulePath)
	if err != nil { return }

	for _, entry := range moduleFiles {
		if filepath.Ext(entry.Name()) != ".arf" || entry.IsDir() {
			continue
		}

		var sourceFile *file.File
		sourceFile, err = file.Open(parser.modulePath + entry.Name())
		if err != nil { return }

		// parse the tokens into the module. the parser reports running
		// out of tokens as io.EOF, which is not a failure here.
		err = parser.parse(sourceFile)
		if err == io.EOF { err = nil }
		if err != nil { return }
	}

	tree = parser.tree

	// cache tree, remembering whether it was skimmed so that a later
	// non-skimming call does not mistake it for a full parse
	cache[modulePath] = cacheItem {
		tree:    tree,
		skimmed: skim,
	}

	return
}

// parse lexes the given source file and parses the resulting tokens into the
// operation's syntax tree. A file that produces no tokens is silently skipped.
// Any error from the lexer, the meta section, or the body is returned as-is.
func (parser *parsingOperation) parse (sourceFile *file.File) (err error) {
	lexed, err := lexer.Tokenize(sourceFile)
	if err != nil || len(lexed) == 0 {
		// either lexing failed, or there is nothing to parse (in which
		// case err is nil)
		return err
	}

	// point the parser at the start of the new token list
	parser.tokens, parser.token, parser.tokenIndex = lexed, lexed[0], 0

	if err = parser.parseMeta(); err != nil {
		return err
	}

	return parser.parseBody()
}

// expect checks the current token against a list of allowed token kinds. It
// returns nil if the token matches any of them, or if the list is empty.
// Otherwise it returns an error located at the current token, with a message
// of the form "unexpected A token, expected B, C or D".
func (parser *parsingOperation) expect (allowed ...lexer.TokenKind) (err error) {
	// an empty list means that anything goes
	if len(allowed) == 0 {
		return nil
	}

	for _, kind := range allowed {
		if parser.token.Is(kind) {
			return nil
		}
	}

	// nothing matched, so describe what was found and what was wanted
	message := "unexpected " + parser.token.Kind().Describe() + " token, expected "

	last := len(allowed) - 1
	for index, kind := range allowed {
		switch {
		case index == 0:
			// the first item needs no separator
		case index == last:
			message += " or "
		default:
			message += ", "
		}

		message += kind.Describe()
	}

	return infoerr.NewError(
		parser.token.Location(),
		message, infoerr.ErrorKindError)
}

// nextToken moves the parser forward by one token, and then checks the new
// current token against the allowed kinds in the same way expect does. If
// there are no more tokens, it returns io.EOF and leaves the current token as
// it was (the token index is still incremented).
func (parser *parsingOperation) nextToken (allowed ...lexer.TokenKind) (err error) {
	parser.tokenIndex++

	if parser.tokenIndex < len(parser.tokens) {
		parser.token = parser.tokens[parser.tokenIndex]
		return parser.expect(allowed...)
	}

	return io.EOF
}

// previousToken moves the parser back by one token. The token index never goes
// below zero, so at the beginning of the token list the parser stays on the
// first token.
func (parser *parsingOperation) previousToken () {
	index := parser.tokenIndex - 1
	if index < 0 {
		index = 0
	}

	parser.tokenIndex = index
	parser.token = parser.tokens[index]
}

// skipIndentLevel advances the parser past every line indented at least as far
// as the specified indent. It stops on the first token after a newline that is
// either not an indent token, or is an indent token with a value smaller than
// indent. A newline never ends the skip while a brace, paren or bracket opened
// during the skip is still unclosed. Errors from nextToken (including io.EOF)
// are returned as-is.
func (parser *parsingOperation) skipIndentLevel (indent int) (err error) {
	// nesting depth of each kind of delimiter, counted from where the skip
	// started
	var braces, parens, brackets int

	for {
		if parser.token.Is(lexer.TokenKindNewline) {
			if err = parser.nextToken(); err != nil {
				return err
			}

			// a line with no indent token counts as dedented. the
			// token value is only read for indent tokens.
			dedented := true
			if parser.token.Is(lexer.TokenKindIndent) {
				dedented = parser.token.Value().(int) < indent
			}

			nested := braces >= 1 || parens >= 1 || brackets >= 1
			if dedented && !nested {
				return nil
			}
		}

		switch parser.token.Kind() {
		case lexer.TokenKindLBrace:
			braces++
		case lexer.TokenKindRBrace:
			braces--
		case lexer.TokenKindLParen:
			parens++
		case lexer.TokenKindRParen:
			parens--
		case lexer.TokenKindLBracket:
			brackets++
		case lexer.TokenKindRBracket:
			brackets--
		}

		if err = parser.nextToken(); err != nil {
			return err
		}
	}
}

// skipWhitespace advances the parser until the current token is neither an
// indent nor a newline. If the parser is not on such a token to begin with,
// it does not move. Errors from nextToken (including io.EOF) are returned
// as-is.
func (parser *parsingOperation) skipWhitespace () (err error) {
	for parser.token.Is(lexer.TokenKindIndent) ||
		parser.token.Is(lexer.TokenKindNewline) {

		if err = parser.nextToken(); err != nil {
			return err
		}
	}

	return nil
}