// rgbds/src/link/lexer.cpp

// SPDX-License-Identifier: MIT

#include "link/lexer.hpp"

#include <array>
#include <ctype.h>
#include <errno.h>
#include <fstream>
#include <inttypes.h>
#include <stdio.h>
#include <string_view>
#include <vector>

#include "helpers.hpp"
#include "itertools.hpp"
#include "util.hpp"

#include "link/warning.hpp"
// Include this last so it gets all type & constant definitions
#include "script.hpp" // For token definitions, generated from script.y

// One entry of the lexer's include stack: a linker script being read, and the position in it.
struct LexerStackEntry {
	std::filebuf file; // Buffer the script's characters are read from; not opened by the constructor
	std::string path;  // Path used to open the script, and reported in error messages
	uint32_t lineNo;   // Current line number; counting starts at 1

	// Takes ownership of `path_`'s contents (moved, not copied); the file is left closed.
	explicit LexerStackEntry(std::string &&path_) : file(), path(std::move(path_)), lineNo(1) {}
};

// Scripts currently being lexed; the one tokens are read from is at the back.
static std::vector<LexerStackEntry> lexerStack;

void lexer_Error(char const *fmt, ...) {
	LexerStackEntry &context = lexerStack.back();
	va_list args;
	va_start(args, fmt);
	scriptError(context.path.c_str(), context.lineNo, fmt, args);
	va_end(args);
}

// Pushes the linker script at `path` onto the lexer stack, so that it becomes the script at the
// back of the stack. If it cannot be opened, the new entry is discarded and an error is reported
// against the including script. Either way, the including script's line number is then incremented.
// There must already be at least one entry on the stack (the including script).
void lexer_IncludeFile(std::string &&path) {
	// `.emplace_back` can invalidate references to the stack's elements!
	// This is why `newContext` must be gotten before `prevContext`.
	LexerStackEntry &newContext = lexerStack.emplace_back(std::move(path));
	LexerStackEntry &prevContext = lexerStack[lexerStack.size() - 2];

	if (!newContext.file.open(newContext.path, std::ios_base::in)) {
		// Save `errno` right away: the calls below (notably destroying the stack entry)
		// may run library code that overwrites it before the message gets formatted.
		int const openErrno = errno;
		// `.pop_back()` will invalidate `newContext`, which is why `path` must be moved first.
		std::string badPath = std::move(newContext.path);
		lexerStack.pop_back();
		// This error will occur in `prevContext`, *before* incrementing the line number!
		lexer_Error(
		    "Failed to open included linker script \"%s\": %s",
		    badPath.c_str(),
		    strerror(openErrno)
		);
	}

	// `.pop_back()` cannot invalidate an unpopped reference, so `prevContext`
	// is still valid even if `.open()` failed.
	++prevContext.lineNo;
}

void lexer_IncLineNo() {
	++lexerStack.back().lineNo;
}

// Tells whether `c` is a blank that separates tokens within a line (space or tab).
static bool isWhiteSpace(int c) {
	switch (c) {
	case ' ':
	case '\t':
		return true;
	default:
		return false;
	}
}

// Tells whether `c` is one of the line-terminating characters (carriage return or line feed).
static bool isNewline(int c) {
	switch (c) {
	case '\r':
	case '\n':
		return true;
	default:
		return false;
	}
}

yy::parser::symbol_type yylex(); // Forward declaration for `yywrap`

// Decides what to return when the script on top of the lexer stack has run out of characters.
// The first time this happens for a given script, a newline token is returned instead; the next
// time, the script is popped and lexing resumes in the one below it, or, if it was the only
// script on the stack, the end-of-input token is returned.
static yy::parser::symbol_type yywrap() {
	// Whether the end-of-file newline was already injected for the script on top of the stack.
	static bool atEof = false;
	if (!atEof) {
		// Inject a newline at EOF to simplify parsing.
		atEof = true;
		return yy::parser::make_newline();
	}
	if (lexerStack.size() != 1) {
		lexerStack.pop_back();
		// The flag described the script that was just popped; the one being resumed has not
		// reached its own end yet, and must get its own injected newline when it does.
		atEof = false;
		return yylex();
	}
	return yy::parser::make_YYEOF();
}

// Tells whether `c` may appear in an identifier: an ASCII letter (either case) or decimal digit.
static bool isIdentChar(int c) {
	bool const isUpper = 'A' <= c && c <= 'Z';
	bool const isLower = 'a' <= c && c <= 'z';
	bool const isDigit = '0' <= c && c <= '9';
	return isUpper || isLower || isDigit;
}

// Reads the rest of an identifier whose first character `c` has already been consumed.
// Identifier characters are consumed from the current script; the first character that is not
// one is left unread. Returns the whole identifier, `c` included.
static std::string readIdent(int c) {
	std::filebuf &file = lexerStack.back().file;
	std::string name(1, static_cast<char>(c));
	int next = file.sgetc();
	while (isIdentChar(next)) {
		name.push_back(next);
		next = file.snextc(); // Consume the character just stored, and peek at the one after it
	}
	return name;
}

// Tells whether `c` is an ASCII decimal digit.
static bool isDecDigit(int c) {
	return !(c < '0' || c > '9');
}

// Lexes a decimal number whose first digit `c` has already been consumed.
// Further digits are read from the current script; underscores between them are consumed and
// ignored. The value is accumulated in a `uint32_t`, without any overflow check.
static yy::parser::symbol_type parseDecNumber(int c) {
	std::filebuf &file = lexerStack.back().file;
	uint32_t value = c - '0';
	for (;;) {
		int const next = file.sgetc();
		if (next == '_') {
			// Digit separator: skip it.
			file.sbumpc();
			continue;
		}
		if (!isDecDigit(next)) {
			break; // Leave the terminating character unread
		}
		value = value * 10 + (next - '0');
		file.sbumpc();
	}
	return yy::parser::make_number(value);
}

// Tells whether `c` is a binary digit (`0` or `1`).
static bool isBinDigit(int c) {
	return c == '0' || c == '1';
}

// Lexes the digits of a binary number, the radix prefix having already been consumed.
// `prefix` is only used in the error reported when no digit follows; in that case nothing is
// consumed and the number 0 is returned. Underscores after the first digit are ignored.
// The value is accumulated in a `uint32_t`, without any overflow check.
static yy::parser::symbol_type parseBinNumber(char const *prefix) {
	std::filebuf &file = lexerStack.back().file;
	int c = file.sgetc();
	if (!isBinDigit(c)) {
		lexer_Error("No binary digits found after '%s'", prefix);
		return yy::parser::make_number(0);
	}

	// The first pass through the loop handles the digit that was just checked.
	uint32_t value = 0;
	do {
		if (c != '_') {
			value = value * 2 + (c - '0');
		}
		file.sbumpc();
		c = file.sgetc();
	} while (isBinDigit(c) || c == '_');
	return yy::parser::make_number(value);
}

// Tells whether `c` is an octal digit (`0` through `7`).
static bool isOctDigit(int c) {
	return '0' <= c && c < '8';
}

// Lexes the digits of an octal number, the radix prefix having already been consumed.
// `prefix` is only used in the error reported when no digit follows; in that case nothing is
// consumed and the number 0 is returned. Underscores after the first digit are ignored.
// The value is accumulated in a `uint32_t`, without any overflow check.
static yy::parser::symbol_type parseOctNumber(char const *prefix) {
	std::filebuf &file = lexerStack.back().file;
	int const first = file.sgetc();
	if (!isOctDigit(first)) {
		lexer_Error("No octal digits found after '%s'", prefix);
		return yy::parser::make_number(0);
	}
	file.sbumpc();

	uint32_t value = first - '0';
	while (true) {
		int const next = file.sgetc();
		bool const isSeparator = next == '_';
		if (!isSeparator && !isOctDigit(next)) {
			break; // Leave the terminating character unread
		}
		if (!isSeparator) {
			value = value * 8 + (next - '0');
		}
		file.sbumpc();
	}
	return yy::parser::make_number(value);
}

// Tells whether `c` is a hexadecimal digit: `0`-`9`, `A`-`F`, or `a`-`f`.
static bool isHexDigit(int c) {
	if ('0' <= c && c <= '9') {
		return true;
	}
	if ('A' <= c && c <= 'F') {
		return true;
	}
	return 'a' <= c && c <= 'f';
}

// Converts a hexadecimal digit character (either letter case) to its value, 0 through 15.
// Passing anything other than a hexadecimal digit hits the `unreachable_()` marker.
static uint8_t parseHexDigit(int c) {
	if ('0' <= c && c <= '9') {
		return c - '0';
	}
	if ('A' <= c && c <= 'F') {
		return 10 + (c - 'A');
	}
	if ('a' <= c && c <= 'f') {
		return 10 + (c - 'a');
	}
	unreachable_(); // LCOV_EXCL_LINE
}

// Lexes the digits of a hexadecimal number, the radix prefix having already been consumed.
// `prefix` is only used in the error reported when no digit follows; in that case nothing is
// consumed and the number 0 is returned. Underscores after the first digit are ignored.
// The value is accumulated in a `uint32_t`, without any overflow check.
static yy::parser::symbol_type parseHexNumber(char const *prefix) {
	std::filebuf &file = lexerStack.back().file;
	int c = file.sgetc();
	if (!isHexDigit(c)) {
		lexer_Error("No hexadecimal digits found after '%s'", prefix);
		return yy::parser::make_number(0);
	}

	// `c` is always a peeked, not yet consumed, character at the top of the loop;
	// the first iteration handles the digit that was just checked.
	uint32_t value = 0;
	while (isHexDigit(c) || c == '_') {
		if (c != '_') {
			value = value * 16 + parseHexDigit(c);
		}
		file.sbumpc();
		c = file.sgetc();
	}
	return yy::parser::make_number(value);
}

// Lexes a number whose first character, the decimal digit `c`, has already been consumed.
// A leading `0` followed by `x`, `o` or `b` (in either case) selects hexadecimal, octal or
// binary respectively; anything else is lexed as a decimal number.
static yy::parser::symbol_type parseNumber(int c) {
	if (c != '0') {
		return parseDecNumber(c);
	}

	std::filebuf &file = lexerStack.back().file;
	int const radix = file.sgetc();
	// The radix letter is only consumed once it is recognized as such.
	if (radix == 'x' || radix == 'X') {
		file.sbumpc();
		return parseHexNumber(radix == 'x' ? "0x" : "0X");
	}
	if (radix == 'o' || radix == 'O') {
		file.sbumpc();
		return parseOctNumber(radix == 'o' ? "0o" : "0O");
	}
	if (radix == 'b' || radix == 'B') {
		file.sbumpc();
		return parseBinNumber(radix == 'b' ? "0b" : "0B");
	}
	return parseDecNumber(c);
}

// Lexes a string literal, the opening double quote having already been consumed.
// Characters are collected up to the closing double quote, which is consumed. The escapes
// `\n`, `\r`, `\t` and `\0` yield the matching control character, while `\\`, `\"` and `\'`
// yield the escaped character itself; any other escape is reported as an error, and the
// escaped character is kept as-is. Reaching a line terminator or the end of the file first
// reports an error and ends the string there, leaving that character unread.
static yy::parser::symbol_type parseString() {
	std::filebuf &file = lexerStack.back().file;
	std::string contents;
	for (;;) {
		int c = file.sgetc();
		if (c == '"') {
			// Closing quote: consume it, and we are done.
			file.sbumpc();
			break;
		}
		if (c == EOF || isNewline(c)) {
			lexer_Error("Unterminated string");
			break;
		}
		file.sbumpc();

		if (c == '\\') {
			// Look at the escaped character before deciding whether to consume it.
			c = file.sgetc();
			if (c == EOF || isNewline(c)) {
				lexer_Error("Unterminated string");
				break;
			}
			switch (c) {
			case 'n':
				c = '\n';
				break;
			case 'r':
				c = '\r';
				break;
			case 't':
				c = '\t';
				break;
			case '0':
				c = '\0';
				break;
			case '\\':
			case '"':
			case '\'':
				// These stand for themselves.
				break;
			default:
				lexer_Error("Cannot escape character %s", printChar(c));
				break;
			}
			file.sbumpc();
		}
		contents.push_back(c);
	}
	return yy::parser::make_string(std::move(contents));
}

// Associates a keyword's spelling with the function that creates its token.
struct Keyword {
	std::string_view name;                 // Spelling, in upper case
	yy::parser::symbol_type (*tokenGen)(); // Creates the corresponding token
};

using namespace std::literals;

// Keywords recognized by `yylex`. The table is only ever read, hence `const`.
static std::array const keywords{
    Keyword{"ORG"sv,      yy::parser::make_ORG     },
    Keyword{"FLOATING"sv, yy::parser::make_FLOATING},
    Keyword{"INCLUDE"sv,  yy::parser::make_INCLUDE },
    Keyword{"ALIGN"sv,    yy::parser::make_ALIGN   },
    Keyword{"DS"sv,       yy::parser::make_DS      },
    Keyword{"OPTIONAL"sv, yy::parser::make_OPTIONAL},
};

// Returns the next token of the script on top of the lexer stack.
// Leading blanks and a `;` comment are skipped first. Identifiers are compared against the
// section type names and then the keyword table, after upper-casing each of the identifier's
// characters. Unknown identifiers and unexpected characters are reported as errors, after which
// lexing carries on with whatever follows.
yy::parser::symbol_type yylex() {
	std::filebuf &file = lexerStack.back().file;

	// Consume characters up to and including the first one that is not a blank.
	int c;
	do {
		c = file.sbumpc();
	} while (isWhiteSpace(c));

	// A `;` starts a comment; discard it, stopping on the line terminator (or EOF) that ends it.
	if (c == ';') {
		do {
			c = file.sbumpc();
		} while (c != EOF && !isNewline(c));
	}

	// Tokens that can be told apart from their first character alone.
	switch (c) {
	case EOF:
		return yywrap();
	case ',':
		return yy::parser::make_COMMA();
	case '\r':
		// Handle CRLF: the pair counts as a single line terminator.
		if (file.sgetc() == '\n') {
			file.sbumpc();
		}
		[[fallthrough]];
	case '\n':
		return yy::parser::make_newline();
	case '"':
		return parseString();
	case '$':
		return parseHexNumber("$");
	case '%':
		return parseBinNumber("%");
	case '&':
		return parseOctNumber("&");
	}

	// Digits must be checked before identifier characters, since the latter include the former.
	if (isDecDigit(c)) {
		return parseNumber(c);
	}

	if (!isIdentChar(c)) {
		lexer_Error("Unexpected character %s", printChar(c));
		// Discard the rest of the line (but not its terminator), to avoid reporting too many errors.
		for (c = file.sgetc(); c != EOF && !isNewline(c); c = file.sgetc()) {
			file.sbumpc();
		}
		return yylex();
	}

	std::string word = readIdent(c);

	// Only the identifier's side is upper-cased; the reference names are compared as they are.
	auto matchesUpper = [](char got, char expected) { return toupper(got) == expected; };

	for (SectionType type : EnumSeq(SECTTYPE_INVALID)) {
		if (std::equal(RANGE(word), RANGE(sectionTypeInfo[type].name), matchesUpper)) {
			return yy::parser::make_sect_type(type);
		}
	}

	for (Keyword const &entry : keywords) {
		if (std::equal(RANGE(word), RANGE(entry.name), matchesUpper)) {
			return entry.tokenGen();
		}
	}

	lexer_Error("Unknown keyword \"%s\"", word.c_str());
	return yylex();
}

bool lexer_Init(char const *linkerScriptName) {
	if (LexerStackEntry &newContext = lexerStack.emplace_back(std::string(linkerScriptName));
	    !newContext.file.open(newContext.path, std::ios_base::in)) {
		error("Failed to open linker script \"%s\"", linkerScriptName);
		lexerStack.clear();
		return false;
	}
	return true;
}