Support more syntax in linkerscripts (#1752)

* No need to use `locale`s * Implement octal numbers, `_` digit separators, and `0x/0b/0o` prefixes in linkerscripts * Refactor some functions out of `yylex` * Support `\0` in linkerscripts
2026-03-03 11:33:04 +00:00 · 2025-07-16 15:00:02 -04:00
parent cf6e5fec63
commit 7f24d46d44
7 changed files with 209 additions and 109 deletions
--- a/man/rgblink.5
+++ b/man/rgblink.5
@@ -24,18 +24,20 @@ They are simply ignored.
 .Pp
 Keywords are composed of letters and digits (but they can't start with a digit); they are all case-insensitive.
 .Pp
-Numbers can be written in decimal format, or in binary using the
+Numbers can be written in several formats.
-.Ql %
+.Bl -column -offset indent "Hexadecimal" "Possible prefixes"
-prefix, or in hexadecimal using the
+.It Sy Format type Ta Sy Possible prefixes Ta Sy Accepted characters
-.Ql $
+.It Decimal Ta none Ta 0123456789
-prefix (hexadecimal digits are case-insensitive).
+.It Hexadecimal Ta Li $ , 0x , 0X Ta 0123456789ABCDEF
-Note that unlike
+.It Octal Ta Li & , 0o , 0O Ta 01234567
-.Xr rgbasm 5 ,
+.It Binary Ta Li % , 0b , 0B Ta 01
-an octal
+.El
-.Ql &
+.Pp
-prefix is not supported, nor are
+Underscores are also accepted in numbers, except at the beginning of one.
-.Ql _
+This can be useful for grouping digits, like
-digit separators.
+.Ql 1_234
 or
 .Ql $ff_80 .
 .Pp
 Strings begin with a double quote, and end at the next (non-escaped) double quote.
 Strings must not contain literal newline characters.
@@ -46,8 +48,9 @@ are supported, specifically
 .Ql \e" ,
 .Ql \en ,
 .Ql \er ,
 .Ql \et ,
 and
-.Ql \et .
+.Ql \e0 .
 Other backslash escape sequences in
 .Xr rgbasm 5
 are only relevant to assembly code and do not apply in linker scripts.
--- a/src/link/script.y
+++ b/src/link/script.y
@@ -17,9 +17,9 @@
 	#include <algorithm>
 	#include <array>
 	#include <bit>
 	#include <ctype.h>
 	#include <fstream>
 	#include <inttypes.h>
 	#include <locale>
 	#include <stdio.h>
 	#include <string_view>
 	#include <vector>
@@ -209,18 +209,100 @@ static bool isNewline(int c) {
 	return c == '\r' || c == '\n';
 }
 // Decides which token to produce once the current lexer context has hit EOF.
 // - The first time (while `atEof` is still unset), `atEof` is set and a newline token is returned.
 // - Afterwards, if more than one context is on `lexerStack`, the top one is popped and lexing
 //   continues through `yylex()`.
 // - Otherwise (only one context is left), the end-of-input token is returned.
 static yy::parser::symbol_type yywrap() {
 	if (!atEof) {
 		// Inject a newline at EOF to simplify parsing.
 		atEof = true;
 		return yy::parser::make_newline();
 	}
 	if (lexerStack.size() == 1) {
 		return yy::parser::make_YYEOF();
 	}
 	lexerStack.pop_back();
 	return yylex();
 }
 // Returns whether `c` is an ASCII letter (either case) or an ASCII decimal digit.
 // Any other value (including underscore and EOF) yields false.
 static bool isIdentChar(int c) {
 	if (c >= '0' && c <= '9') {
 		return true;
 	}
 	if (c >= 'a' && c <= 'z') {
 		return true;
 	}
 	return c >= 'A' && c <= 'Z';
 }
 // Collects an identifier whose first character `c` has already been consumed by the caller.
 // Further characters are taken from the current lexer context for as long as they satisfy
 // `isIdentChar`; the first character that does not is left unconsumed.
 static std::string readIdent(int c) {
 	auto &context = lexerStack.back();
 	std::string name(1, static_cast<char>(c));
 	c = context.file.sgetc();
 	while (isIdentChar(c)) {
 		name.push_back(c);
 		// `snextc` advances past the character just stored, then peeks at the following one.
 		c = context.file.snextc();
 	}
 	return name;
 }
 // Returns whether `c` is one of the ASCII characters '0' through '9'.
 static bool isDecDigit(int c) {
 	return !(c < '0' || c > '9');
 }
 // Lexes a decimal number whose first digit `c` has already been consumed by the caller.
 // Following decimal digits are accumulated; underscores are consumed but contribute nothing.
 // The value is accumulated in a `uint32_t` with no overflow check.
 static yy::parser::symbol_type parseDecNumber(int c) {
 	auto &context = lexerStack.back();
 	uint32_t value = c - '0';
 	for (;;) {
 		auto next = context.file.sgetc();
 		if (next == '_') {
 			// Digit separator: skip it.
 			context.file.sbumpc();
 			continue;
 		}
 		if (!isDecDigit(next)) {
 			break;
 		}
 		value = value * 10 + (next - '0');
 		context.file.sbumpc();
 	}
 	return yy::parser::make_number(value);
 }
 // Returns whether `c` is the ASCII character '0' or '1'.
 static bool isBinDigit(int c) {
 	return c == '0' || c == '1';
 }
 // Lexes a binary number; the caller has already consumed the prefix, which is passed as
 // `prefix` only so it can be quoted in the error message.
 // If the next character is not a binary digit, an error is reported and the number 0 is returned
 // without consuming that character. Otherwise, binary digits are accumulated, and underscores
 // after the first digit are consumed but contribute nothing.
 // The value is accumulated in a `uint32_t` with no overflow check.
 static yy::parser::symbol_type parseBinNumber(char const *prefix) {
 	auto &context = lexerStack.back();
 	auto digit = context.file.sgetc();
 	if (!isBinDigit(digit)) {
 		scriptError(context, "No binary digits found after '%s'", prefix);
 		return yy::parser::make_number(0);
 	}
 	uint32_t value = 0;
 	do {
 		if (digit != '_') {
 			value = value * 2 + (digit - '0');
 		}
 		context.file.sbumpc();
 		digit = context.file.sgetc();
 	} while (isBinDigit(digit) || digit == '_');
 	return yy::parser::make_number(value);
 }
 // Returns whether `c` is one of the ASCII characters '0' through '7'.
 static bool isOctDigit(int c) {
 	if (c < '0') {
 		return false;
 	}
 	return c <= '7';
 }
 // Lexes an octal number; the caller has already consumed the prefix, which is passed as
 // `prefix` only so it can be quoted in the error message.
 // If the next character is not an octal digit, an error is reported and the number 0 is returned
 // without consuming that character. Otherwise, octal digits are accumulated, and underscores
 // after the first digit are consumed but contribute nothing.
 // The value is accumulated in a `uint32_t` with no overflow check.
 static yy::parser::symbol_type parseOctNumber(char const *prefix) {
 	auto &context = lexerStack.back();
 	auto digit = context.file.sgetc();
 	if (!isOctDigit(digit)) {
 		scriptError(context, "No octal digits found after '%s'", prefix);
 		return yy::parser::make_number(0);
 	}
 	uint32_t value = 0;
 	do {
 		if (digit != '_') {
 			value = value * 8 + (digit - '0');
 		}
 		context.file.sbumpc();
 		digit = context.file.sgetc();
 	} while (isOctDigit(digit) || digit == '_');
 	return yy::parser::make_number(value);
 }
 // Returns whether `c` is an ASCII hexadecimal digit: '0'-'9', 'A'-'F', or 'a'-'f'.
 static bool isHexDigit(int c) {
 	if (c >= '0' && c <= '9') {
 		return true;
 	}
 	if (c >= 'a' && c <= 'f') {
 		return true;
 	}
 	return c >= 'A' && c <= 'F';
 }
@@ -237,6 +319,88 @@ static uint8_t parseHexDigit(int c) {
 	}
 }
 // Lexes a hexadecimal number; the caller has already consumed the prefix, which is passed as
 // `prefix` only so it can be quoted in the error message.
 // If the next character is not a hexadecimal digit, an error is reported and the number 0 is
 // returned without consuming that character. Otherwise, digits are converted with
 // `parseHexDigit` and accumulated, and underscores after the first digit are consumed but
 // contribute nothing.
 // The value is accumulated in a `uint32_t` with no overflow check.
 static yy::parser::symbol_type parseHexNumber(char const *prefix) {
 	auto &context = lexerStack.back();
 	auto digit = context.file.sgetc();
 	if (!isHexDigit(digit)) {
 		scriptError(context, "No hexadecimal digits found after '%s'", prefix);
 		return yy::parser::make_number(0);
 	}
 	uint32_t value = 0;
 	do {
 		if (digit != '_') {
 			value = value * 16 + parseHexDigit(digit);
 		}
 		context.file.sbumpc();
 		digit = context.file.sgetc();
 	} while (isHexDigit(digit) || digit == '_');
 	return yy::parser::make_number(value);
 }
 // Lexes a number whose first character `c` (a decimal digit) has already been consumed.
 // A leading '0' followed by x/X, o/O, or b/B selects hexadecimal, octal, or binary respectively;
 // the letter is consumed and the matching parser is given the exact prefix as written, for use
 // in its error message. Anything else is lexed as a decimal number starting with `c`.
 static yy::parser::symbol_type parseNumber(int c) {
 	if (c != '0') {
 		return parseDecNumber(c);
 	}
 	auto &context = lexerStack.back();
 	auto marker = context.file.sgetc();
 	if (marker == 'x' || marker == 'X') {
 		context.file.sbumpc();
 		return parseHexNumber(marker == 'x' ? "0x" : "0X");
 	}
 	if (marker == 'o' || marker == 'O') {
 		context.file.sbumpc();
 		return parseOctNumber(marker == 'o' ? "0o" : "0O");
 	}
 	if (marker == 'b' || marker == 'B') {
 		context.file.sbumpc();
 		return parseBinNumber(marker == 'b' ? "0b" : "0B");
 	}
 	// Not a recognized prefix: the peeked character is left for the decimal parser.
 	return parseDecNumber(c);
 }
 // Lexes the body of a string literal; the caller has already consumed the opening double quote.
 // Characters are collected until the closing double quote, which is consumed.
 // Hitting EOF or a newline first (including right after a backslash) reports
 // "Unterminated string" and stops, leaving that character unconsumed.
 // Recognized escapes: \n, \r, \t, \0, and \\ \" \' (which stand for themselves).
 // Any other escaped character reports an error and is appended as-is.
 static yy::parser::symbol_type parseString() {
 	auto &context = lexerStack.back();
 	std::string contents;
 	auto ch = context.file.sgetc();
 	while (ch != '"') {
 		if (ch == EOF || isNewline(ch)) {
 			scriptError(context, "Unterminated string");
 			break;
 		}
 		context.file.sbumpc();
 		if (ch == '\\') {
 			ch = context.file.sgetc();
 			if (ch == EOF || isNewline(ch)) {
 				scriptError(context, "Unterminated string");
 				break;
 			}
 			switch (ch) {
 			case 'n':
 				ch = '\n';
 				break;
 			case 'r':
 				ch = '\r';
 				break;
 			case 't':
 				ch = '\t';
 				break;
 			case '0':
 				ch = '\0';
 				break;
 			case '\\':
 			case '"':
 			case '\'':
 				// These escape to themselves.
 				break;
 			default:
 				scriptError(context, "Cannot escape character %s", printChar(ch));
 				break;
 			}
 			context.file.sbumpc();
 		}
 		contents.push_back(ch);
 		ch = context.file.sgetc();
 	}
 	if (ch == '"') {
 		// Consume the closing quote.
 		context.file.sbumpc();
 	}
 	return yy::parser::make_string(std::move(contents));
 }
 yy::parser::symbol_type yylex() {
 	auto &context = lexerStack.back();
 	auto c = context.file.sbumpc();
@@ -254,23 +418,7 @@ yy::parser::symbol_type yylex() {
 	// Alright, what token should we return?
 	if (c == EOF) {
-		// Basically yywrap().
+		return yywrap();
 		if (lexerStack.size() != 1) {
 			if (!atEof) {
 				// Inject a newline at EOF to simplify parsing.
 				atEof = true;
 				return yy::parser::make_newline();
 			} else {
 				lexerStack.pop_back();
 				return yylex();
 			}
 		} else if (!atEof) {
 			// Inject a newline at EOF to simplify parsing.
 			atEof = true;
 			return yy::parser::make_newline();
 		} else {
 			return yy::parser::make_YYEOF();
 		}
 	} else if (c == ',') {
 		return yy::parser::make_COMMA();
 	} else if (isNewline(c)) {
@@ -280,85 +428,21 @@ yy::parser::symbol_type yylex() {
 		}
 		return yy::parser::make_newline();
 	} else if (c == '"') {
-		std::string str;
+		return parseString();
 		for (c = context.file.sgetc(); c != '"'; c = context.file.sgetc()) {
 			if (c == EOF || isNewline(c)) {
 				scriptError(context, "Unterminated string");
 				break;
 			}
 			context.file.sbumpc();
 			if (c == '\\') {
 				c = context.file.sgetc();
 				if (c == EOF || isNewline(c)) {
 					scriptError(context, "Unterminated string");
 					break;
 				} else if (c == 'n') {
 					c = '\n';
 				} else if (c == 'r') {
 					c = '\r';
 				} else if (c == 't') {
 					c = '\t';
 				} else if (c != '\\' && c != '"' && c != '\'') {
 					scriptError(context, "Cannot escape character %s", printChar(c));
 				}
 				context.file.sbumpc();
 			}
 			str.push_back(c);
 		}
 		if (c == '"') {
 			context.file.sbumpc();
 		}
 		return yy::parser::make_string(std::move(str));
 	} else if (c == '$') {
-		c = context.file.sgetc();
+		return parseHexNumber("$");
 		if (!isHexDigit(c)) {
 			scriptError(context, "No hexadecimal digits found after '$'");
 			return yy::parser::make_number(0);
 		}
 		uint32_t number = parseHexDigit(c);
 		context.file.sbumpc();
 		for (c = context.file.sgetc(); isHexDigit(c); c = context.file.sgetc()) {
 			number = number * 16 + parseHexDigit(c);
 			context.file.sbumpc();
 		}
 		return yy::parser::make_number(number);
 	} else if (c == '%') {
-		c = context.file.sgetc();
+		return parseBinNumber("%");
-		if (!isBinDigit(c)) {
+	} else if (c == '&') {
-			scriptError(context, "No binary digits found after '%%'");
+		return parseOctNumber("&");
 			return yy::parser::make_number(0);
 		}
 		uint32_t number = c - '0';
 		context.file.sbumpc();
 		for (c = context.file.sgetc(); isBinDigit(c); c = context.file.sgetc()) {
 			number = number * 2 + (c - '0');
 			context.file.sbumpc();
 		}
 		return yy::parser::make_number(number);
 	} else if (isDecDigit(c)) {
-		uint32_t number = c - '0';
+		return parseNumber(c);
 		for (c = context.file.sgetc(); isDecDigit(c); c = context.file.sgetc()) {
 			number = number * 10 + (c - '0');
 			context.file.sbumpc();
 		}
 		return yy::parser::make_number(number);
 	} else if (isIdentChar(c)) { // Note that we match these *after* digit characters!
-		std::string ident;
+		std::string ident = readIdent(c);
 		auto strUpperCmp = [](char cmp, char ref) {
 			// `locale::classic()` yields the "C" locale.
 			assume(!std::use_facet<std::ctype<char>>(std::locale::classic())
 			            .is(std::ctype_base::lower, ref));
 			return std::use_facet<std::ctype<char>>(std::locale::classic()).toupper(cmp) == ref;
 		};
-		ident.push_back(c);
+		auto strUpperCmp = [](char cmp, char ref) {
-		for (c = context.file.sgetc(); isIdentChar(c); c = context.file.snextc()) {
+			return toupper(cmp) == ref;
-			ident.push_back(c);
+		};
 		}
 		for (SectionType type : EnumSeq(SECTTYPE_INVALID)) {
 			if (std::equal(RANGE(ident), RANGE(sectionTypeInfo[type].name), strUpperCmp)) {
--- a/test/link/script-num-fmt.link
+++ b/test/link/script-num-fmt.link
@@ -1,5 +1,9 @@
 ROM0
-	org 42
+	org 4_2
-	org %101010
+	org %10_10_10
-	org $2A
+	org &52_
 	org $2A_
 	org 0b101_010
 	org 0o5_2
 	org 0x2_A
 	org 41 ; Error!
--- a/test/link/script-num-fmt.out
+++ b/test/link/script-num-fmt.out
@@ -1,2 +1,2 @@
-error: script-num-fmt.link(5): Cannot decrease the current address (from $002a to $0029)
+error: script-num-fmt.link(9): Cannot decrease the current address (from $002a to $0029)
 Linking failed with 1 error
--- a/test/link/script-okay.link
+++ b/test/link/script-okay.link
@@ -0,0 +1,8 @@
 ROM0
 	"ROM0"
 	"\\\"\'\n\r\t\0"
 ROMX 1
 	"ROM1"
 ROMX 2
 	"ROM2 1K"
 	"ROM2 1"
--- a/test/link/script-okay.out
+++ b/test/link/script-okay.out
--- a/test/link/script.asm
+++ b/test/link/script.asm
@@ -7,3 +7,4 @@ SECTION "ROM2 1K", ROMX,BANK[2]
 	ds $1000
 SECTION "ROM2 1", ROMX,BANK[2]
 	ds 1
 SECTION "\\\"\'\n\r\t\0", ROM0
`@@ -1,2 +1,2 @@`
	`error: script-num-fmt.link(5): Cannot decrease the current address (from $002a to $0029)`	`error: script-num-fmt.link(9): Cannot decrease the current address (from $002a to $0029)`
	`Linking failed with 1 error`	`Linking failed with 1 error`