Implement a '#' prefix for raw identifiers that may alias keywords (#1480)

* Implement a '#' prefix for raw identifiers that may alias keywords
* Review comments
* Disallow hashless raw identifiers in interpolations
* Run clang-format
2025-11-20 10:12:06 +00:00 · 2024-08-21 13:31:44 -04:00
parent 82e81ab1da
commit b438c83bda
11 changed files with 114 additions and 13 deletions
--- a/src/asm/lexer.cpp
+++ b/src/asm/lexer.cpp
@@ -596,7 +596,16 @@ static uint32_t readBracketedMacroArgNum() {

 	if (c >= '0' && c <= '9') {
 		num = readNumber(10, 0);
-	} else if (startsIdentifier(c)) {
+	} else if (startsIdentifier(c) || c == '#') {
+		if (c == '#') {
+			shiftChar();
+			c = peek();
+			if (!startsIdentifier(c)) {
+				error("Empty raw symbol in bracketed macro argument\n");
+				return 0;
+			}
+		}
+
 		std::string symName;

 		for (; continuesIdentifier(c); c = peek()) {
@@ -1138,8 +1147,7 @@ static bool continuesIdentifier(int c) {
 	return startsIdentifier(c) || (c <= '9' && c >= '0') || c == '#' || c == '@';
 }

-static Token readIdentifier(char firstChar) {
-	// Lex while checking for a keyword
+static Token readIdentifier(char firstChar, bool raw) {
 	std::string identifier(1, firstChar);
 	int tokenType = firstChar == '.' ? T_(LOCAL_ID) : T_(ID);

@@ -1155,9 +1163,13 @@ static Token readIdentifier(char firstChar) {
 			tokenType = T_(LOCAL_ID);
 	}

-	// Attempt to check for a keyword
-	auto search = keywordDict.find(identifier.c_str());
-	return search != keywordDict.end() ? Token(search->second) : Token(tokenType, identifier);
+	// Attempt to check for a keyword if the identifier is not raw
+	if (!raw) {
+		if (auto search = keywordDict.find(identifier.c_str()); search != keywordDict.end())
+			return Token(search->second);
+	}
+
+	return Token(tokenType, identifier);
 }

 // Functions to read strings
@@ -1207,6 +1219,19 @@ static std::shared_ptr<std::string> readInterpolation(size_t depth) {
 	// Don't return before `lexerState->disableInterpolation` is reset!
 	lexerState->disableInterpolation = disableInterpolation;

+	if (fmtBuf.starts_with('#')) {
+		// Skip a '#' raw identifier prefix, but after expanding any nested interpolations.
+		fmtBuf.erase(0, 1);
+	} else if (keywordDict.find(fmtBuf.c_str()) != keywordDict.end()) {
+		// Don't allow symbols that alias keywords without a '#' prefix.
+		error(
+		    "Interpolated symbol \"%s\" is a reserved keyword; add a '#' prefix to use it as a raw "
+		    "symbol\n",
+		    fmtBuf.c_str()
+		);
+		return nullptr;
+	}
+
 	Symbol const *sym = sym_FindScopedValidSymbol(fmtBuf);

 	if (!sym || !sym->isDefined()) {
@@ -1781,8 +1806,13 @@ static Token yylex_NORMAL() {
 			// Handle identifiers... or report garbage characters

 		default:
+			bool raw = c == '#';
+			if (raw && startsIdentifier(peek())) {
+				c = nextChar();
+			}
+
 			if (startsIdentifier(c)) {
-				Token token = readIdentifier(c);
+				Token token = readIdentifier(c, raw);

 				// An ELIF after a taken IF needs to not evaluate its condition
 				if (token.type == T_(POP_ELIF) && lexerState->lastToken == T_(NEWLINE)
@@ -2017,7 +2047,7 @@ static Token skipIfBlock(bool toEndc) {

 			if (startsIdentifier(c)) {
 				shiftChar();
-				switch (Token token = readIdentifier(c); token.type) {
+				switch (Token token = readIdentifier(c, false); token.type) {
 				case T_(POP_IF):
 					lexer_IncIFDepth();
 					break;
@@ -2103,7 +2133,7 @@ static Token yylex_SKIP_TO_ENDR() {

 			if (startsIdentifier(c)) {
 				shiftChar();
-				switch (readIdentifier(c).type) {
+				switch (readIdentifier(c, false).type) {
 				case T_(POP_FOR):
 				case T_(POP_REPT):
 					depth++;
@@ -2250,7 +2280,7 @@ Capture lexer_CaptureRept() {
 		} while (isWhitespace(c));
 		// Now, try to match `REPT`, `FOR` or `ENDR` as a **whole** identifier
 		if (startsIdentifier(c)) {
-			switch (readIdentifier(c).type) {
+			switch (readIdentifier(c, false).type) {
 			case T_(POP_REPT):
 			case T_(POP_FOR):
 				depth++;
@@ -2303,7 +2333,7 @@ Capture lexer_CaptureMacro() {
 		} while (isWhitespace(c));
 		// Now, try to match `ENDM` as a **whole** identifier
 		if (startsIdentifier(c)) {
-			switch (readIdentifier(c).type) {
+			switch (readIdentifier(c, false).type) {
 			case T_(POP_ENDM):
 				endCapture(capture);
 				// The ENDM has been captured, but we don't want it!