Refactor some redundant lexer code

2026-05-15 06:11:41 +00:00 · 2025-08-04 19:06:57 -04:00
parent 23ce888d65
commit 0c1b422c36
1 changed files with 101 additions and 123 deletions
@@ -257,6 +257,10 @@ static bool isWhitespace(int c) {
 	return c == ' ' || c == '\t';
 }
 // Tells whether `c` is an end-of-line character: carriage return or line feed.
 // Any other value (including EOF) yields `false`.
 static bool isNewline(int c) {
 	switch (c) {
 	case '\n':
 	case '\r':
 		return true;
 	default:
 		return false;
 	}
 }
 static LexerState *lexerState = nullptr;
 static LexerState *lexerStateEOL = nullptr;
@@ -288,6 +292,13 @@ static void nextLine() {
 	++lexerState->lineNo;
 }
 // Calls `nextLine()` only when the lexer state has no active expansions.
 static void nextLineOutsideExpansion() {
 	// Newlines read within an expansion should not increase the line count
 	if (!lexerState->expansions.empty()) {
 		return;
 	}
 	nextLine();
 }
 // Reports the number of entries currently held in the lexer state's `ifStack`.
 uint32_t lexer_GetIFDepth() {
 	auto const &stack = lexerState->ifStack;
 	// Same conversion to `uint32_t` that the return type would apply implicitly, made explicit
 	return static_cast<uint32_t>(stack.size());
 }
@@ -757,7 +768,9 @@ static int peek() {
 static void shiftChar() {
 	if (lexerState->capturing) {
 		if (lexerState->captureBuf) {
-			lexerState->captureBuf->push_back(peek());
+			int c = peek();
 			assume(c != EOF); // Avoid calling `shiftChar()` when it could be EOF while capturing
 			lexerState->captureBuf->push_back(c);
 		}
 		++lexerState->captureSize;
 	}
@@ -801,9 +814,7 @@ static bool consumeChar(int c) {
 static int bumpChar() {
 	int c = peek();
-	if (c != EOF) {
+	shiftChar();
 		shiftChar();
 	}
 	return c;
 }
@@ -872,9 +883,7 @@ static void discardBlockComment() {
 			handleCRLF(c);
 			[[fallthrough]];
 		case '\n':
-			if (lexerState->expansions.empty()) {
+			nextLineOutsideExpansion();
 				nextLine();
 			}
 			continue;
 		case '/':
 			if (peek() == '*') {
@@ -895,23 +904,17 @@ static void discardBlockComment() {
 static void discardComment() {
 	Defer reenableExpansions = scopedDisableExpansions();
-	for (;; shiftChar()) {
+	skipChars([](int c) { return c != EOF && !isNewline(c); });
 		if (int c = peek(); c == EOF || c == '\r' || c == '\n') {
 			break;
 		}
 	}
 }
 static void discardLineContinuation() {
 	for (;;) {
 		if (int c = peek(); isWhitespace(c)) {
 			shiftChar();
-		} else if (c == '\r' || c == '\n') {
+		} else if (isNewline(c)) {
 			shiftChar();
 			handleCRLF(c);
-			if (lexerState->expansions.empty()) {
+			nextLineOutsideExpansion();
 				nextLine();
 			}
 			break;
 		} else if (c == ';') {
 			discardComment();
@@ -1244,7 +1247,7 @@ static std::pair<Symbol const *, std::shared_ptr<std::string>> readInterpolation
 				beginExpansion(interp.second, interp.first->name);
 			}
 			continue; // Restart, reading from the new buffer
-		} else if (int c = peek(); c == EOF || c == '\r' || c == '\n' || c == '"') {
+		} else if (int c = peek(); c == EOF || isNewline(c) || c == '"') {
 			error("Missing }");
 			break;
 		} else if (c == '}') {
@@ -1464,7 +1467,7 @@ static void readString(std::string &str, bool rawString) {
 		int c = peek();
 		// '\r', '\n' or EOF ends a single-line string early
-		if (c == EOF || (!multiline && (c == '\r' || c == '\n'))) {
+		if (c == EOF || (!multiline && isNewline(c))) {
 			error("Unterminated string");
 			return;
 		}
@@ -1473,7 +1476,7 @@ static void readString(std::string &str, bool rawString) {
 		shiftChar();
 		// Handle '\r' or '\n' (in multiline strings only, already handled above otherwise)
-		if (c == '\r' || c == '\n') {
+		if (isNewline(c)) {
 			handleCRLF(c);
 			nextLine();
 			str += '\n';
@@ -1948,7 +1951,7 @@ static Token yylex_RAW() {
 		} else if (c == '\\') {
 			c = nextChar();
 			// If not a line continuation, handle as a normal char
-			if (!isWhitespace(c) && c != '\n' && c != '\r') {
+			if (!isWhitespace(c) && !isNewline(c)) {
 				goto backslash;
 			}
 			// Line continuations count as "whitespace"
@@ -2096,9 +2099,10 @@ finish: // Can't `break` out of a nested `for`-`switch`
 	if (!str.empty()) {
 		return Token(T_(STRING), str);
 	}
 	lexer_SetMode(LEXER_NORMAL);
-	if (c == '\r' || c == '\n') {
+	if (isNewline(c)) {
 		shiftChar();
 		handleCRLF(c);
 		return Token(T_(NEWLINE));
@@ -2107,54 +2111,57 @@ finish: // Can't `break` out of a nested `for`-`switch`
 	return Token(T_(YYEOF));
 }
-// This function uses the fact that `if`, etc. constructs are only valid when
+static int skipPastEOL() {
-// there's nothing before them on their lines. This enables filtering
+	if (lexerState->atLineStart) {
-// "meaningful" (= at line start) vs. "meaningless" (everything else) tokens.
+		lexerState->atLineStart = false;
-// It's especially important due to macro args not being handled in this
+		return skipChars(isWhitespace);
-// state, and lexing them in "normal" mode potentially producing such tokens.
+	}
 static Token skipIfBlock(bool toEndc) {
 	lexer_SetMode(LEXER_NORMAL);
 	uint32_t startingDepth = lexer_GetIFDepth();
-	bool atLineStart = lexerState->atLineStart;
+	for (;;) {
-	Defer notAtLineStart{[&] { lexerState->atLineStart = false; }};
+		if (int c = bumpChar(); c == EOF) {
-
+			return EOF;
-	Defer reenableExpansions = scopedDisableExpansions();
+		} else if (isNewline(c)) {
-
+			handleCRLF(c);
-	for (int c;; atLineStart = false) {
+			nextLine();
-		// Read chars until EOL
+			return skipChars(isWhitespace);
-		while (!atLineStart) {
+		} else if (c == '\\') {
 			// Unconditionally skip the next char, including line continuations
 			c = bumpChar();
-
+			if (isNewline(c)) {
 			if (c == EOF) {
 				return Token(T_(YYEOF));
 			} else if (c == '\\') {
 				// Unconditionally skip the next char, including line continuations
 				c = bumpChar();
 			} else if (c == '\r' || c == '\n') {
 				atLineStart = true;
 			}
 			if (c == '\r' || c == '\n') {
 				handleCRLF(c);
 				// Do this both on line continuations and plain EOLs
 				nextLine();
 			}
 		}
 	}
 }
-		// Skip leading whitespace
+// This function uses the fact that `IF` and `REPT` constructs are only valid
-		for (;; shiftChar()) {
+// when there's nothing before them on their lines. This enables filtering
-			c = peek();
+// "meaningful" tokens (at line start) vs. "meaningless" (everything else) ones.
-			if (!isWhitespace(c)) {
+// It's especially important due to macro args not being handled in this
-				break;
+// state, and lexing them in "normal" mode potentially producing such tokens.
-			}
+static Token skipToLeadingIdentifier() {
 	for (;;) {
 		if (int c = skipPastEOL(); c == EOF) {
 			return Token(T_(YYEOF));
 		} else if (startsIdentifier(c)) {
 			shiftChar();
 			return readIdentifier(c, false);
 		}
 	}
 }
 static Token skipIfBlock(bool toEndc) {
 	uint32_t startingDepth = lexer_GetIFDepth();
 	lexer_SetMode(LEXER_NORMAL);
 	Defer reenableExpansions = scopedDisableExpansions();
 	for (;;) {
 		switch (Token token = skipToLeadingIdentifier(); token.type) {
 		case T_(YYEOF):
 			return token;
 		if (!startsIdentifier(c)) {
 			continue;
 		}
 		shiftChar();
 		switch (Token token = readIdentifier(c, false); token.type) {
 		case T_(POP_IF):
 			lexer_IncIFDepth();
 			break;
@@ -2185,9 +2192,6 @@ static Token skipIfBlock(bool toEndc) {
 			}
 			lexer_DecIFDepth();
 			break;
 		default:
 			break;
 		}
 	}
 }
@@ -2203,39 +2207,15 @@ static Token yylex_SKIP_TO_ENDC() {
 static Token yylex_SKIP_TO_ENDR() {
 	lexer_SetMode(LEXER_NORMAL);
-	bool atLineStart = lexerState->atLineStart;
+	// This does not have to look for an `ENDR` token because the entire `REPT` or `FOR` body has
-	Defer notAtLineStart{[&] { lexerState->atLineStart = false; }};
+	// been captured into the current fstack context, so it can just skip to the end of that
-
+	// context, which yields an EOF.
 	Defer reenableExpansions = scopedDisableExpansions();
 	for (;;) {
 		switch (Token token = skipToLeadingIdentifier(); token.type) {
 		case T_(YYEOF):
 			return token;
 	for (int c;; atLineStart = false) {
 		// Read chars until EOL
 		while (!atLineStart) {
 			c = bumpChar();
 			if (c == EOF) {
 				return Token(T_(YYEOF));
 			} else if (c == '\\') {
 				// Unconditionally skip the next char, including line continuations
 				c = bumpChar();
 			} else if (c == '\r' || c == '\n') {
 				atLineStart = true;
 			}
 			if (c == '\r' || c == '\n') {
 				handleCRLF(c);
 				// Do this both on line continuations and plain EOLs
 				nextLine();
 			}
 		}
 		c = skipChars(isWhitespace);
 		if (!startsIdentifier(c)) {
 			continue;
 		}
 		shiftChar();
 		switch (readIdentifier(c, false).type) {
 		case T_(POP_IF):
 			lexer_IncIFDepth();
 			break;
@@ -2243,9 +2223,6 @@ static Token yylex_SKIP_TO_ENDR() {
 		case T_(POP_ENDC):
 			lexer_DecIFDepth();
 			break;
 		default:
 			break;
 		}
 	}
 }
@@ -2258,9 +2235,8 @@ yy::parser::symbol_type yylex() {
 	if (lexerState->lastToken == T_(EOB) && yywrap()) {
 		return yy::parser::make_YYEOF();
 	}
-	// Newlines read within an expansion should not increase the line count
+	if (lexerState->atLineStart) {
-	if (lexerState->atLineStart && lexerState->expansions.empty()) {
+		nextLineOutsideExpansion();
 		nextLine();
 	}
 	static Token (* const lexerModeFuncs[NB_LEXER_MODES])() = {
@@ -2338,23 +2314,20 @@ Capture lexer_CaptureRept() {
 	Capture capture = startCapture();
 	Defer reenableExpansions = scopedDisableExpansions();
-
+	for (size_t depth = 0;;) {
 	size_t depth = 0;
 	for (int c;;) {
 		nextLine();
-		// We're at line start, so attempt to match a `REPT` or `ENDR` token
+		int c = skipChars(isWhitespace);
-		do { // Discard initial whitespace
+		if (c != EOF) {
-			c = bumpChar();
+			shiftChar();
-		} while (isWhitespace(c));
+		}
-		// Now, try to match `REPT`, `FOR` or `ENDR` as a **whole** keyword
+
 		// We're at line start, so attempt to match a `REPT`, `FOR`, or `ENDR` token
 		if (startsIdentifier(c)) {
 			switch (readIdentifier(c, false).type) {
 			case T_(POP_REPT):
 			case T_(POP_FOR):
 				++depth;
 				break; // Ignore the rest of that line
 			case T_(POP_ENDR):
 				if (depth) {
 					--depth;
@@ -2365,23 +2338,24 @@ Capture lexer_CaptureRept() {
 				// We know we have read exactly "ENDR", not e.g. an EQUS
 				capture.span.size -= literal_strlen("ENDR");
 				return capture;
 			default:
 				break;
 			}
 		}
 		// Just consume characters until EOL or EOF
-		for (;; c = bumpChar()) {
+		for (;;) {
 			if (c == EOF) {
 				error("Unterminated REPT/FOR block");
 				endCapture(capture);
 				capture.span.ptr = nullptr; // Indicates that it reached EOF before an ENDR
 				return capture;
-			} else if (c == '\n' || c == '\r') {
+			} else if (isNewline(c)) {
 				handleCRLF(c);
 				break;
 			}
 			c = peek();
 			if (c != EOF) {
 				shiftChar();
 			}
 		}
 	}
 }
@@ -2390,14 +2364,14 @@ Capture lexer_CaptureMacro() {
 	Capture capture = startCapture();
 	Defer reenableExpansions = scopedDisableExpansions();
-
+	for (;;) {
 	for (int c;;) {
 		nextLine();
 		int c = skipChars(isWhitespace);
 		if (c != EOF) {
 			shiftChar();
 		}
 		// We're at line start, so attempt to match an `ENDM` token
 		do { // Discard initial whitespace
 			c = bumpChar();
 		} while (isWhitespace(c));
 		// Now, try to match `ENDM` as a **whole** keyword
 		if (startsIdentifier(c) && readIdentifier(c, false).type == T_(POP_ENDM)) {
 			endCapture(capture);
 			// The ENDM has been captured, but we don't want it!
@@ -2407,16 +2381,20 @@ Capture lexer_CaptureMacro() {
 		}
 		// Just consume characters until EOL or EOF
-		for (;; c = bumpChar()) {
+		for (;;) {
 			if (c == EOF) {
 				error("Unterminated macro definition");
 				endCapture(capture);
 				capture.span.ptr = nullptr; // Indicates that it reached EOF before an ENDM
 				return capture;
-			} else if (c == '\n' || c == '\r') {
+			} else if (isNewline(c)) {
 				handleCRLF(c);
 				break;
 			}
 			c = peek();
 			if (c != EOF) {
 				shiftChar();
 			}
 		}
 	}
 }