Refactor some redundant lexer code

This commit is contained in:
Rangi42
2025-08-04 19:06:57 -04:00
parent 23ce888d65
commit 0c1b422c36

View File

@@ -257,6 +257,10 @@ static bool isWhitespace(int c) {
return c == ' ' || c == '\t'; return c == ' ' || c == '\t';
} }
// Reports whether `c` is a line-terminator character: carriage return or line feed.
// The parameter is an `int` so that a value obtained from `peek()` (which may be EOF)
// can be tested directly; anything other than '\r' or '\n' yields `false`.
static bool isNewline(int c) {
	switch (c) {
	case '\r':
	case '\n':
		return true;
	default:
		return false;
	}
}
static LexerState *lexerState = nullptr; static LexerState *lexerState = nullptr;
static LexerState *lexerStateEOL = nullptr; static LexerState *lexerStateEOL = nullptr;
@@ -288,6 +292,13 @@ static void nextLine() {
++lexerState->lineNo; ++lexerState->lineNo;
} }
// Calls `nextLine()` only when the lexer's expansion stack is empty;
// does nothing while any expansion is active.
static void nextLineOutsideExpansion() {
	// Newlines read within an expansion should not increase the line count
	if (!lexerState->expansions.empty()) {
		return;
	}
	nextLine();
}
uint32_t lexer_GetIFDepth() { uint32_t lexer_GetIFDepth() {
return lexerState->ifStack.size(); return lexerState->ifStack.size();
} }
@@ -757,7 +768,9 @@ static int peek() {
static void shiftChar() { static void shiftChar() {
if (lexerState->capturing) { if (lexerState->capturing) {
if (lexerState->captureBuf) { if (lexerState->captureBuf) {
lexerState->captureBuf->push_back(peek()); int c = peek();
assume(c != EOF); // Avoid calling `shiftChar()` when it could be EOF while capturing
lexerState->captureBuf->push_back(c);
} }
++lexerState->captureSize; ++lexerState->captureSize;
} }
@@ -801,9 +814,7 @@ static bool consumeChar(int c) {
static int bumpChar() { static int bumpChar() {
int c = peek(); int c = peek();
if (c != EOF) { shiftChar();
shiftChar();
}
return c; return c;
} }
@@ -872,9 +883,7 @@ static void discardBlockComment() {
handleCRLF(c); handleCRLF(c);
[[fallthrough]]; [[fallthrough]];
case '\n': case '\n':
if (lexerState->expansions.empty()) { nextLineOutsideExpansion();
nextLine();
}
continue; continue;
case '/': case '/':
if (peek() == '*') { if (peek() == '*') {
@@ -895,23 +904,17 @@ static void discardBlockComment() {
static void discardComment() { static void discardComment() {
Defer reenableExpansions = scopedDisableExpansions(); Defer reenableExpansions = scopedDisableExpansions();
for (;; shiftChar()) { skipChars([](int c) { return c != EOF && !isNewline(c); });
if (int c = peek(); c == EOF || c == '\r' || c == '\n') {
break;
}
}
} }
static void discardLineContinuation() { static void discardLineContinuation() {
for (;;) { for (;;) {
if (int c = peek(); isWhitespace(c)) { if (int c = peek(); isWhitespace(c)) {
shiftChar(); shiftChar();
} else if (c == '\r' || c == '\n') { } else if (isNewline(c)) {
shiftChar(); shiftChar();
handleCRLF(c); handleCRLF(c);
if (lexerState->expansions.empty()) { nextLineOutsideExpansion();
nextLine();
}
break; break;
} else if (c == ';') { } else if (c == ';') {
discardComment(); discardComment();
@@ -1244,7 +1247,7 @@ static std::pair<Symbol const *, std::shared_ptr<std::string>> readInterpolation
beginExpansion(interp.second, interp.first->name); beginExpansion(interp.second, interp.first->name);
} }
continue; // Restart, reading from the new buffer continue; // Restart, reading from the new buffer
} else if (int c = peek(); c == EOF || c == '\r' || c == '\n' || c == '"') { } else if (int c = peek(); c == EOF || isNewline(c) || c == '"') {
error("Missing }"); error("Missing }");
break; break;
} else if (c == '}') { } else if (c == '}') {
@@ -1464,7 +1467,7 @@ static void readString(std::string &str, bool rawString) {
int c = peek(); int c = peek();
// '\r', '\n' or EOF ends a single-line string early // '\r', '\n' or EOF ends a single-line string early
if (c == EOF || (!multiline && (c == '\r' || c == '\n'))) { if (c == EOF || (!multiline && isNewline(c))) {
error("Unterminated string"); error("Unterminated string");
return; return;
} }
@@ -1473,7 +1476,7 @@ static void readString(std::string &str, bool rawString) {
shiftChar(); shiftChar();
// Handle '\r' or '\n' (in multiline strings only, already handled above otherwise) // Handle '\r' or '\n' (in multiline strings only, already handled above otherwise)
if (c == '\r' || c == '\n') { if (isNewline(c)) {
handleCRLF(c); handleCRLF(c);
nextLine(); nextLine();
str += '\n'; str += '\n';
@@ -1948,7 +1951,7 @@ static Token yylex_RAW() {
} else if (c == '\\') { } else if (c == '\\') {
c = nextChar(); c = nextChar();
// If not a line continuation, handle as a normal char // If not a line continuation, handle as a normal char
if (!isWhitespace(c) && c != '\n' && c != '\r') { if (!isWhitespace(c) && !isNewline(c)) {
goto backslash; goto backslash;
} }
// Line continuations count as "whitespace" // Line continuations count as "whitespace"
@@ -2096,9 +2099,10 @@ finish: // Can't `break` out of a nested `for`-`switch`
if (!str.empty()) { if (!str.empty()) {
return Token(T_(STRING), str); return Token(T_(STRING), str);
} }
lexer_SetMode(LEXER_NORMAL); lexer_SetMode(LEXER_NORMAL);
if (c == '\r' || c == '\n') { if (isNewline(c)) {
shiftChar(); shiftChar();
handleCRLF(c); handleCRLF(c);
return Token(T_(NEWLINE)); return Token(T_(NEWLINE));
@@ -2107,54 +2111,57 @@ finish: // Can't `break` out of a nested `for`-`switch`
return Token(T_(YYEOF)); return Token(T_(YYEOF));
} }
// This function uses the fact that `if`, etc. constructs are only valid when static int skipPastEOL() {
// there's nothing before them on their lines. This enables filtering if (lexerState->atLineStart) {
// "meaningful" (= at line start) vs. "meaningless" (everything else) tokens. lexerState->atLineStart = false;
// It's especially important due to macro args not being handled in this return skipChars(isWhitespace);
// state, and lexing them in "normal" mode potentially producing such tokens. }
static Token skipIfBlock(bool toEndc) {
lexer_SetMode(LEXER_NORMAL);
uint32_t startingDepth = lexer_GetIFDepth();
bool atLineStart = lexerState->atLineStart; for (;;) {
Defer notAtLineStart{[&] { lexerState->atLineStart = false; }}; if (int c = bumpChar(); c == EOF) {
return EOF;
Defer reenableExpansions = scopedDisableExpansions(); } else if (isNewline(c)) {
handleCRLF(c);
for (int c;; atLineStart = false) { nextLine();
// Read chars until EOL return skipChars(isWhitespace);
while (!atLineStart) { } else if (c == '\\') {
// Unconditionally skip the next char, including line continuations
c = bumpChar(); c = bumpChar();
if (isNewline(c)) {
if (c == EOF) {
return Token(T_(YYEOF));
} else if (c == '\\') {
// Unconditionally skip the next char, including line continuations
c = bumpChar();
} else if (c == '\r' || c == '\n') {
atLineStart = true;
}
if (c == '\r' || c == '\n') {
handleCRLF(c); handleCRLF(c);
// Do this both on line continuations and plain EOLs
nextLine(); nextLine();
} }
} }
}
}
// Skip leading whitespace // This function uses the fact that `IF` and `REPT` constructs are only valid
for (;; shiftChar()) { // when there's nothing before them on their lines. This enables filtering
c = peek(); // "meaningful" tokens (at line start) vs. "meaningless" (everything else) ones.
if (!isWhitespace(c)) { // It's especially important due to macro args not being handled in this
break; // state, and lexing them in "normal" mode potentially producing such tokens.
} static Token skipToLeadingIdentifier() {
for (;;) {
if (int c = skipPastEOL(); c == EOF) {
return Token(T_(YYEOF));
} else if (startsIdentifier(c)) {
shiftChar();
return readIdentifier(c, false);
} }
}
}
static Token skipIfBlock(bool toEndc) {
uint32_t startingDepth = lexer_GetIFDepth();
lexer_SetMode(LEXER_NORMAL);
Defer reenableExpansions = scopedDisableExpansions();
for (;;) {
switch (Token token = skipToLeadingIdentifier(); token.type) {
case T_(YYEOF):
return token;
if (!startsIdentifier(c)) {
continue;
}
shiftChar();
switch (Token token = readIdentifier(c, false); token.type) {
case T_(POP_IF): case T_(POP_IF):
lexer_IncIFDepth(); lexer_IncIFDepth();
break; break;
@@ -2185,9 +2192,6 @@ static Token skipIfBlock(bool toEndc) {
} }
lexer_DecIFDepth(); lexer_DecIFDepth();
break; break;
default:
break;
} }
} }
} }
@@ -2203,39 +2207,15 @@ static Token yylex_SKIP_TO_ENDC() {
static Token yylex_SKIP_TO_ENDR() { static Token yylex_SKIP_TO_ENDR() {
lexer_SetMode(LEXER_NORMAL); lexer_SetMode(LEXER_NORMAL);
bool atLineStart = lexerState->atLineStart; // This does not have to look for an `ENDR` token because the entire `REPT` or `FOR` body has
Defer notAtLineStart{[&] { lexerState->atLineStart = false; }}; // been captured into the current fstack context, so it can just skip to the end of that
// context, which yields an EOF.
Defer reenableExpansions = scopedDisableExpansions(); Defer reenableExpansions = scopedDisableExpansions();
for (;;) {
switch (Token token = skipToLeadingIdentifier(); token.type) {
case T_(YYEOF):
return token;
for (int c;; atLineStart = false) {
// Read chars until EOL
while (!atLineStart) {
c = bumpChar();
if (c == EOF) {
return Token(T_(YYEOF));
} else if (c == '\\') {
// Unconditionally skip the next char, including line continuations
c = bumpChar();
} else if (c == '\r' || c == '\n') {
atLineStart = true;
}
if (c == '\r' || c == '\n') {
handleCRLF(c);
// Do this both on line continuations and plain EOLs
nextLine();
}
}
c = skipChars(isWhitespace);
if (!startsIdentifier(c)) {
continue;
}
shiftChar();
switch (readIdentifier(c, false).type) {
case T_(POP_IF): case T_(POP_IF):
lexer_IncIFDepth(); lexer_IncIFDepth();
break; break;
@@ -2243,9 +2223,6 @@ static Token yylex_SKIP_TO_ENDR() {
case T_(POP_ENDC): case T_(POP_ENDC):
lexer_DecIFDepth(); lexer_DecIFDepth();
break; break;
default:
break;
} }
} }
} }
@@ -2258,9 +2235,8 @@ yy::parser::symbol_type yylex() {
if (lexerState->lastToken == T_(EOB) && yywrap()) { if (lexerState->lastToken == T_(EOB) && yywrap()) {
return yy::parser::make_YYEOF(); return yy::parser::make_YYEOF();
} }
// Newlines read within an expansion should not increase the line count if (lexerState->atLineStart) {
if (lexerState->atLineStart && lexerState->expansions.empty()) { nextLineOutsideExpansion();
nextLine();
} }
static Token (* const lexerModeFuncs[NB_LEXER_MODES])() = { static Token (* const lexerModeFuncs[NB_LEXER_MODES])() = {
@@ -2338,23 +2314,20 @@ Capture lexer_CaptureRept() {
Capture capture = startCapture(); Capture capture = startCapture();
Defer reenableExpansions = scopedDisableExpansions(); Defer reenableExpansions = scopedDisableExpansions();
for (size_t depth = 0;;) {
size_t depth = 0;
for (int c;;) {
nextLine(); nextLine();
// We're at line start, so attempt to match a `REPT` or `ENDR` token int c = skipChars(isWhitespace);
do { // Discard initial whitespace if (c != EOF) {
c = bumpChar(); shiftChar();
} while (isWhitespace(c)); }
// Now, try to match `REPT`, `FOR` or `ENDR` as a **whole** keyword
// We're at line start, so attempt to match a `REPT`, `FOR`, or `ENDR` token
if (startsIdentifier(c)) { if (startsIdentifier(c)) {
switch (readIdentifier(c, false).type) { switch (readIdentifier(c, false).type) {
case T_(POP_REPT): case T_(POP_REPT):
case T_(POP_FOR): case T_(POP_FOR):
++depth; ++depth;
break; // Ignore the rest of that line break; // Ignore the rest of that line
case T_(POP_ENDR): case T_(POP_ENDR):
if (depth) { if (depth) {
--depth; --depth;
@@ -2365,23 +2338,24 @@ Capture lexer_CaptureRept() {
// We know we have read exactly "ENDR", not e.g. an EQUS // We know we have read exactly "ENDR", not e.g. an EQUS
capture.span.size -= literal_strlen("ENDR"); capture.span.size -= literal_strlen("ENDR");
return capture; return capture;
default:
break;
} }
} }
// Just consume characters until EOL or EOF // Just consume characters until EOL or EOF
for (;; c = bumpChar()) { for (;;) {
if (c == EOF) { if (c == EOF) {
error("Unterminated REPT/FOR block"); error("Unterminated REPT/FOR block");
endCapture(capture); endCapture(capture);
capture.span.ptr = nullptr; // Indicates that it reached EOF before an ENDR capture.span.ptr = nullptr; // Indicates that it reached EOF before an ENDR
return capture; return capture;
} else if (c == '\n' || c == '\r') { } else if (isNewline(c)) {
handleCRLF(c); handleCRLF(c);
break; break;
} }
c = peek();
if (c != EOF) {
shiftChar();
}
} }
} }
} }
@@ -2390,14 +2364,14 @@ Capture lexer_CaptureMacro() {
Capture capture = startCapture(); Capture capture = startCapture();
Defer reenableExpansions = scopedDisableExpansions(); Defer reenableExpansions = scopedDisableExpansions();
for (;;) {
for (int c;;) {
nextLine(); nextLine();
int c = skipChars(isWhitespace);
if (c != EOF) {
shiftChar();
}
// We're at line start, so attempt to match an `ENDM` token // We're at line start, so attempt to match an `ENDM` token
do { // Discard initial whitespace
c = bumpChar();
} while (isWhitespace(c));
// Now, try to match `ENDM` as a **whole** keyword
if (startsIdentifier(c) && readIdentifier(c, false).type == T_(POP_ENDM)) { if (startsIdentifier(c) && readIdentifier(c, false).type == T_(POP_ENDM)) {
endCapture(capture); endCapture(capture);
// The ENDM has been captured, but we don't want it! // The ENDM has been captured, but we don't want it!
@@ -2407,16 +2381,20 @@ Capture lexer_CaptureMacro() {
} }
// Just consume characters until EOL or EOF // Just consume characters until EOL or EOF
for (;; c = bumpChar()) { for (;;) {
if (c == EOF) { if (c == EOF) {
error("Unterminated macro definition"); error("Unterminated macro definition");
endCapture(capture); endCapture(capture);
capture.span.ptr = nullptr; // Indicates that it reached EOF before an ENDM capture.span.ptr = nullptr; // Indicates that it reached EOF before an ENDM
return capture; return capture;
} else if (c == '\n' || c == '\r') { } else if (isNewline(c)) {
handleCRLF(c); handleCRLF(c);
break; break;
} }
c = peek();
if (c != EOF) {
shiftChar();
}
} }
} }
} }