Consolidate, refactor, and bugfix the lexer's handling of captures and skips (#1957)

- Do not error about local labels following keywords in skips or captures (fixes #1955)
- Do not incompletely attempt to handle line continuations in skips (fixes #1956)
- Rename `skipToLeadingIdentifier` to `skipToLeadingKeyword`, refactor to merge
  `skipToEOL` into it, and use it for both skips and captures
This commit is contained in:
Rangi
2026-04-20 13:04:20 +02:00
committed by GitHub
parent 12186fdccc
commit cfec017fed
5 changed files with 105 additions and 57 deletions
+44 -57
View File
@@ -2142,42 +2142,39 @@ finish: // Can't `break` out of a nested `for`-`switch`
return Token(T_(YYEOF));
}
static int skipPastEOL() {
if (lexerState->atLineStart) {
lexerState->atLineStart = false;
return skipChars(isBlankSpace);
}
// This function is called when capturing `REPT`/`FOR` loops and `MACRO` bodies,
// and when skipping unexecuted `IF`/`ELIF`/`ELSE` blocks and `REPT`/`FOR` loops.
// It expects that these constructs' `ENDC`/`ENDR`/`ENDM` closing tokens are only
// valid at the start of their lines, which enables ignoring everything except
// the leading keyword in lines that have one (as well as line continuations).
//
// Note that when these constructs are *evaluated*, they can perform expansions
// (for macro args, interpolations, and macro invocations) which may produce
// tokens that would change how these constructs were captured or skipped, if
// they had been produced during the capture/skip non-evaluating phase.
static Token skipToLeadingKeyword() {
for (;;) {
if (lexerState->atLineStart) {
lexerState->atLineStart = false;
if (int c = skipChars(isBlankSpace); c == EOF) {
return Token(T_(YYEOF));
} else if (startsIdentifier(c) && c != '.') {
shiftChar();
std::string keyword(1, c);
for (c = peek(); continuesIdentifier(c) && c != '.'; c = nextChar()) {
keyword += c;
}
if (auto search = keywords.find(keyword); search != keywords.end()) {
return Token(search->second);
}
}
}
if (int c = bumpChar(); c == EOF) {
return EOF;
return Token(T_(YYEOF));
} else if (isNewline(c)) {
handleCRLF(c);
nextLine();
return skipChars(isBlankSpace);
} else if (c == '\\') {
// Unconditionally skip the next char, including line continuations
c = bumpChar();
if (isNewline(c)) {
handleCRLF(c);
nextLine();
}
}
}
}
// This function uses the fact that `IF` and `REPT` constructs are only valid
// when there's nothing before them on their lines. This enables filtering
// "meaningful" tokens (at line start) vs. "meaningless" (everything else) ones.
// It's especially important due to macro args not being handled in this
// state, and lexing them in "normal" mode potentially producing such tokens.
static Token skipToLeadingIdentifier() {
for (;;) {
if (int c = skipPastEOL(); c == EOF) {
return Token(T_(YYEOF));
} else if (startsIdentifier(c)) {
shiftChar();
return readIdentifier(c, false);
lexerState->atLineStart = true;
}
}
}
@@ -2187,7 +2184,7 @@ static Token skipIfBlock(bool toEndc) {
Defer reenableExpansions = scopedDisableExpansions();
for (uint32_t startingDepth = lexer_GetIFDepth();;) {
switch (Token token = skipToLeadingIdentifier(); token.type) {
switch (Token token = skipToLeadingKeyword(); token.type) {
case T_(YYEOF):
return token;
@@ -2241,7 +2238,7 @@ static Token yylex_SKIP_TO_ENDR() {
// context, which yields an EOF.
Defer reenableExpansions = scopedDisableExpansions();
for (;;) {
switch (Token token = skipToLeadingIdentifier(); token.type) {
switch (Token token = skipToLeadingKeyword(); token.type) {
case T_(YYEOF):
return token;
@@ -2323,38 +2320,28 @@ static Capture makeCapture(char const *name, CallbackFnT callback) {
assume(capture.span.ptr == nullptr);
}
nextLine();
Defer reenableExpansions = scopedDisableExpansions();
for (;;) {
nextLine();
if (int c = skipChars(isBlankSpace); startsIdentifier(c)) {
shiftChar();
int tokenType = readIdentifier(c, false).type;
if (size_t endTokenLength = callback(tokenType); endTokenLength > 0) {
if (!capture.span.ptr) {
// Retrieve the capture buffer now that we're done capturing
capture.span.ptr = lexerState->makeSharedCaptureBufPtr();
}
// Subtract the length of the ending token; we know we have read it exactly, not
// e.g. an interpolation or EQUS expansion, since those are disabled.
capture.span.size = lexerState->captureSize - endTokenLength;
break;
}
}
// Just consume characters until EOL or EOF
if (int c = skipChars([](int d) { return d != EOF && !isNewline(d); }); c == EOF) {
if (Token token = skipToLeadingKeyword(); token.type == T_(YYEOF)) {
error("Unterminated %s", name);
capture.span = {.ptr = nullptr, .size = lexerState->captureSize};
break;
} else {
assume(isNewline(c));
shiftChar();
handleCRLF(c);
} else if (size_t endTokenLength = callback(token.type); endTokenLength > 0) {
if (!capture.span.ptr) {
// Retrieve the capture buffer now that we're done capturing
capture.span.ptr = lexerState->makeSharedCaptureBufPtr();
}
// Subtract the length of the ending token; we know we have read it exactly,
// not e.g. an interpolation or EQUS expansion, since those are disabled.
capture.span.size = lexerState->captureSize - endTokenLength;
break;
}
}
lexerState->atLineStart = false; // The ending token or EOF puts us past the start of the line
assume(!lexerState->atLineStart); // `skipToLeadingKeyword` moves past the start of the line
lexerState->capturing = false;
lexerState->captureBuf = nullptr;
return capture;
+34
View File
@@ -0,0 +1,34 @@
MACRO m
ENDM
IF 0
m
ENDC
IF 0
m \
ENDC
IF 1
m
ELSE
m
ENDC
IF 1
m
ELSE
m \
ENDC
IF 1
m
ELIF 0
m
ENDC
IF 1
m
ELIF 0
m \
ENDC
+20
View File
@@ -0,0 +1,20 @@
section "test", rom0
if 0
section.local "oops"
else
println "*sips coffee*"
endc
rept 0
assert.local "lol"
endr
rept 1
println "this is fine"
endr
macro m
db.local 42
endm
db.local 123
+5
View File
@@ -0,0 +1,5 @@
error: Identifier "db.local" begins with a keyword; did you mean to put a space between them?
at local-after-keyword.asm(20)
error: syntax error, unexpected number
at local-after-keyword.asm(20)
Assembly aborted with 2 errors
+2
View File
@@ -0,0 +1,2 @@
*sips coffee*
this is fine