From db6793f444993dd80a17163e0d76e2b2b9882260 Mon Sep 17 00:00:00 2001 From: Rangi <35663410+Rangi42@users.noreply.github.com> Date: Thu, 14 Aug 2025 10:10:59 -0400 Subject: [PATCH] Don't count single quote `'` as garbage (#1801) Also copy the "blank space" (space or tab) vs "whitespace" (space, tab, or newline) convention from `` --- Makefile | 3 ++- include/util.hpp | 3 ++- src/CMakeLists.txt | 1 + src/asm/lexer.cpp | 38 ++++++++++++++++++++++------------- src/asm/main.cpp | 2 +- src/asm/opt.cpp | 6 +++--- src/diagnostics.cpp | 2 +- src/fix/mbc.cpp | 15 +++++++------- src/gfx/main.cpp | 32 ++++++++++++++--------------- src/gfx/pal_spec.cpp | 16 +++++++-------- src/link/lexer.cpp | 4 ++-- src/link/main.cpp | 32 ++++++++++++++++------------- src/util.cpp | 10 ++++++--- test/asm/garbage_sequence.asm | 2 ++ test/asm/garbage_sequence.err | 4 +++- 15 files changed, 98 insertions(+), 72 deletions(-) diff --git a/Makefile b/Makefile index 77f00688..3d64ed20 100644 --- a/Makefile +++ b/Makefile @@ -109,7 +109,8 @@ rgbfix_obj := \ ${common_obj} \ src/fix/main.o \ src/fix/mbc.o \ - src/fix/warning.o + src/fix/warning.o \ + src/util.o rgbgfx_obj := \ ${common_obj} \ diff --git a/include/util.hpp b/include/util.hpp index a26dd393..929a2f77 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -12,8 +12,9 @@ #include "helpers.hpp" -bool isWhitespace(int c); bool isNewline(int c); +bool isBlankSpace(int c); +bool isWhitespace(int c); bool isPrintable(int c); bool startsIdentifier(int c); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 57359c22..cfc9b692 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -84,6 +84,7 @@ set(rgbfix_src "fix/main.cpp" "fix/mbc.cpp" "fix/warning.cpp" + "util.cpp" ) set(rgbgfx_src diff --git a/src/asm/lexer.cpp b/src/asm/lexer.cpp index b2593cf0..fa7e6db2 100644 --- a/src/asm/lexer.cpp +++ b/src/asm/lexer.cpp @@ -898,7 +898,7 @@ static void discardComment() { static void discardLineContinuation() { for (;;) { - if (int c = peek(); isWhitespace(c)) { + if (int c = peek(); isBlankSpace(c)) { shiftChar(); } else if (isNewline(c)) { shiftChar(); @@ -1561,8 +1561,18 @@ static Token yylex_SKIP_TO_ENDC(); // Forward declaration for `yylex_NORMAL` // Must stay in sync with the `switch` in `yylex_NORMAL`! static bool isGarbageCharacter(int c) { - return c != EOF && !continuesIdentifier(c) - && (c == '\0' || !strchr("; \t~[](),+-*/|^=!<>:&%`\"\r\n\\", c)); + // Whitespace characters are not garbage, even the non-"printable" ones + if (isWhitespace(c)) { + return false; + } + // Printable characters which are nevertheless garbage: braces should have been interpolated, + // and question mark is unused + if (c == '{' || c == '}' || c == '?') { + return true; + } + // All other printable characters are not garbage (i.e. `yylex_NORMAL` handles them), and + // all other nonprintable characters are garbage (including '\0' and EOF) + return !isPrintable(c); } static void reportGarbageCharacters(int c) { @@ -1594,7 +1604,7 @@ static Token yylex_NORMAL() { int c = bumpChar(); switch (c) { - // Ignore whitespace and comments + // Ignore blank space and comments case ';': discardComment(); @@ -1949,18 +1959,18 @@ static Token yylex_RAW() { size_t parenDepth = 0; int c; - // Trim left whitespace (stops at a block comment) + // Trim left spaces (stops at a block comment) for (;;) { c = peek(); - if (isWhitespace(c)) { + if (isBlankSpace(c)) { shiftChar(); } else if (c == '\\') { c = nextChar(); // If not a line continuation, handle as a normal char - if (!isWhitespace(c) && !isNewline(c)) { + if (!isWhitespace(c)) { goto backslash; } - // Line continuations count as "whitespace" + // Line continuations count as "space" discardLineContinuation(); } else { break; @@ -2083,8 +2093,8 @@ append: } finish: // Can't `break` out of a nested `for`-`switch` - // Trim right whitespace - auto rightPos = std::find_if_not(str.rbegin(), str.rend(), isWhitespace); + // Trim right blank space + auto rightPos = std::find_if_not(str.rbegin(), str.rend(), isBlankSpace); str.resize(rightPos.base() - str.begin()); // Returning COMMAs to the parser would mean that two consecutive commas @@ -2120,7 +2130,7 @@ finish: // Can't `break` out of a nested `for`-`switch` static int skipPastEOL() { if (lexerState->atLineStart) { lexerState->atLineStart = false; - return skipChars(isWhitespace); + return skipChars(isBlankSpace); } for (;;) { @@ -2129,7 +2139,7 @@ static int skipPastEOL() { } else if (isNewline(c)) { handleCRLF(c); nextLine(); - return skipChars(isWhitespace); + return skipChars(isBlankSpace); } else if (c == '\\') { // Unconditionally skip the next char, including line continuations c = bumpChar(); @@ -2323,7 +2333,7 @@ Capture lexer_CaptureRept() { nextLine(); // We're at line start, so attempt to match a `REPT`, `FOR`, or `ENDR` token - if (int c = skipChars(isWhitespace); startsIdentifier(c)) { + if (int c = skipChars(isBlankSpace); startsIdentifier(c)) { shiftChar(); switch (readIdentifier(c, false).type) { case T_(POP_REPT): @@ -2365,7 +2375,7 @@ Capture lexer_CaptureMacro() { nextLine(); // We're at line start, so attempt to match an `ENDM` token - if (int c = skipChars(isWhitespace); startsIdentifier(c)) { + if (int c = skipChars(isBlankSpace); startsIdentifier(c)) { shiftChar(); if (readIdentifier(c, false).type == T_(POP_ENDM)) { endCapture(capture); diff --git a/src/asm/main.cpp b/src/asm/main.cpp index 213ed5a1..a358e68c 100644 --- a/src/asm/main.cpp +++ b/src/asm/main.cpp @@ -250,7 +250,7 @@ static std::vector parseStateFeatures(char *str) { if (next) { *next++ = '\0'; } - // Trim whitespace from the beginning of `feature`... + // Trim blank spaces from the beginning of `feature`... feature += strspn(feature, " \t"); // ...and from the end if (char *end = strpbrk(feature, " \t"); end) { diff --git a/src/asm/opt.cpp b/src/asm/opt.cpp index 40015407..5201be3d 100644 --- a/src/asm/opt.cpp +++ b/src/asm/opt.cpp @@ -1,6 +1,5 @@ // SPDX-License-Identifier: MIT -#include #include #include #include @@ -8,6 +7,7 @@ #include #include "helpers.hpp" // assume +#include "util.hpp" // isBlankSpace #include "asm/fixpoint.hpp" #include "asm/fstack.hpp" @@ -116,8 +116,8 @@ void opt_Parse(char const *s) { case 'r': { ++s; // Skip 'r' - while (isblank(*s)) { - ++s; // Skip leading whitespace + while (isBlankSpace(*s)) { + ++s; // Skip leading blank spaces } if (s[0] == '\0') { diff --git a/src/diagnostics.cpp b/src/diagnostics.cpp index 5b15e8f3..bbdfcefe 100644 --- a/src/diagnostics.cpp +++ b/src/diagnostics.cpp @@ -55,7 +55,7 @@ std::pair> getInitialWarningState(std::str } // Is the rest of the string a decimal number? - // We want to avoid `strtoul`'s whitespace and sign, so we parse manually + // We want to avoid `strtoul`'s whitespace and sign handling, so we parse manually char const *ptr = flag.c_str() + equals + 1; uint32_t param = 0; bool overflowed = false; diff --git a/src/fix/mbc.cpp b/src/fix/mbc.cpp index 10a4ccf8..f1f6ad91 100644 --- a/src/fix/mbc.cpp +++ b/src/fix/mbc.cpp @@ -8,6 +8,7 @@ #include "helpers.hpp" // unreachable_ #include "platform.hpp" // strcasecmp +#include "util.hpp" // isBlankSpace #include "fix/warning.hpp" @@ -93,14 +94,14 @@ bool mbc_HasRAM(MbcType type) { return search != mbcData.end() && search->second.second; } -static void skipWhitespace(char const *&ptr) { - while (*ptr == ' ' || *ptr == '\t') { +static void skipBlankSpace(char const *&ptr) { + while (isBlankSpace(*ptr)) { ++ptr; } } static void skipMBCSpace(char const *&ptr) { - while (*ptr == ' ' || *ptr == '\t' || *ptr == '_') { + while (isBlankSpace(*ptr) || *ptr == '_') { ++ptr; } } @@ -166,7 +167,7 @@ MbcType mbc_ParseName(char const *name, uint8_t &tpp1Major, uint8_t &tpp1Minor) uint16_t mbc; char const *ptr = name; - skipWhitespace(ptr); // Trim off leading whitespace + skipBlankSpace(ptr); // Trim off leading blank space #define tryReadSlice(expected) \ do { \ @@ -318,7 +319,7 @@ MbcType mbc_ParseName(char const *name, uint8_t &tpp1Major, uint8_t &tpp1Minor) // clang-format on for (;;) { - skipWhitespace(ptr); // Trim off trailing whitespace + skipBlankSpace(ptr); // Trim off trailing blank space // If done, start processing "features" if (!*ptr) { @@ -501,9 +502,9 @@ MbcType mbc_ParseName(char const *name, uint8_t &tpp1Major, uint8_t &tpp1Minor) break; } - skipWhitespace(ptr); // Trim off trailing whitespace + skipBlankSpace(ptr); // Trim off trailing blank space - // If there is still something past the whitespace, error out + // If there is still something left, error out if (*ptr) { fatalUnknownMBC(fullName); } diff --git a/src/gfx/main.cpp b/src/gfx/main.cpp index 4795a4b2..42c9406c 100644 --- a/src/gfx/main.cpp +++ b/src/gfx/main.cpp @@ -186,7 +186,7 @@ static uint16_t parseNumber(char *&string, char const *errPrefix, uint16_t errVa return number; } -static void skipWhitespace(char *&arg) { +static void skipBlankSpace(char *&arg) { arg += strspn(arg, " \t"); } @@ -214,8 +214,8 @@ static std::vector readAtFile(std::string const &path, std::vector for (std::vector argvOfs;;) { int c = file->sbumpc(); - // First, discard any leading whitespace - while (isWhitespace(c)) { + // First, discard any leading blank space + while (isBlankSpace(c)) { c = file->sbumpc(); } @@ -239,13 +239,13 @@ static std::vector readAtFile(std::string const &path, std::vector // Read one argument (until the next whitespace char). // We know there is one because we already have its first character in `c`. - for (; c != EOF && !isNewline(c) && !isWhitespace(c); c = file->sbumpc()) { + for (; c != EOF && !isWhitespace(c); c = file->sbumpc()) { argPool.push_back(c); } argPool.push_back('\0'); - // Discard whitespace until the next argument (candidate) - while (isWhitespace(c)) { + // Discard blank space until the next argument (candidate) + while (isBlankSpace(c)) { c = file->sbumpc(); } } while (c != EOF && !isNewline(c)); // End if we reached EOL @@ -285,7 +285,7 @@ static char *parseArgv(int argc, char *argv[]) { options.baseTileIDs[1] = 0; break; } - skipWhitespace(arg); + skipBlankSpace(arg); if (*arg != ',') { error( "Base tile IDs must be one or two comma-separated numbers, not \"%s\"", @@ -294,7 +294,7 @@ static char *parseArgv(int argc, char *argv[]) { break; } ++arg; // Skip comma - skipWhitespace(arg); + skipBlankSpace(arg); number = parseNumber(arg, "Bank 1 base tile ID", 0); if (number >= 256) { error("Bank 1 base tile ID must be below 256"); @@ -356,23 +356,23 @@ static char *parseArgv(int argc, char *argv[]) { error("Input slice left coordinate is out of range!"); break; } - skipWhitespace(arg); + skipBlankSpace(arg); if (*arg != ',') { error("Missing comma after left coordinate in \"%s\"", musl_optarg); break; } ++arg; - skipWhitespace(arg); + skipBlankSpace(arg); options.inputSlice.top = parseNumber(arg, "Input slice upper coordinate"); - skipWhitespace(arg); + skipBlankSpace(arg); if (*arg != ':') { error("Missing colon after upper coordinate in \"%s\"", musl_optarg); break; } ++arg; - skipWhitespace(arg); + skipBlankSpace(arg); options.inputSlice.width = parseNumber(arg, "Input slice width"); - skipWhitespace(arg); + skipBlankSpace(arg); if (options.inputSlice.width == 0) { error("Input slice width may not be 0!"); } @@ -381,7 +381,7 @@ static char *parseArgv(int argc, char *argv[]) { break; } ++arg; - skipWhitespace(arg); + skipBlankSpace(arg); options.inputSlice.height = parseNumber(arg, "Input slice height"); if (options.inputSlice.height == 0) { error("Input slice height may not be 0!"); @@ -416,7 +416,7 @@ static char *parseArgv(int argc, char *argv[]) { options.maxNbTiles[1] = 0; break; } - skipWhitespace(arg); + skipBlankSpace(arg); if (*arg != ',') { error( "Bank capacity must be one or two comma-separated numbers, not \"%s\"", @@ -425,7 +425,7 @@ static char *parseArgv(int argc, char *argv[]) { break; } ++arg; // Skip comma - skipWhitespace(arg); + skipBlankSpace(arg); options.maxNbTiles[1] = parseNumber(arg, "Number of tiles in bank 1", 256); if (options.maxNbTiles[1] > 256) { error("Bank 1 cannot contain more than 256 tiles"); diff --git a/src/gfx/pal_spec.cpp b/src/gfx/pal_spec.cpp index a387293a..98b6963c 100644 --- a/src/gfx/pal_spec.cpp +++ b/src/gfx/pal_spec.cpp @@ -31,7 +31,7 @@ using namespace std::string_view_literals; static char const *hexDigits = "0123456789ABCDEFabcdef"; template // Should be std::string or std::string_view -static void skipWhitespace(Str const &str, size_t &pos) { +static void skipBlankSpace(Str const &str, size_t &pos) { pos = std::min(str.find_first_not_of(" \t"sv, pos), str.length()); } @@ -120,8 +120,8 @@ void parseInlinePalSpec(char const * const rawArg) { n = pos; } - // Skip whitespace, if any - skipWhitespace(arg, n); + // Skip trailing space, if any + skipBlankSpace(arg, n); // Skip comma/semicolon, or end if (n == arg.length()) { @@ -134,7 +134,7 @@ void parseInlinePalSpec(char const * const rawArg) { ++nbColors; // A trailing comma may be followed by a semicolon - skipWhitespace(arg, n); + skipBlankSpace(arg, n); if (n == arg.length()) { break; } else if (arg[n] != ';' && arg[n] != ':') { @@ -149,7 +149,7 @@ void parseInlinePalSpec(char const * const rawArg) { case ':': case ';': ++n; - skipWhitespace(arg, n); + skipBlankSpace(arg, n); nbColors = 0; // Start a new palette // Avoid creating a spurious empty palette @@ -253,7 +253,7 @@ static std::optional parseColor(std::string const &str, size_t &n, uint16_ error("Failed to parse color #%d (\"%s\"): invalid red component", i + 1, str.c_str()); return std::nullopt; } - skipWhitespace(str, n); + skipBlankSpace(str, n); if (n == str.length()) { error("Failed to parse color #%d (\"%s\"): missing green component", i + 1, str.c_str()); return std::nullopt; @@ -263,7 +263,7 @@ static std::optional parseColor(std::string const &str, size_t &n, uint16_ error("Failed to parse color #%d (\"%s\"): invalid green component", i + 1, str.c_str()); return std::nullopt; } - skipWhitespace(str, n); + skipBlankSpace(str, n); if (n == str.length()) { error("Failed to parse color #%d (\"%s\"): missing blue component", i + 1, str.c_str()); return std::nullopt; @@ -356,7 +356,7 @@ static void parseGPLFile(char const *filename, std::filebuf &file) { } size_t n = 0; - skipWhitespace(line, n); + skipBlankSpace(line, n); // Skip empty lines, or lines that contain just a comment. if (line.length() == n || line[n] == '#') { continue; diff --git a/src/link/lexer.cpp b/src/link/lexer.cpp index 68e6c810..22bd2a58 100644 --- a/src/link/lexer.cpp +++ b/src/link/lexer.cpp @@ -260,8 +260,8 @@ yy::parser::symbol_type yylex() { LexerStackEntry &context = lexerStack.back(); int c = context.file.sbumpc(); - // First, skip leading whitespace. - while (isWhitespace(c)) { + // First, skip leading blank space. + while (isBlankSpace(c)) { c = context.file.sbumpc(); } // Then, skip a comment if applicable. diff --git a/src/link/main.cpp b/src/link/main.cpp index d02496e5..89a79b2f 100644 --- a/src/link/main.cpp +++ b/src/link/main.cpp @@ -182,6 +182,10 @@ static void verboseOutputConfig(int argc, char *argv[]) { } // LCOV_EXCL_STOP +static size_t skipBlankSpace(char const *str) { + return strspn(str, " \t"); +} + static void parseScrambleSpec(char *spec) { // clang-format off: vertically align nested initializers static UpperMap> scrambleSpecs{ @@ -191,19 +195,19 @@ static void parseScrambleSpec(char *spec) { }; // clang-format on - // Skip leading whitespace before the regions. - spec += strspn(spec, " \t"); + // Skip leading blank space before the regions. + spec += skipBlankSpace(spec); // The argument to `-S` should be a comma-separated list of regions, allowing a trailing comma. // Each region name is optionally followed by an '=' and a region size. while (*spec) { char *regionName = spec; - // The region name continues (skipping any whitespace) until a ',' (next region), + // The region name continues (skipping any blank space) until a ',' (next region), // '=' (region size), or the end of the string. size_t regionNameLen = strcspn(regionName, "=, \t"); - // Skip trailing whitespace after the region name. - size_t regionNameSkipLen = regionNameLen + strspn(regionName + regionNameLen, " \t"); + // Skip trailing blank space after the region name. + size_t regionNameSkipLen = regionNameLen + skipBlankSpace(regionName + regionNameLen); spec = regionName + regionNameSkipLen; if (*spec != '=' && *spec != ',' && *spec != '\0') { @@ -215,14 +219,14 @@ static void parseScrambleSpec(char *spec) { // The '=' region size limit is optional. if (*spec == '=') { regionSize = spec + 1; // Skip the '=' - // Skip leading whitespace before the region size. - regionSize += strspn(regionSize, " \t"); + // Skip leading blank space before the region size. + regionSize += skipBlankSpace(regionSize); - // The region size continues (skipping any whitespace) until a ',' (next region) + // The region size continues (skipping any blank space) until a ',' (next region) // or the end of the string. regionSizeLen = strcspn(regionSize, ", \t"); - // Skip trailing whitespace after the region size. - size_t regionSizeSkipLen = regionSizeLen + strspn(regionSize + regionSizeLen, " \t"); + // Skip trailing blank space after the region size. + size_t regionSizeSkipLen = regionSizeLen + skipBlankSpace(regionSize + regionSizeLen); spec = regionSize + regionSizeSkipLen; if (*spec != ',' && *spec != '\0') { @@ -234,9 +238,9 @@ static void parseScrambleSpec(char *spec) { if (*spec == ',') { ++spec; } - // Skip trailing whitespace after the region. + // Skip trailing blank space after the region. // `spec` will be the next region name, or the end of the string. - spec += strspn(spec, " \t"); + spec += skipBlankSpace(spec); // Terminate the `regionName` and `regionSize` strings. regionName[regionNameLen] = '\0'; @@ -245,8 +249,8 @@ static void parseScrambleSpec(char *spec) { } // Check for an empty region name or limit. - // Note that by skipping leading whitespace before the loop, and skipping a trailing comma - // and whitespace before the next iteration, we guarantee that the region name will not be + // Note that by skipping leading blank space before the loop, and skipping a trailing comma + // and blank space before the next iteration, we guarantee that the region name will not be // empty if it is present at all. if (*regionName == '\0') { fatal("Empty region name in spec for option '-S'"); diff --git a/src/util.cpp b/src/util.cpp index 3618445d..f6049529 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -5,12 +5,16 @@ #include #include -bool isWhitespace(int c) { +bool isNewline(int c) { + return c == '\r' || c == '\n'; +} + +bool isBlankSpace(int c) { return c == ' ' || c == '\t'; } -bool isNewline(int c) { - return c == '\r' || c == '\n'; +bool isWhitespace(int c) { + return isBlankSpace(c) || isNewline(c); } bool isPrintable(int c) { diff --git a/test/asm/garbage_sequence.asm b/test/asm/garbage_sequence.asm index c88ecd4a..69cda417 100644 --- a/test/asm/garbage_sequence.asm +++ b/test/asm/garbage_sequence.asm @@ -1,3 +1,5 @@ assert 1 +# 1 == 2 assert 2 ?ÿ* 2 == 4 assert 3 **?ÿ?##?? 3 == 27 +charmap "x", 4 +assert 4 <<#ÿ'x' == 64 diff --git a/test/asm/garbage_sequence.err b/test/asm/garbage_sequence.err index 68deb0ba..b3588648 100644 --- a/test/asm/garbage_sequence.err +++ b/test/asm/garbage_sequence.err @@ -8,4 +8,6 @@ error: Invalid character '#' at garbage_sequence.asm(3) error: Invalid characters '#', '?', '?' at garbage_sequence.asm(3) -Assembly aborted with 5 errors! +error: Invalid characters '#', 0xFF (is the file UTF-8?) + at garbage_sequence.asm(5) +Assembly aborted with 6 errors!