From 850c78aaf47a167e64a01aa1dc07bbe558a7c282 Mon Sep 17 00:00:00 2001 From: Rangi Date: Fri, 9 Apr 2021 14:05:43 -0400 Subject: [PATCH] Report garbage chars as their bytes; don't try decoding them as UTF-8 This decoding required high lookahead, and was not even consistently useful (the `garbage_char` test case was not valid UTF-8 and so did not benefit from `reportGarbageChar`). This limits UTF-8 handling to the `STRLEN` and `STRSUB` built-in functions, and to charmap conversion. --- src/asm/lexer.c | 51 +------------------------------------- test/asm/garbage_char.err | 2 +- test/asm/invalid-utf-8.err | 4 +-- test/asm/null-in-macro.err | 2 +- 4 files changed, 5 insertions(+), 54 deletions(-) diff --git a/src/asm/lexer.c b/src/asm/lexer.c index e2e72388..ca29fd5f 100644 --- a/src/asm/lexer.c +++ b/src/asm/lexer.c @@ -1818,55 +1818,6 @@ finish: return i; } -/* Function to report one character's worth of garbage bytes */ - -static char const *reportGarbageChar(unsigned char firstByte) -{ - static char bytes[6 + 2 + 1]; /* Max size of a UTF-8 encoded code point, plus "''\0" */ - /* First, attempt UTF-8 decoding */ - uint32_t state = 0; /* UTF8_ACCEPT */ - uint32_t codepoint; - uint8_t size = 0; /* Number of additional bytes to shift */ - - bytes[1] = firstByte; /* No need to init the rest of the array */ - decode(&state, &codepoint, firstByte); - while (state != 0 && state != 1 /* UTF8_REJECT */) { - int c = peek(size++); - - if (c == EOF) - break; - bytes[size + 1] = c; - decode(&state, &codepoint, c); - } - - if (state == 0 && (codepoint > UCHAR_MAX || isprint((unsigned char)codepoint))) { - /* Character is valid, printable UTF-8! */ - shiftChars(size); - bytes[0] = '\''; - bytes[size + 2] = '\''; - bytes[size + 3] = '\0'; - return bytes; - } - - /* The character isn't valid UTF-8, so we'll only print that first byte */ - if (isprint(firstByte)) { - /* bytes[1] = firstByte; */ - bytes[0] = '\''; - bytes[2] = '\''; - bytes[3] = '\0'; - return bytes; - } - /* Well then, print its hex value */ - static char const hexChars[16] = "0123456789ABCDEF"; - - bytes[0] = '0'; - bytes[1] = 'x'; - bytes[2] = hexChars[firstByte >> 4]; - bytes[3] = hexChars[firstByte & 0x0f]; - bytes[4] = '\0'; - return bytes; -} - /* Lexer core */ static int yylex_SKIP_TO_ENDC(void); // forward declaration for yylex_NORMAL @@ -2118,7 +2069,7 @@ static int yylex_NORMAL(void) /* Do not report weird characters when capturing, it'll be done later */ if (!lexerState->capturing) { /* TODO: try to group reportings */ - error("Unknown character %s\n", reportGarbageChar(c)); + error("Unknown character '%s'\n", print(c)); } } lexerState->atLineStart = false; diff --git a/test/asm/garbage_char.err b/test/asm/garbage_char.err index d7bb40f3..f75a19fa 100644 --- a/test/asm/garbage_char.err +++ b/test/asm/garbage_char.err @@ -1,3 +1,3 @@ ERROR: garbage_char.asm(1): - Unknown character 0xFF + Unknown character '\xff' error: Assembly aborted (1 error)! diff --git a/test/asm/invalid-utf-8.err b/test/asm/invalid-utf-8.err index 25352e6e..9698cd80 100644 --- a/test/asm/invalid-utf-8.err +++ b/test/asm/invalid-utf-8.err @@ -1,5 +1,5 @@ ERROR: invalid-utf-8.asm(6) -> invalid-utf-8.asm::m(4): - Unknown character 0xCF + Unknown character '\xcf' ERROR: invalid-utf-8.asm(6) -> invalid-utf-8.asm::m(4): - Unknown character 0xD3 + Unknown character '\xd3' error: Assembly aborted (2 errors)! diff --git a/test/asm/null-in-macro.err b/test/asm/null-in-macro.err index c9148b8a..b90a7ac1 100644 --- a/test/asm/null-in-macro.err +++ b/test/asm/null-in-macro.err @@ -1,3 +1,3 @@ ERROR: null-in-macro.asm(4) -> null-in-macro.asm::foo(2): - Unknown character 0x00 + Unknown character '\x00' error: Assembly aborted (1 error)!