From 4d21588eb27c19a2c69145ab343fc1217cd2f21d Mon Sep 17 00:00:00 2001 From: Rangi Date: Tue, 20 Apr 2021 12:24:01 -0400 Subject: [PATCH] Make invalid UTF-8 characters in strings non-fatal STRLEN and STRSUB report the erroneous bytes Fixes #848 --- src/asm/parser.y | 31 +++++++++++++------- test/asm/invalid-utf-8-strings.asm | 23 +++++++++++++++ test/asm/invalid-utf-8-strings.err | 45 ++++++++++++++++++++++++++++++ test/asm/invalid-utf-8-strings.out | 2 ++ test/asm/invalid-utf-8.asm | 2 +- 5 files changed, 92 insertions(+), 11 deletions(-) create mode 100644 test/asm/invalid-utf-8-strings.asm create mode 100644 test/asm/invalid-utf-8-strings.err create mode 100644 test/asm/invalid-utf-8-strings.out diff --git a/src/asm/parser.y b/src/asm/parser.y index b96bce48..81de67ce 100644 --- a/src/asm/parser.y +++ b/src/asm/parser.y @@ -82,16 +82,24 @@ static char *strrstr(char *s1, char *s2) return NULL; } +static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName) +{ + error("%s: Invalid UTF-8 byte 0x%02hhX\n", functionName, byte); +} + static size_t strlenUTF8(char const *s) { size_t len = 0; uint32_t state = 0; for (uint32_t codep = 0; *s; s++) { - switch (decode(&state, &codep, *s)) { + uint8_t byte = *s; + + switch (decode(&state, &codep, byte)) { case 1: - fatalerror("STRLEN: Invalid UTF-8 character\n"); - break; + errorInvalidUTF8Byte(byte, "STRLEN"); + state = 0; + /* fallthrough */ case 0: len++; break; @@ -100,7 +108,7 @@ static size_t strlenUTF8(char const *s) /* Check for partial code point. */ if (state != 0) - fatalerror("STRLEN: Invalid UTF-8 character\n"); + error("STRLEN: Incomplete UTF-8 character\n"); return len; } @@ -116,14 +124,16 @@ static void strsubUTF8(char *dest, size_t destLen, char const *src, uint32_t pos /* Advance to starting position in source string. 
*/ while (src[srcIndex] && curPos < pos) { - switch (decode(&state, &codep, src[srcIndex++])) { + switch (decode(&state, &codep, src[srcIndex])) { case 1: - fatalerror("STRSUB: Invalid UTF-8 character\n"); - break; + errorInvalidUTF8Byte(src[srcIndex], "STRSUB"); + state = 0; + /* fallthrough */ case 0: curPos++; break; } + srcIndex++; } /* @@ -138,8 +148,9 @@ static void strsubUTF8(char *dest, size_t destLen, char const *src, uint32_t pos while (src[srcIndex] && destIndex < destLen - 1 && curLen < len) { switch (decode(&state, &codep, src[srcIndex])) { case 1: - fatalerror("STRSUB: Invalid UTF-8 character\n"); - break; + errorInvalidUTF8Byte(src[srcIndex], "STRSUB"); + state = 0; + /* fallthrough */ case 0: curLen++; break; @@ -152,7 +163,7 @@ static void strsubUTF8(char *dest, size_t destLen, char const *src, uint32_t pos /* Check for partial code point. */ if (state != 0) - fatalerror("STRSUB: Invalid UTF-8 character\n"); + error("STRSUB: Incomplete UTF-8 character\n"); dest[destIndex] = '\0'; } diff --git a/test/asm/invalid-utf-8-strings.asm b/test/asm/invalid-utf-8-strings.asm new file mode 100644 index 00000000..8f6afa52 --- /dev/null +++ b/test/asm/invalid-utf-8-strings.asm @@ -0,0 +1,23 @@ +; characters: +; 1: U+0061 a +; 2: U+00E4 a with diaeresis (0xC3 0xA4) +; 3: U+0062 b +; 4: U+6F22 kanji (0xE6 0xBC 0xA2) +; 5: U+002C , +; 6: U+0061 a +; 7: invalid byte 0xA3 +; 8: invalid byte 0xA4 +; 9: U+0062 b +; 10: invalid bytes 0xE6 0xF0 +; 11: invalid byte 0xA2 +; 12: U+0021 ! +invalid EQUS "aäbæ¼¢,a£¤bæð¢!" 
+ +n = STRLEN("{invalid}") +copy EQUS STRSUB("{invalid}", 1) + +println "\"{invalid}\" == \"{copy}\" ({d:n})" + +mid1 EQUS STRSUB("{invalid}", 5, 2) +mid2 EQUS STRSUB("{invalid}", 9, 1) +println "\"{mid2}{mid1}\"" diff --git a/test/asm/invalid-utf-8-strings.err b/test/asm/invalid-utf-8-strings.err new file mode 100644 index 00000000..46f6e8e7 --- /dev/null +++ b/test/asm/invalid-utf-8-strings.err @@ -0,0 +1,45 @@ +ERROR: invalid-utf-8-strings.asm(16): + STRLEN: Invalid UTF-8 byte 0xA3 +ERROR: invalid-utf-8-strings.asm(16): + STRLEN: Invalid UTF-8 byte 0xA4 +ERROR: invalid-utf-8-strings.asm(16): + STRLEN: Invalid UTF-8 byte 0xF0 +ERROR: invalid-utf-8-strings.asm(16): + STRLEN: Invalid UTF-8 byte 0xA2 +ERROR: invalid-utf-8-strings.asm(17): + STRLEN: Invalid UTF-8 byte 0xA3 +ERROR: invalid-utf-8-strings.asm(17): + STRLEN: Invalid UTF-8 byte 0xA4 +ERROR: invalid-utf-8-strings.asm(17): + STRLEN: Invalid UTF-8 byte 0xF0 +ERROR: invalid-utf-8-strings.asm(17): + STRLEN: Invalid UTF-8 byte 0xA2 +ERROR: invalid-utf-8-strings.asm(17): + STRSUB: Invalid UTF-8 byte 0xA3 +ERROR: invalid-utf-8-strings.asm(17): + STRSUB: Invalid UTF-8 byte 0xA4 +ERROR: invalid-utf-8-strings.asm(17): + STRSUB: Invalid UTF-8 byte 0xF0 +ERROR: invalid-utf-8-strings.asm(17): + STRSUB: Invalid UTF-8 byte 0xA2 +ERROR: invalid-utf-8-strings.asm(21): + STRLEN: Invalid UTF-8 byte 0xA3 +ERROR: invalid-utf-8-strings.asm(21): + STRLEN: Invalid UTF-8 byte 0xA4 +ERROR: invalid-utf-8-strings.asm(21): + STRLEN: Invalid UTF-8 byte 0xF0 +ERROR: invalid-utf-8-strings.asm(21): + STRLEN: Invalid UTF-8 byte 0xA2 +ERROR: invalid-utf-8-strings.asm(22): + STRLEN: Invalid UTF-8 byte 0xA3 +ERROR: invalid-utf-8-strings.asm(22): + STRLEN: Invalid UTF-8 byte 0xA4 +ERROR: invalid-utf-8-strings.asm(22): + STRLEN: Invalid UTF-8 byte 0xF0 +ERROR: invalid-utf-8-strings.asm(22): + STRLEN: Invalid UTF-8 byte 0xA2 +ERROR: invalid-utf-8-strings.asm(22): + STRSUB: Invalid UTF-8 byte 0xA3 +ERROR: invalid-utf-8-strings.asm(22): + STRSUB: 
Invalid UTF-8 byte 0xA4 +error: Assembly aborted (22 errors)! diff --git a/test/asm/invalid-utf-8-strings.out b/test/asm/invalid-utf-8-strings.out new file mode 100644 index 00000000..21af52fb --- /dev/null +++ b/test/asm/invalid-utf-8-strings.out @@ -0,0 +1,2 @@ +"aäbæ¼¢,a£¤bæð¢!" == "aäbæ¼¢,a£¤bæð¢!" (12) +"b,a" diff --git a/test/asm/invalid-utf-8.asm b/test/asm/invalid-utf-8.asm index f27de3c6..18957729 100644 --- a/test/asm/invalid-utf-8.asm +++ b/test/asm/invalid-utf-8.asm @@ -1,5 +1,5 @@ ; This test tries to pass invalid UTF-8 through a macro argument -; to exercise the lexer's reportGarbageChar +; to exercise the lexer's unknown character reporting m:MACRO \1 ENDM