From 44caffe04adf028b21703de1427832b5d94d4af2 Mon Sep 17 00:00:00 2001 From: Rangi <35663410+Rangi42@users.noreply.github.com> Date: Tue, 28 Jan 2025 02:01:18 -0500 Subject: [PATCH] Fix `CHARLEN` and `CHARSUB` on invalid UTF-8 (#1630) --- src/asm/charmap.cpp | 16 +++++++--------- test/asm/invalid-utf-8-strings.asm | 14 +++++++++----- test/asm/invalid-utf-8-strings.err | 28 +++++++++++++++++++++++++++- test/asm/invalid-utf-8-strings.out | 3 ++- 4 files changed, 45 insertions(+), 16 deletions(-) diff --git a/src/asm/charmap.cpp b/src/asm/charmap.cpp index b7eadbca..3b758302 100644 --- a/src/asm/charmap.cpp +++ b/src/asm/charmap.cpp @@ -227,24 +227,22 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector *output } else if (inputIdx < input.length()) { // No match found, but there is some input left size_t codepointLen = 0; // This will write the codepoint's value to `output`, little-endian - for (uint32_t state = 0, codepoint = 0;;) { + for (uint32_t state = 0, codepoint = 0; inputIdx + codepointLen < input.length();) { if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == 1) { - codepointLen = 0; + error("Input string is not valid UTF-8\n"); + codepointLen = 1; break; } - - if (output) { - output->push_back(input[inputIdx + codepointLen]); - } codepointLen++; - if (state == 0) { break; } } - if (codepointLen == 0) { - error("Input string is not valid UTF-8\n"); + if (output) { + output->insert( + output->end(), input.data() + inputIdx, input.data() + inputIdx + codepointLen + ); } // Warn if this character is not mapped but any others are diff --git a/test/asm/invalid-utf-8-strings.asm b/test/asm/invalid-utf-8-strings.asm index f105e6e3..f8cc392f 100644 --- a/test/asm/invalid-utf-8-strings.asm +++ b/test/asm/invalid-utf-8-strings.asm @@ -29,9 +29,13 @@ println "\"{mid2}{mid1}\"" ; 4: invalid byte 0x81 ; 5: invalid byte 0xFF ; 6: U+0020 space -; 7: U+0042 B -REDEF invalid EQUS "A B" +; 7: U+6F22 kanji (0xE6 0xBC 0xA2) +REDEF invalid EQUS "A 漢" -DEF n = strlen("{invalid}") -DEF r = charlen("{invalid}") -println "\"{#s:invalid}\": {d:n} != {d:r}" +DEF n = STRLEN("{invalid}") +DEF r = CHARLEN("{invalid}") +println "\"{#s:invalid}\": {d:n} == {d:r}" + +REDEF mid1 EQUS CHARSUB("{invalid}", 4) +REDEF mid2 EQUS CHARSUB("{invalid}", 7) +println "\"{mid2}{mid1}\"" diff --git a/test/asm/invalid-utf-8-strings.err b/test/asm/invalid-utf-8-strings.err index dfcb1a29..529c7a9e 100644 --- a/test/asm/invalid-utf-8-strings.err +++ b/test/asm/invalid-utf-8-strings.err @@ -50,4 +50,30 @@ error: invalid-utf-8-strings.asm(35): STRLEN: Invalid UTF-8 byte 0xFF error: invalid-utf-8-strings.asm(36): Input string is not valid UTF-8 -error: Assembly aborted (26 errors)! +error: invalid-utf-8-strings.asm(36): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(36): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(39): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(39): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(39): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(39): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(39): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(40): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(40): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(40): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(40): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(40): + Input string is not valid UTF-8 +error: invalid-utf-8-strings.asm(40): + Input string is not valid UTF-8 +error: Assembly aborted (39 errors)! diff --git a/test/asm/invalid-utf-8-strings.out b/test/asm/invalid-utf-8-strings.out index e24c370d..2d9147a9 100644 --- a/test/asm/invalid-utf-8-strings.out +++ b/test/asm/invalid-utf-8-strings.out @@ -1,3 +1,4 @@ "aäb漢,ab!" == "aäb漢,ab!" (12) "b,a" -"A B": 7 != 2 +"A 漢": 7 == 7 +"漢"