Fix CHARLEN and CHARSUB on invalid UTF-8 (#1630)

This commit is contained in:
Rangi
2025-01-28 02:01:18 -05:00
committed by GitHub
parent d54619a453
commit 44caffe04a
4 changed files with 45 additions and 16 deletions

View File

@@ -227,24 +227,22 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output
} else if (inputIdx < input.length()) { // No match found, but there is some input left } else if (inputIdx < input.length()) { // No match found, but there is some input left
size_t codepointLen = 0; size_t codepointLen = 0;
// This will write the codepoint's value to `output`, little-endian // This will write the codepoint's value to `output`, little-endian
for (uint32_t state = 0, codepoint = 0;;) { for (uint32_t state = 0, codepoint = 0; inputIdx + codepointLen < input.length();) {
if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == 1) { if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == 1) {
codepointLen = 0; error("Input string is not valid UTF-8\n");
codepointLen = 1;
break; break;
} }
if (output) {
output->push_back(input[inputIdx + codepointLen]);
}
codepointLen++; codepointLen++;
if (state == 0) { if (state == 0) {
break; break;
} }
} }
if (codepointLen == 0) { if (output) {
error("Input string is not valid UTF-8\n"); output->insert(
output->end(), input.data() + inputIdx, input.data() + inputIdx + codepointLen
);
} }
// Warn if this character is not mapped but any others are // Warn if this character is not mapped but any others are

View File

@@ -29,9 +29,13 @@ println "\"{mid2}{mid1}\""
; 4: invalid byte 0x81 ; 4: invalid byte 0x81
; 5: invalid byte 0xFF ; 5: invalid byte 0xFF
; 6: U+0020 space ; 6: U+0020 space
; 7: U+0042 B ; 7: U+6F22 kanji (0xE6 0xBC 0xA2)
REDEF invalid EQUS "A <20><><EFBFBD> B" REDEF invalid EQUS "A <20><><EFBFBD> "
DEF n = strlen("{invalid}") DEF n = STRLEN("{invalid}")
DEF r = charlen("{invalid}") DEF r = CHARLEN("{invalid}")
println "\"{#s:invalid}\": {d:n} != {d:r}" println "\"{#s:invalid}\": {d:n} == {d:r}"
REDEF mid1 EQUS CHARSUB("{invalid}", 4)
REDEF mid2 EQUS CHARSUB("{invalid}", 7)
println "\"{mid2}{mid1}\""

View File

@@ -50,4 +50,30 @@ error: invalid-utf-8-strings.asm(35):
STRLEN: Invalid UTF-8 byte 0xFF STRLEN: Invalid UTF-8 byte 0xFF
error: invalid-utf-8-strings.asm(36): error: invalid-utf-8-strings.asm(36):
Input string is not valid UTF-8 Input string is not valid UTF-8
error: Assembly aborted (26 errors)! error: invalid-utf-8-strings.asm(36):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(36):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(39):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(39):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(39):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(39):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(39):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8
error: Assembly aborted (39 errors)!

View File

@@ -1,3 +1,4 @@
"aäb漢,a<><61>b<EFBFBD><62><EFBFBD>!" == "aäb漢,a<><61>b<EFBFBD><62><EFBFBD>!" (12) "aäb漢,a<><61>b<EFBFBD><62><EFBFBD>!" == "aäb漢,a<><61>b<EFBFBD><62><EFBFBD>!" (12)
"b,a" "b,a"
"A <20><><EFBFBD> B": 7 != 2 "A <20><><EFBFBD> ": 7 == 7
"漢<>"