Fix STRLEN and STRSUB on incomplete UTF-8 (#1633)

This commit is contained in:
Rangi
2025-01-28 13:13:35 -05:00
committed by GitHub
parent 44caffe04a
commit 375adc6804
4 changed files with 41 additions and 39 deletions

View File

@@ -72,7 +72,7 @@
static uint32_t strToNum(std::vector<int32_t> const &s); static uint32_t strToNum(std::vector<int32_t> const &s);
static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName); static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName);
static size_t strlenUTF8(std::string const &str); static size_t strlenUTF8(std::string const &str, bool printErrors);
static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len); static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len);
static size_t charlenUTF8(std::string const &str); static size_t charlenUTF8(std::string const &str);
static std::string charsubUTF8(std::string const &str, uint32_t pos); static std::string charsubUTF8(std::string const &str, uint32_t pos);
@@ -1517,7 +1517,7 @@ relocexpr_no_str:
$$.makeNumber(pos != std::string::npos ? pos + 1 : 0); $$.makeNumber(pos != std::string::npos ? pos + 1 : 0);
} }
| OP_STRLEN LPAREN string RPAREN { | OP_STRLEN LPAREN string RPAREN {
$$.makeNumber(strlenUTF8($3)); $$.makeNumber(strlenUTF8($3, true));
} }
| OP_CHARLEN LPAREN string RPAREN { | OP_CHARLEN LPAREN string RPAREN {
$$.makeNumber(charlenUTF8($3)); $$.makeNumber(charlenUTF8($3));
@@ -1569,13 +1569,13 @@ string:
$$ = std::move($1); $$ = std::move($1);
} }
| OP_STRSUB LPAREN string COMMA iconst COMMA uconst RPAREN { | OP_STRSUB LPAREN string COMMA iconst COMMA uconst RPAREN {
size_t len = strlenUTF8($3); size_t len = strlenUTF8($3, false);
uint32_t pos = adjustNegativePos($5, len, "STRSUB"); uint32_t pos = adjustNegativePos($5, len, "STRSUB");
$$ = strsubUTF8($3, pos, $7); $$ = strsubUTF8($3, pos, $7);
} }
| OP_STRSUB LPAREN string COMMA iconst RPAREN { | OP_STRSUB LPAREN string COMMA iconst RPAREN {
size_t len = strlenUTF8($3); size_t len = strlenUTF8($3, false);
uint32_t pos = adjustNegativePos($5, len, "STRSUB"); uint32_t pos = adjustNegativePos($5, len, "STRSUB");
$$ = strsubUTF8($3, pos, pos > len ? 0 : len + 1 - pos); $$ = strsubUTF8($3, pos, pos > len ? 0 : len + 1 - pos);
@@ -2522,7 +2522,7 @@ static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName) {
error("%s: Invalid UTF-8 byte 0x%02hhX\n", functionName, byte); error("%s: Invalid UTF-8 byte 0x%02hhX\n", functionName, byte);
} }
static size_t strlenUTF8(std::string const &str) { static size_t strlenUTF8(std::string const &str, bool printErrors) {
char const *ptr = str.c_str(); char const *ptr = str.c_str();
size_t len = 0; size_t len = 0;
uint32_t state = 0; uint32_t state = 0;
@@ -2532,7 +2532,9 @@ static size_t strlenUTF8(std::string const &str) {
switch (decode(&state, &codepoint, byte)) { switch (decode(&state, &codepoint, byte)) {
case 1: case 1:
errorInvalidUTF8Byte(byte, "STRLEN"); if (printErrors) {
errorInvalidUTF8Byte(byte, "STRLEN");
}
state = 0; state = 0;
// fallthrough // fallthrough
case 0: case 0:
@@ -2543,7 +2545,10 @@ static size_t strlenUTF8(std::string const &str) {
// Check for partial code point. // Check for partial code point.
if (state != 0) { if (state != 0) {
error("STRLEN: Incomplete UTF-8 character\n"); if (printErrors) {
error("STRLEN: Incomplete UTF-8 character\n");
}
len++;
} }
return len; return len;
@@ -2595,13 +2600,14 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
index++; index++;
} }
if (curLen < len) {
warning(WARNING_BUILTIN_ARG, "STRSUB: Length too big: %" PRIu32 "\n", len);
}
// Check for partial code point. // Check for partial code point.
if (state != 0) { if (state != 0) {
error("STRSUB: Incomplete UTF-8 character\n"); error("STRSUB: Incomplete UTF-8 character\n");
curLen++;
}
if (curLen < len) {
warning(WARNING_BUILTIN_ARG, "STRSUB: Length too big: %" PRIu32 "\n", len);
} }
return std::string(ptr + startIndex, ptr + index); return std::string(ptr + startIndex, ptr + index);

View File

@@ -16,11 +16,11 @@ DEF invalid EQUS "aäb漢,a
DEF n = STRLEN("{invalid}") DEF n = STRLEN("{invalid}")
DEF copy EQUS STRSUB("{invalid}", 1) DEF copy EQUS STRSUB("{invalid}", 1)
println "\"{invalid}\" == \"{copy}\" ({d:n})" println "\"{#s:invalid}\" == \"{#s:copy}\" ({d:n})"
DEF mid1 EQUS STRSUB("{invalid}", 5, 2) DEF mid1 EQUS STRSUB("{invalid}", 5, 2)
DEF mid2 EQUS STRSUB("{invalid}", 9, 1) DEF mid2 EQUS STRSUB("{invalid}", 9, 1)
println "\"{mid2}{mid1}\"" println "\"{#s:mid2}{#s:mid1}\""
; characters: ; characters:
; 1: U+0041 A ; 1: U+0041 A
@@ -38,4 +38,18 @@ println "\"{#s:invalid}\": {d:n} == {d:r}"
REDEF mid1 EQUS CHARSUB("{invalid}", 4) REDEF mid1 EQUS CHARSUB("{invalid}", 4)
REDEF mid2 EQUS CHARSUB("{invalid}", 7) REDEF mid2 EQUS CHARSUB("{invalid}", 7)
println "\"{mid2}{mid1}\"" println "\"{#s:mid2}{#s:mid1}\""
; characters:
; 1: U+0061 a
; 2: U+0062 b
; 3: U+0063 c
; 4: incomplete U+6F22 kanji (0xE6 0xBC without 0xA2)
REDEF invalid EQUS "abc<62><63>"
DEF n = STRLEN("{invalid}")
DEF r = CHARLEN("{invalid}")
println "\"{#s:invalid}\": {d:n} == {d:r}"
DEF final EQUS STRSUB("{invalid}", 4, 1)
println "\"{#s:invalid}\" ends \"{#s:final}\""

View File

@@ -6,14 +6,6 @@ error: invalid-utf-8-strings.asm(16):
STRLEN: Invalid UTF-8 byte 0xF0 STRLEN: Invalid UTF-8 byte 0xF0
error: invalid-utf-8-strings.asm(16): error: invalid-utf-8-strings.asm(16):
STRLEN: Invalid UTF-8 byte 0xA2 STRLEN: Invalid UTF-8 byte 0xA2
error: invalid-utf-8-strings.asm(17):
STRLEN: Invalid UTF-8 byte 0xA3
error: invalid-utf-8-strings.asm(17):
STRLEN: Invalid UTF-8 byte 0xA4
error: invalid-utf-8-strings.asm(17):
STRLEN: Invalid UTF-8 byte 0xF0
error: invalid-utf-8-strings.asm(17):
STRLEN: Invalid UTF-8 byte 0xA2
error: invalid-utf-8-strings.asm(17): error: invalid-utf-8-strings.asm(17):
STRSUB: Invalid UTF-8 byte 0xA3 STRSUB: Invalid UTF-8 byte 0xA3
error: invalid-utf-8-strings.asm(17): error: invalid-utf-8-strings.asm(17):
@@ -22,22 +14,6 @@ error: invalid-utf-8-strings.asm(17):
STRSUB: Invalid UTF-8 byte 0xF0 STRSUB: Invalid UTF-8 byte 0xF0
error: invalid-utf-8-strings.asm(17): error: invalid-utf-8-strings.asm(17):
STRSUB: Invalid UTF-8 byte 0xA2 STRSUB: Invalid UTF-8 byte 0xA2
error: invalid-utf-8-strings.asm(21):
STRLEN: Invalid UTF-8 byte 0xA3
error: invalid-utf-8-strings.asm(21):
STRLEN: Invalid UTF-8 byte 0xA4
error: invalid-utf-8-strings.asm(21):
STRLEN: Invalid UTF-8 byte 0xF0
error: invalid-utf-8-strings.asm(21):
STRLEN: Invalid UTF-8 byte 0xA2
error: invalid-utf-8-strings.asm(22):
STRLEN: Invalid UTF-8 byte 0xA3
error: invalid-utf-8-strings.asm(22):
STRLEN: Invalid UTF-8 byte 0xA4
error: invalid-utf-8-strings.asm(22):
STRLEN: Invalid UTF-8 byte 0xF0
error: invalid-utf-8-strings.asm(22):
STRLEN: Invalid UTF-8 byte 0xA2
error: invalid-utf-8-strings.asm(22): error: invalid-utf-8-strings.asm(22):
STRSUB: Invalid UTF-8 byte 0xA3 STRSUB: Invalid UTF-8 byte 0xA3
error: invalid-utf-8-strings.asm(22): error: invalid-utf-8-strings.asm(22):
@@ -76,4 +52,8 @@ error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8 Input string is not valid UTF-8
error: invalid-utf-8-strings.asm(40): error: invalid-utf-8-strings.asm(40):
Input string is not valid UTF-8 Input string is not valid UTF-8
error: Assembly aborted (39 errors)! error: invalid-utf-8-strings.asm(50):
STRLEN: Incomplete UTF-8 character
error: invalid-utf-8-strings.asm(54):
STRSUB: Incomplete UTF-8 character
error: Assembly aborted (29 errors)!

View File

@@ -2,3 +2,5 @@
"b,a" "b,a"
"A <20><><EFBFBD> 漢": 7 == 7 "A <20><><EFBFBD> 漢": 7 == 7
"漢<>" "漢<>"
"abc<62><63>": 4 == 4
"abc<62><63>" ends "<22><>"