mirror of
https://github.com/gbdev/rgbds.git
synced 2025-11-20 10:12:06 +00:00
Make invalid UTF-8 characters in strings non-fatal
STRLEN and STRSUB report the erroneous bytes. Fixes #848
This commit is contained in:
@@ -82,16 +82,24 @@ static char *strrstr(char *s1, char *s2)
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName)
|
||||||
|
{
|
||||||
|
error("%s: Invalid UTF-8 byte 0x%02hhX\n", functionName, byte);
|
||||||
|
}
|
||||||
|
|
||||||
static size_t strlenUTF8(char const *s)
|
static size_t strlenUTF8(char const *s)
|
||||||
{
|
{
|
||||||
size_t len = 0;
|
size_t len = 0;
|
||||||
uint32_t state = 0;
|
uint32_t state = 0;
|
||||||
|
|
||||||
for (uint32_t codep = 0; *s; s++) {
|
for (uint32_t codep = 0; *s; s++) {
|
||||||
switch (decode(&state, &codep, *s)) {
|
uint8_t byte = *s;
|
||||||
|
|
||||||
|
switch (decode(&state, &codep, byte)) {
|
||||||
case 1:
|
case 1:
|
||||||
fatalerror("STRLEN: Invalid UTF-8 character\n");
|
errorInvalidUTF8Byte(byte, "STRLEN");
|
||||||
break;
|
state = 0;
|
||||||
|
/* fallthrough */
|
||||||
case 0:
|
case 0:
|
||||||
len++;
|
len++;
|
||||||
break;
|
break;
|
||||||
@@ -100,7 +108,7 @@ static size_t strlenUTF8(char const *s)
|
|||||||
|
|
||||||
/* Check for partial code point. */
|
/* Check for partial code point. */
|
||||||
if (state != 0)
|
if (state != 0)
|
||||||
fatalerror("STRLEN: Invalid UTF-8 character\n");
|
error("STRLEN: Incomplete UTF-8 character\n");
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
@@ -116,14 +124,16 @@ static void strsubUTF8(char *dest, size_t destLen, char const *src, uint32_t pos
|
|||||||
|
|
||||||
/* Advance to starting position in source string. */
|
/* Advance to starting position in source string. */
|
||||||
while (src[srcIndex] && curPos < pos) {
|
while (src[srcIndex] && curPos < pos) {
|
||||||
switch (decode(&state, &codep, src[srcIndex++])) {
|
switch (decode(&state, &codep, src[srcIndex])) {
|
||||||
case 1:
|
case 1:
|
||||||
fatalerror("STRSUB: Invalid UTF-8 character\n");
|
errorInvalidUTF8Byte(src[srcIndex], "STRSUB");
|
||||||
break;
|
state = 0;
|
||||||
|
/* fallthrough */
|
||||||
case 0:
|
case 0:
|
||||||
curPos++;
|
curPos++;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
srcIndex++;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -138,8 +148,9 @@ static void strsubUTF8(char *dest, size_t destLen, char const *src, uint32_t pos
|
|||||||
while (src[srcIndex] && destIndex < destLen - 1 && curLen < len) {
|
while (src[srcIndex] && destIndex < destLen - 1 && curLen < len) {
|
||||||
switch (decode(&state, &codep, src[srcIndex])) {
|
switch (decode(&state, &codep, src[srcIndex])) {
|
||||||
case 1:
|
case 1:
|
||||||
fatalerror("STRSUB: Invalid UTF-8 character\n");
|
errorInvalidUTF8Byte(src[srcIndex], "STRSUB");
|
||||||
break;
|
state = 0;
|
||||||
|
/* fallthrough */
|
||||||
case 0:
|
case 0:
|
||||||
curLen++;
|
curLen++;
|
||||||
break;
|
break;
|
||||||
@@ -152,7 +163,7 @@ static void strsubUTF8(char *dest, size_t destLen, char const *src, uint32_t pos
|
|||||||
|
|
||||||
/* Check for partial code point. */
|
/* Check for partial code point. */
|
||||||
if (state != 0)
|
if (state != 0)
|
||||||
fatalerror("STRSUB: Invalid UTF-8 character\n");
|
error("STRSUB: Incomplete UTF-8 character\n");
|
||||||
|
|
||||||
dest[destIndex] = '\0';
|
dest[destIndex] = '\0';
|
||||||
}
|
}
|
||||||
|
|||||||
23
test/asm/invalid-utf-8-strings.asm
Normal file
23
test/asm/invalid-utf-8-strings.asm
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
; characters:
|
||||||
|
; 1: U+0061 a
|
||||||
|
; 2: U+00E4 a with diaeresis (0xC3 0xA4)
|
||||||
|
; 3: U+0062 b
|
||||||
|
; 4: U+6F22 kanji (0xE6 0xBC 0xA2)
|
||||||
|
; 5: U+002C ,
|
||||||
|
; 6: U+0061 a
|
||||||
|
; 7: invalid byte 0xA3
|
||||||
|
; 8: invalid byte 0xA4
|
||||||
|
; 9: U+0062 b
|
||||||
|
; 10: invalid bytes 0xE6 0xF0
|
||||||
|
; 11: invalid byte 0xA2
|
||||||
|
; 12: U+0021 !
|
||||||
|
invalid EQUS "aäb漢,a<><61>b<EFBFBD><62><EFBFBD>!"
|
||||||
|
|
||||||
|
n = STRLEN("{invalid}")
|
||||||
|
copy EQUS STRSUB("{invalid}", 1)
|
||||||
|
|
||||||
|
println "\"{invalid}\" == \"{copy}\" ({d:n})"
|
||||||
|
|
||||||
|
mid1 EQUS STRSUB("{invalid}", 5, 2)
|
||||||
|
mid2 EQUS STRSUB("{invalid}", 9, 1)
|
||||||
|
println "\"{mid2}{mid1}\""
|
||||||
45
test/asm/invalid-utf-8-strings.err
Normal file
45
test/asm/invalid-utf-8-strings.err
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
ERROR: invalid-utf-8-strings.asm(16):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xA3
|
||||||
|
ERROR: invalid-utf-8-strings.asm(16):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xA4
|
||||||
|
ERROR: invalid-utf-8-strings.asm(16):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xF0
|
||||||
|
ERROR: invalid-utf-8-strings.asm(16):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xA2
|
||||||
|
ERROR: invalid-utf-8-strings.asm(17):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xA3
|
||||||
|
ERROR: invalid-utf-8-strings.asm(17):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xA4
|
||||||
|
ERROR: invalid-utf-8-strings.asm(17):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xF0
|
||||||
|
ERROR: invalid-utf-8-strings.asm(17):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xA2
|
||||||
|
ERROR: invalid-utf-8-strings.asm(17):
|
||||||
|
STRSUB: Invalid UTF-8 byte 0xA3
|
||||||
|
ERROR: invalid-utf-8-strings.asm(17):
|
||||||
|
STRSUB: Invalid UTF-8 byte 0xA4
|
||||||
|
ERROR: invalid-utf-8-strings.asm(17):
|
||||||
|
STRSUB: Invalid UTF-8 byte 0xF0
|
||||||
|
ERROR: invalid-utf-8-strings.asm(17):
|
||||||
|
STRSUB: Invalid UTF-8 byte 0xA2
|
||||||
|
ERROR: invalid-utf-8-strings.asm(21):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xA3
|
||||||
|
ERROR: invalid-utf-8-strings.asm(21):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xA4
|
||||||
|
ERROR: invalid-utf-8-strings.asm(21):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xF0
|
||||||
|
ERROR: invalid-utf-8-strings.asm(21):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xA2
|
||||||
|
ERROR: invalid-utf-8-strings.asm(22):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xA3
|
||||||
|
ERROR: invalid-utf-8-strings.asm(22):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xA4
|
||||||
|
ERROR: invalid-utf-8-strings.asm(22):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xF0
|
||||||
|
ERROR: invalid-utf-8-strings.asm(22):
|
||||||
|
STRLEN: Invalid UTF-8 byte 0xA2
|
||||||
|
ERROR: invalid-utf-8-strings.asm(22):
|
||||||
|
STRSUB: Invalid UTF-8 byte 0xA3
|
||||||
|
ERROR: invalid-utf-8-strings.asm(22):
|
||||||
|
STRSUB: Invalid UTF-8 byte 0xA4
|
||||||
|
error: Assembly aborted (22 errors)!
|
||||||
2
test/asm/invalid-utf-8-strings.out
Normal file
2
test/asm/invalid-utf-8-strings.out
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
"aäb漢,a<><61>b<EFBFBD><62><EFBFBD>!" == "aäb漢,a<><61>b<EFBFBD><62><EFBFBD>!" (12)
|
||||||
|
"b,a"
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
; This test tries to pass invalid UTF-8 through a macro argument
|
; This test tries to pass invalid UTF-8 through a macro argument
|
||||||
; to exercise the lexer's reportGarbageChar
|
; to exercise the lexer's unknown character reporting
|
||||||
m:MACRO
|
m:MACRO
|
||||||
\1
|
\1
|
||||||
ENDM
|
ENDM
|
||||||
|
|||||||
Reference in New Issue
Block a user