Make invalid UTF-8 characters in strings non-fatal

STRLEN and STRSUB report the erroneous bytes

Fixes #848
This commit is contained in:
Rangi
2021-04-20 12:24:01 -04:00
committed by Eldred Habert
parent e596dbfc80
commit 4d21588eb2
5 changed files with 92 additions and 11 deletions

View File

@@ -82,16 +82,24 @@ static char *strrstr(char *s1, char *s2)
return NULL; return NULL;
} }
/*
 * Report an invalid UTF-8 byte found while decoding a string.
 *
 * byte:         the offending byte, printed as two uppercase hex digits
 * functionName: name prefixed to the message (callers pass "STRLEN" or "STRSUB")
 *
 * The report goes through error() rather than fatalerror(), so the caller
 * keeps decoding the rest of the string after the message is emitted.
 */
static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName)
{
error("%s: Invalid UTF-8 byte 0x%02hhX\n", functionName, byte);
}
static size_t strlenUTF8(char const *s) static size_t strlenUTF8(char const *s)
{ {
size_t len = 0; size_t len = 0;
uint32_t state = 0; uint32_t state = 0;
for (uint32_t codep = 0; *s; s++) { for (uint32_t codep = 0; *s; s++) {
switch (decode(&state, &codep, *s)) { uint8_t byte = *s;
switch (decode(&state, &codep, byte)) {
case 1: case 1:
fatalerror("STRLEN: Invalid UTF-8 character\n"); errorInvalidUTF8Byte(byte, "STRLEN");
break; state = 0;
/* fallthrough */
case 0: case 0:
len++; len++;
break; break;
@@ -100,7 +108,7 @@ static size_t strlenUTF8(char const *s)
/* Check for partial code point. */ /* Check for partial code point. */
if (state != 0) if (state != 0)
fatalerror("STRLEN: Invalid UTF-8 character\n"); error("STRLEN: Incomplete UTF-8 character\n");
return len; return len;
} }
@@ -116,14 +124,16 @@ static void strsubUTF8(char *dest, size_t destLen, char const *src, uint32_t pos
/* Advance to starting position in source string. */ /* Advance to starting position in source string. */
while (src[srcIndex] && curPos < pos) { while (src[srcIndex] && curPos < pos) {
switch (decode(&state, &codep, src[srcIndex++])) { switch (decode(&state, &codep, src[srcIndex])) {
case 1: case 1:
fatalerror("STRSUB: Invalid UTF-8 character\n"); errorInvalidUTF8Byte(src[srcIndex], "STRSUB");
break; state = 0;
/* fallthrough */
case 0: case 0:
curPos++; curPos++;
break; break;
} }
srcIndex++;
} }
/* /*
@@ -138,8 +148,9 @@ static void strsubUTF8(char *dest, size_t destLen, char const *src, uint32_t pos
while (src[srcIndex] && destIndex < destLen - 1 && curLen < len) { while (src[srcIndex] && destIndex < destLen - 1 && curLen < len) {
switch (decode(&state, &codep, src[srcIndex])) { switch (decode(&state, &codep, src[srcIndex])) {
case 1: case 1:
fatalerror("STRSUB: Invalid UTF-8 character\n"); errorInvalidUTF8Byte(src[srcIndex], "STRSUB");
break; state = 0;
/* fallthrough */
case 0: case 0:
curLen++; curLen++;
break; break;
@@ -152,7 +163,7 @@ static void strsubUTF8(char *dest, size_t destLen, char const *src, uint32_t pos
/* Check for partial code point. */ /* Check for partial code point. */
if (state != 0) if (state != 0)
fatalerror("STRSUB: Invalid UTF-8 character\n"); error("STRSUB: Incomplete UTF-8 character\n");
dest[destIndex] = '\0'; dest[destIndex] = '\0';
} }

View File

@@ -0,0 +1,23 @@
; characters:
; 1: U+0061 a
; 2: U+00E4 a with diaeresis (0xC3 0xA4)
; 3: U+0062 b
; 4: U+6F22 kanji (0xE6 0xBC 0xA2)
; 5: U+002C ,
; 6: U+0061 a
; 7: invalid byte 0xA3
; 8: invalid byte 0xA4
; 9: U+0062 b
; 10: invalid bytes 0xE6 0xF0
; 11: invalid byte 0xA2
; 12: U+0021 !
invalid EQUS "aäb漢,a<><61>b<EFBFBD><62><EFBFBD>!"
n = STRLEN("{invalid}")
copy EQUS STRSUB("{invalid}", 1)
println "\"{invalid}\" == \"{copy}\" ({d:n})"
mid1 EQUS STRSUB("{invalid}", 5, 2)
mid2 EQUS STRSUB("{invalid}", 9, 1)
println "\"{mid2}{mid1}\""

View File

@@ -0,0 +1,45 @@
ERROR: invalid-utf-8-strings.asm(16):
STRLEN: Invalid UTF-8 byte 0xA3
ERROR: invalid-utf-8-strings.asm(16):
STRLEN: Invalid UTF-8 byte 0xA4
ERROR: invalid-utf-8-strings.asm(16):
STRLEN: Invalid UTF-8 byte 0xF0
ERROR: invalid-utf-8-strings.asm(16):
STRLEN: Invalid UTF-8 byte 0xA2
ERROR: invalid-utf-8-strings.asm(17):
STRLEN: Invalid UTF-8 byte 0xA3
ERROR: invalid-utf-8-strings.asm(17):
STRLEN: Invalid UTF-8 byte 0xA4
ERROR: invalid-utf-8-strings.asm(17):
STRLEN: Invalid UTF-8 byte 0xF0
ERROR: invalid-utf-8-strings.asm(17):
STRLEN: Invalid UTF-8 byte 0xA2
ERROR: invalid-utf-8-strings.asm(17):
STRSUB: Invalid UTF-8 byte 0xA3
ERROR: invalid-utf-8-strings.asm(17):
STRSUB: Invalid UTF-8 byte 0xA4
ERROR: invalid-utf-8-strings.asm(17):
STRSUB: Invalid UTF-8 byte 0xF0
ERROR: invalid-utf-8-strings.asm(17):
STRSUB: Invalid UTF-8 byte 0xA2
ERROR: invalid-utf-8-strings.asm(21):
STRLEN: Invalid UTF-8 byte 0xA3
ERROR: invalid-utf-8-strings.asm(21):
STRLEN: Invalid UTF-8 byte 0xA4
ERROR: invalid-utf-8-strings.asm(21):
STRLEN: Invalid UTF-8 byte 0xF0
ERROR: invalid-utf-8-strings.asm(21):
STRLEN: Invalid UTF-8 byte 0xA2
ERROR: invalid-utf-8-strings.asm(22):
STRLEN: Invalid UTF-8 byte 0xA3
ERROR: invalid-utf-8-strings.asm(22):
STRLEN: Invalid UTF-8 byte 0xA4
ERROR: invalid-utf-8-strings.asm(22):
STRLEN: Invalid UTF-8 byte 0xF0
ERROR: invalid-utf-8-strings.asm(22):
STRLEN: Invalid UTF-8 byte 0xA2
ERROR: invalid-utf-8-strings.asm(22):
STRSUB: Invalid UTF-8 byte 0xA3
ERROR: invalid-utf-8-strings.asm(22):
STRSUB: Invalid UTF-8 byte 0xA4
error: Assembly aborted (22 errors)!

View File

@@ -0,0 +1,2 @@
"aäb漢,a<><61>b<EFBFBD><62><EFBFBD>!" == "aäb漢,a<><61>b<EFBFBD><62><EFBFBD>!" (12)
"b,a"

View File

@@ -1,5 +1,5 @@
; This test tries to pass invalid UTF-8 through a macro argument ; This test tries to pass invalid UTF-8 through a macro argument
; to exercise the lexer's reportGarbageChar ; to exercise the lexer's unknown character reporting
m:MACRO m:MACRO
\1 \1
ENDM ENDM