Update the UTF-8 decoder (#1741)

This commit is contained in:
Rangi
2025-07-09 23:13:30 -04:00
committed by GitHub
parent 34cf959c9d
commit a40109e4e4
5 changed files with 62 additions and 52 deletions

View File

@@ -5,6 +5,9 @@
#include <stdint.h> #include <stdint.h>
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12
uint32_t decode(uint32_t *state, uint32_t *codep, uint8_t byte); uint32_t decode(uint32_t *state, uint32_t *codep, uint8_t byte);
#endif // RGBDS_EXTERN_UTF8DECODER_HPP #endif // RGBDS_EXTERN_UTF8DECODER_HPP

View File

@@ -266,14 +266,15 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output
} else if (inputIdx < input.length()) { // No match found, but there is some input left } else if (inputIdx < input.length()) { // No match found, but there is some input left
size_t codepointLen = 0; size_t codepointLen = 0;
// This will write the codepoint's value to `output`, little-endian // This will write the codepoint's value to `output`, little-endian
for (uint32_t state = 0, codepoint = 0; inputIdx + codepointLen < input.length();) { for (uint32_t state = UTF8_ACCEPT, codepoint = 0;
if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == 1) { inputIdx + codepointLen < input.length();) {
if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == UTF8_REJECT) {
error("Input string is not valid UTF-8"); error("Input string is not valid UTF-8");
codepointLen = 1; codepointLen = 1;
break; break;
} }
codepointLen++; codepointLen++;
if (state == 0) { if (state == UTF8_ACCEPT) {
break; break;
} }
} }

View File

@@ -2708,26 +2708,26 @@ static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName) {
static size_t strlenUTF8(std::string const &str, bool printErrors) { static size_t strlenUTF8(std::string const &str, bool printErrors) {
char const *ptr = str.c_str(); char const *ptr = str.c_str();
size_t len = 0; size_t len = 0;
uint32_t state = 0; uint32_t state = UTF8_ACCEPT;
for (uint32_t codepoint = 0; *ptr; ptr++) { for (uint32_t codepoint = 0; *ptr; ptr++) {
uint8_t byte = *ptr; uint8_t byte = *ptr;
switch (decode(&state, &codepoint, byte)) { switch (decode(&state, &codepoint, byte)) {
case 1: case UTF8_REJECT:
if (printErrors) { if (printErrors) {
errorInvalidUTF8Byte(byte, "STRLEN"); errorInvalidUTF8Byte(byte, "STRLEN");
} }
state = 0; state = UTF8_ACCEPT;
// fallthrough // fallthrough
case 0: case UTF8_ACCEPT:
len++; len++;
break; break;
} }
} }
// Check for partial code point. // Check for partial code point.
if (state != 0) { if (state != UTF8_ACCEPT) {
if (printErrors) { if (printErrors) {
error("STRLEN: Incomplete UTF-8 character"); error("STRLEN: Incomplete UTF-8 character");
} }
@@ -2740,18 +2740,18 @@ static size_t strlenUTF8(std::string const &str, bool printErrors) {
static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t stop) { static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t stop) {
char const *ptr = str.c_str(); char const *ptr = str.c_str();
size_t index = 0; size_t index = 0;
uint32_t state = 0; uint32_t state = UTF8_ACCEPT;
uint32_t codepoint = 0; uint32_t codepoint = 0;
uint32_t curIdx = 0; uint32_t curIdx = 0;
// Advance to starting index in source string. // Advance to starting index in source string.
while (ptr[index] && curIdx < start) { while (ptr[index] && curIdx < start) {
switch (decode(&state, &codepoint, ptr[index])) { switch (decode(&state, &codepoint, ptr[index])) {
case 1: case UTF8_REJECT:
errorInvalidUTF8Byte(ptr[index], "STRSLICE"); errorInvalidUTF8Byte(ptr[index], "STRSLICE");
state = 0; state = UTF8_ACCEPT;
// fallthrough // fallthrough
case 0: case UTF8_ACCEPT:
curIdx++; curIdx++;
break; break;
} }
@@ -2773,11 +2773,11 @@ static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t
// Advance to ending index in source string. // Advance to ending index in source string.
while (ptr[index] && curIdx < stop) { while (ptr[index] && curIdx < stop) {
switch (decode(&state, &codepoint, ptr[index])) { switch (decode(&state, &codepoint, ptr[index])) {
case 1: case UTF8_REJECT:
errorInvalidUTF8Byte(ptr[index], "STRSLICE"); errorInvalidUTF8Byte(ptr[index], "STRSLICE");
state = 0; state = UTF8_ACCEPT;
// fallthrough // fallthrough
case 0: case UTF8_ACCEPT:
curIdx++; curIdx++;
break; break;
} }
@@ -2785,7 +2785,7 @@ static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t
} }
// Check for partial code point. // Check for partial code point.
if (state != 0) { if (state != UTF8_ACCEPT) {
error("STRSLICE: Incomplete UTF-8 character"); error("STRSLICE: Incomplete UTF-8 character");
curIdx++; curIdx++;
} }
@@ -2804,18 +2804,18 @@ static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t
static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len) { static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len) {
char const *ptr = str.c_str(); char const *ptr = str.c_str();
size_t index = 0; size_t index = 0;
uint32_t state = 0; uint32_t state = UTF8_ACCEPT;
uint32_t codepoint = 0; uint32_t codepoint = 0;
uint32_t curPos = 1; uint32_t curPos = 1;
// Advance to starting position in source string. // Advance to starting position in source string.
while (ptr[index] && curPos < pos) { while (ptr[index] && curPos < pos) {
switch (decode(&state, &codepoint, ptr[index])) { switch (decode(&state, &codepoint, ptr[index])) {
case 1: case UTF8_REJECT:
errorInvalidUTF8Byte(ptr[index], "STRSUB"); errorInvalidUTF8Byte(ptr[index], "STRSUB");
state = 0; state = UTF8_ACCEPT;
// fallthrough // fallthrough
case 0: case UTF8_ACCEPT:
curPos++; curPos++;
break; break;
} }
@@ -2836,11 +2836,11 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
// Compute the result length in bytes. // Compute the result length in bytes.
while (ptr[index] && curLen < len) { while (ptr[index] && curLen < len) {
switch (decode(&state, &codepoint, ptr[index])) { switch (decode(&state, &codepoint, ptr[index])) {
case 1: case UTF8_REJECT:
errorInvalidUTF8Byte(ptr[index], "STRSUB"); errorInvalidUTF8Byte(ptr[index], "STRSUB");
state = 0; state = UTF8_ACCEPT;
// fallthrough // fallthrough
case 0: case UTF8_ACCEPT:
curLen++; curLen++;
break; break;
} }
@@ -2848,7 +2848,7 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
} }
// Check for partial code point. // Check for partial code point.
if (state != 0) { if (state != UTF8_ACCEPT) {
error("STRSUB: Incomplete UTF-8 character"); error("STRSUB: Incomplete UTF-8 character");
curLen++; curLen++;
} }

View File

@@ -6,7 +6,10 @@
#include "extern/utf8decoder.hpp" #include "extern/utf8decoder.hpp"
// clang-format off: vertically align values
static uint8_t const utf8d[] = { static uint8_t const utf8d[] = {
// The first part of the table maps bytes to character classes
// to reduce the size of the transition table and create bitmasks.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..0f 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..0f
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10..1f 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10..1f
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..2f 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..2f
@@ -23,20 +26,23 @@ static uint8_t const utf8d[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // d0..df 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // d0..df
10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, // e0..ef 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, // e0..ef
11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // f0..ff 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // f0..ff
0, 1, 2, 3, 5, 8, 7, 1, 1, 1, 4, 6, 1, 1, 1, 1, // s0 // The second part is a transition table that maps a combination
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s1 // of a state of the automaton and a character class to a state.
1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // s0
1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, // s3 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // s1
1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s4 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, // s2
1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, // s5 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // s3
1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s6 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // s4
1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s7 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // s5
1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s8 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // s6
12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // s7
12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // s8
}; };
// clang-format on
uint32_t decode(uint32_t *state, uint32_t *codep, uint8_t byte) { uint32_t decode(uint32_t *state, uint32_t *codep, uint8_t byte) {
uint8_t type = utf8d[byte]; uint8_t type = utf8d[byte];
*codep = *state != 0 ? (byte & 0b111111) | (*codep << 6) : byte & (0xFF >> type); *codep = *state != UTF8_ACCEPT ? (byte & 0b111111) | (*codep << 6) : (0xff >> type) & byte;
*state = utf8d[0x100 + *state * 0x10 + type]; *state = utf8d[0x100 + *state + type];
return *state; return *state;
} }

View File

@@ -272,11 +272,11 @@ static void writeSymName(std::string const &name, FILE *file) {
} else { } else {
// Output illegal characters using Unicode escapes ('\u' or '\U') // Output illegal characters using Unicode escapes ('\u' or '\U')
// Decode the UTF-8 codepoint; or at least attempt to // Decode the UTF-8 codepoint; or at least attempt to
uint32_t state = 0, codepoint; uint32_t state = UTF8_ACCEPT, codepoint;
do { do {
decode(&state, &codepoint, *ptr); decode(&state, &codepoint, *ptr);
if (state == 1) { if (state == UTF8_REJECT) {
// This sequence was invalid; emit a U+FFFD, and recover // This sequence was invalid; emit a U+FFFD, and recover
codepoint = 0xFFFD; codepoint = 0xFFFD;
// Skip continuation bytes // Skip continuation bytes
@@ -287,7 +287,7 @@ static void writeSymName(std::string const &name, FILE *file) {
break; break;
} }
++ptr; ++ptr;
} while (state != 0); } while (state != UTF8_ACCEPT);
fprintf(file, codepoint <= 0xFFFF ? "\\u%04" PRIx32 : "\\U%08" PRIx32, codepoint); fprintf(file, codepoint <= 0xFFFF ? "\\u%04" PRIx32 : "\\U%08" PRIx32, codepoint);
} }