Update the UTF-8 decoder (#1741)

2026-07-02 14:08:04 +00:00 · 2025-07-09 23:13:30 -04:00
parent 34cf959c9d
commit a40109e4e4
5 changed files with 62 additions and 52 deletions
@@ -5,6 +5,9 @@

 #include <stdint.h>

+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
 uint32_t decode(uint32_t *state, uint32_t *codep, uint8_t byte);

 #endif // RGBDS_EXTERN_UTF8DECODER_HPP
@@ -266,14 +266,15 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output
 	} else if (inputIdx < input.length()) { // No match found, but there is some input left
 		size_t codepointLen = 0;
 		// This will write the codepoint's value to `output`, little-endian
-		for (uint32_t state = 0, codepoint = 0; inputIdx + codepointLen < input.length();) {
-			if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == 1) {
+		for (uint32_t state = UTF8_ACCEPT, codepoint = 0;
+		     inputIdx + codepointLen < input.length();) {
+			if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == UTF8_REJECT) {
 				error("Input string is not valid UTF-8");
 				codepointLen = 1;
 				break;
 			}
 			codepointLen++;
-			if (state == 0) {
+			if (state == UTF8_ACCEPT) {
 				break;
 			}
 		}
@@ -2708,26 +2708,26 @@ static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName) {
 static size_t strlenUTF8(std::string const &str, bool printErrors) {
 	char const *ptr = str.c_str();
 	size_t len = 0;
-	uint32_t state = 0;
+	uint32_t state = UTF8_ACCEPT;

 	for (uint32_t codepoint = 0; *ptr; ptr++) {
 		uint8_t byte = *ptr;

 		switch (decode(&state, &codepoint, byte)) {
-		case 1:
+		case UTF8_REJECT:
 			if (printErrors) {
 				errorInvalidUTF8Byte(byte, "STRLEN");
 			}
-			state = 0;
+			state = UTF8_ACCEPT;
 			// fallthrough
-		case 0:
+		case UTF8_ACCEPT:
 			len++;
 			break;
 		}
 	}

 	// Check for partial code point.
-	if (state != 0) {
+	if (state != UTF8_ACCEPT) {
 		if (printErrors) {
 			error("STRLEN: Incomplete UTF-8 character");
 		}
@@ -2740,18 +2740,18 @@ static size_t strlenUTF8(std::string const &str, bool printErrors) {
 static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t stop) {
 	char const *ptr = str.c_str();
 	size_t index = 0;
-	uint32_t state = 0;
+	uint32_t state = UTF8_ACCEPT;
 	uint32_t codepoint = 0;
 	uint32_t curIdx = 0;

 	// Advance to starting index in source string.
 	while (ptr[index] && curIdx < start) {
 		switch (decode(&state, &codepoint, ptr[index])) {
-		case 1:
+		case UTF8_REJECT:
 			errorInvalidUTF8Byte(ptr[index], "STRSLICE");
-			state = 0;
+			state = UTF8_ACCEPT;
 			// fallthrough
-		case 0:
+		case UTF8_ACCEPT:
 			curIdx++;
 			break;
 		}
@@ -2773,11 +2773,11 @@ static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t
 	// Advance to ending index in source string.
 	while (ptr[index] && curIdx < stop) {
 		switch (decode(&state, &codepoint, ptr[index])) {
-		case 1:
+		case UTF8_REJECT:
 			errorInvalidUTF8Byte(ptr[index], "STRSLICE");
-			state = 0;
+			state = UTF8_ACCEPT;
 			// fallthrough
-		case 0:
+		case UTF8_ACCEPT:
 			curIdx++;
 			break;
 		}
@@ -2785,7 +2785,7 @@ static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t
 	}

 	// Check for partial code point.
-	if (state != 0) {
+	if (state != UTF8_ACCEPT) {
 		error("STRSLICE: Incomplete UTF-8 character");
 		curIdx++;
 	}
@@ -2804,18 +2804,18 @@ static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t
 static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len) {
 	char const *ptr = str.c_str();
 	size_t index = 0;
-	uint32_t state = 0;
+	uint32_t state = UTF8_ACCEPT;
 	uint32_t codepoint = 0;
 	uint32_t curPos = 1;

 	// Advance to starting position in source string.
 	while (ptr[index] && curPos < pos) {
 		switch (decode(&state, &codepoint, ptr[index])) {
-		case 1:
+		case UTF8_REJECT:
 			errorInvalidUTF8Byte(ptr[index], "STRSUB");
-			state = 0;
+			state = UTF8_ACCEPT;
 			// fallthrough
-		case 0:
+		case UTF8_ACCEPT:
 			curPos++;
 			break;
 		}
@@ -2836,11 +2836,11 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
 	// Compute the result length in bytes.
 	while (ptr[index] && curLen < len) {
 		switch (decode(&state, &codepoint, ptr[index])) {
-		case 1:
+		case UTF8_REJECT:
 			errorInvalidUTF8Byte(ptr[index], "STRSUB");
-			state = 0;
+			state = UTF8_ACCEPT;
 			// fallthrough
-		case 0:
+		case UTF8_ACCEPT:
 			curLen++;
 			break;
 		}
@@ -2848,7 +2848,7 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
 	}

 	// Check for partial code point.
-	if (state != 0) {
+	if (state != UTF8_ACCEPT) {
 		error("STRSUB: Incomplete UTF-8 character");
 		curLen++;
 	}
@@ -6,37 +6,43 @@

 #include "extern/utf8decoder.hpp"

+// clang-format off: vertically align values
 static uint8_t const utf8d[] = {
-    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..0f
-    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10..1f
-    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..2f
-    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30..3f
-    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4f
-    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50..5f
-    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6f
-    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70..7f
-    1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8f
-    9,  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 90..9f
-    7,  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..af
-    7,  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // b0..bf
-    8,  8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..cf
-    2,  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // d0..df
+    // The first part of the table maps bytes to character classes, which
+    // reduce the size of the transition table and are used to create bitmasks.
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..0f
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10..1f
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..2f
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30..3f
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4f
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50..5f
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6f
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70..7f
+     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8f
+     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 90..9f
+     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..af
+     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // b0..bf
+     8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..cf
+     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // d0..df
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, // e0..ef
    11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // f0..ff
-    0,  1, 2, 3, 5, 8, 7, 1, 1, 1, 4, 6, 1, 1, 1, 1, // s0
-    1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s1
-    1,  0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1
-    1,  2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, // s3
-    1,  1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s4
-    1,  2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, // s5
-    1,  1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s6
-    1,  3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s7
-    1,  3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s8
+    // The second part is a transition table that maps a combination
+    // of a state of the automaton and a character class to a state.
+     0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // s0
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // s1
+    12,  0, 12, 12, 12, 12, 12,  0, 12,  0, 12, 12, // s2
+    12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // s3
+    12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // s4
+    12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // s5
+    12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // s6
+    12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // s7
+    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // s8
 };
+// clang-format on

 uint32_t decode(uint32_t *state, uint32_t *codep, uint8_t byte) {
 	uint8_t type = utf8d[byte];
-	*codep = *state != 0 ? (byte & 0b111111) | (*codep << 6) : byte & (0xFF >> type);
-	*state = utf8d[0x100 + *state * 0x10 + type];
+	*codep = *state != UTF8_ACCEPT ? (byte & 0b111111) | (*codep << 6) : (0xff >> type) & byte;
+	*state = utf8d[0x100 + *state + type];
 	return *state;
 }
@@ -272,11 +272,11 @@ static void writeSymName(std::string const &name, FILE *file) {
 		} else {
 			// Output illegal characters using Unicode escapes ('\u' or '\U')
 			// Decode the UTF-8 codepoint; or at least attempt to
-			uint32_t state = 0, codepoint;
+			uint32_t state = UTF8_ACCEPT, codepoint;

 			do {
 				decode(&state, &codepoint, *ptr);
-				if (state == 1) {
+				if (state == UTF8_REJECT) {
 					// This sequence was invalid; emit a U+FFFD, and recover
 					codepoint = 0xFFFD;
 					// Skip continuation bytes
@@ -287,7 +287,7 @@ static void writeSymName(std::string const &name, FILE *file) {
 					break;
 				}
 				++ptr;
-			} while (state != 0);
+			} while (state != UTF8_ACCEPT);

 			fprintf(file, codepoint <= 0xFFFF ? "\\u%04" PRIx32 : "\\U%08" PRIx32, codepoint);
 		}