diff --git a/include/extern/utf8decoder.hpp b/include/extern/utf8decoder.hpp index 66bdc266..64a9a187 100644 --- a/include/extern/utf8decoder.hpp +++ b/include/extern/utf8decoder.hpp @@ -5,6 +5,9 @@ #include +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 12 + uint32_t decode(uint32_t *state, uint32_t *codep, uint8_t byte); #endif // RGBDS_EXTERN_UTF8DECODER_HPP diff --git a/src/asm/charmap.cpp b/src/asm/charmap.cpp index fa064eda..80cc7c65 100644 --- a/src/asm/charmap.cpp +++ b/src/asm/charmap.cpp @@ -266,14 +266,15 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector *output } else if (inputIdx < input.length()) { // No match found, but there is some input left size_t codepointLen = 0; // This will write the codepoint's value to `output`, little-endian - for (uint32_t state = 0, codepoint = 0; inputIdx + codepointLen < input.length();) { - if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == 1) { + for (uint32_t state = UTF8_ACCEPT, codepoint = 0; + inputIdx + codepointLen < input.length();) { + if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == UTF8_REJECT) { error("Input string is not valid UTF-8"); codepointLen = 1; break; } codepointLen++; - if (state == 0) { + if (state == UTF8_ACCEPT) { break; } } diff --git a/src/asm/parser.y b/src/asm/parser.y index e0e8e2c7..db38e360 100644 --- a/src/asm/parser.y +++ b/src/asm/parser.y @@ -2708,26 +2708,26 @@ static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName) { static size_t strlenUTF8(std::string const &str, bool printErrors) { char const *ptr = str.c_str(); size_t len = 0; - uint32_t state = 0; + uint32_t state = UTF8_ACCEPT; for (uint32_t codepoint = 0; *ptr; ptr++) { uint8_t byte = *ptr; switch (decode(&state, &codepoint, byte)) { - case 1: + case UTF8_REJECT: if (printErrors) { errorInvalidUTF8Byte(byte, "STRLEN"); } - state = 0; + state = UTF8_ACCEPT; // fallthrough - case 0: + case UTF8_ACCEPT: len++; break; } } // Check for partial code point. 
- if (state != 0) { + if (state != UTF8_ACCEPT) { if (printErrors) { error("STRLEN: Incomplete UTF-8 character"); } @@ -2740,18 +2740,18 @@ static size_t strlenUTF8(std::string const &str, bool printErrors) { static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t stop) { char const *ptr = str.c_str(); size_t index = 0; - uint32_t state = 0; + uint32_t state = UTF8_ACCEPT; uint32_t codepoint = 0; uint32_t curIdx = 0; // Advance to starting index in source string. while (ptr[index] && curIdx < start) { switch (decode(&state, &codepoint, ptr[index])) { - case 1: + case UTF8_REJECT: errorInvalidUTF8Byte(ptr[index], "STRSLICE"); - state = 0; + state = UTF8_ACCEPT; // fallthrough - case 0: + case UTF8_ACCEPT: curIdx++; break; } @@ -2773,11 +2773,11 @@ static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t // Advance to ending index in source string. while (ptr[index] && curIdx < stop) { switch (decode(&state, &codepoint, ptr[index])) { - case 1: + case UTF8_REJECT: errorInvalidUTF8Byte(ptr[index], "STRSLICE"); - state = 0; + state = UTF8_ACCEPT; // fallthrough - case 0: + case UTF8_ACCEPT: curIdx++; break; } @@ -2785,7 +2785,7 @@ static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t } // Check for partial code point. - if (state != 0) { + if (state != UTF8_ACCEPT) { error("STRSLICE: Incomplete UTF-8 character"); curIdx++; } @@ -2804,18 +2804,18 @@ static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len) { char const *ptr = str.c_str(); size_t index = 0; - uint32_t state = 0; + uint32_t state = UTF8_ACCEPT; uint32_t codepoint = 0; uint32_t curPos = 1; // Advance to starting position in source string. 
while (ptr[index] && curPos < pos) { switch (decode(&state, &codepoint, ptr[index])) { - case 1: + case UTF8_REJECT: errorInvalidUTF8Byte(ptr[index], "STRSUB"); - state = 0; + state = UTF8_ACCEPT; // fallthrough - case 0: + case UTF8_ACCEPT: curPos++; break; } @@ -2836,11 +2836,11 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len // Compute the result length in bytes. while (ptr[index] && curLen < len) { switch (decode(&state, &codepoint, ptr[index])) { - case 1: + case UTF8_REJECT: errorInvalidUTF8Byte(ptr[index], "STRSUB"); - state = 0; + state = UTF8_ACCEPT; // fallthrough - case 0: + case UTF8_ACCEPT: curLen++; break; } @@ -2848,7 +2848,7 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len } // Check for partial code point. - if (state != 0) { + if (state != UTF8_ACCEPT) { error("STRSUB: Incomplete UTF-8 character"); curLen++; } diff --git a/src/extern/utf8decoder.cpp b/src/extern/utf8decoder.cpp index 116f50c8..480bd340 100644 --- a/src/extern/utf8decoder.cpp +++ b/src/extern/utf8decoder.cpp @@ -6,37 +6,43 @@ #include "extern/utf8decoder.hpp" +// clang-format off: vertically align values static uint8_t const utf8d[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..0f - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10..1f - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..2f - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30..3f - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4f - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50..5f - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6f - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70..7f - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8f - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 90..9f - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..af - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // b0..bf - 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..cf - 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // d0..df + // The first part of the table maps bytes to character classes that + // help reduce the size of the transition table and create bitmasks. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..0f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10..1f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..2f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30..3f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50..5f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70..7f + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8f + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 90..9f + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..af + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // b0..bf + 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..cf + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // d0..df 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, // e0..ef 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // f0..ff - 0, 1, 2, 3, 5, 8, 7, 1, 1, 1, 4, 6, 1, 1, 1, 1, // s0 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s1 - 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1 - 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, // s3 - 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s4 - 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, // s5 - 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s6 - 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s7 - 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s8 + // The second part is a transition table that maps a combination + // of a state of the automaton and a character class to a state. 
+ 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // s0 + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // s1 + 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, // s2 + 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // s3 + 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // s4 + 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // s5 + 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // s6 + 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // s7 + 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // s8 }; +// clang-format on uint32_t decode(uint32_t *state, uint32_t *codep, uint8_t byte) { uint8_t type = utf8d[byte]; - *codep = *state != 0 ? (byte & 0b111111) | (*codep << 6) : byte & (0xFF >> type); - *state = utf8d[0x100 + *state * 0x10 + type]; + *codep = *state != UTF8_ACCEPT ? (byte & 0b111111) | (*codep << 6) : (0xff >> type) & byte; + *state = utf8d[0x100 + *state + type]; return *state; } diff --git a/src/link/output.cpp b/src/link/output.cpp index 1ee40d27..f051fdec 100644 --- a/src/link/output.cpp +++ b/src/link/output.cpp @@ -272,11 +272,11 @@ static void writeSymName(std::string const &name, FILE *file) { } else { // Output illegal characters using Unicode escapes ('\u' or '\U') // Decode the UTF-8 codepoint; or at least attempt to - uint32_t state = 0, codepoint; + uint32_t state = UTF8_ACCEPT, codepoint; do { decode(&state, &codepoint, *ptr); - if (state == 1) { + if (state == UTF8_REJECT) { // This sequence was invalid; emit a U+FFFD, and recover codepoint = 0xFFFD; // Skip continuation bytes @@ -287,7 +287,7 @@ static void writeSymName(std::string const &name, FILE *file) { break; } ++ptr; - } while (state != 0); + } while (state != UTF8_ACCEPT); fprintf(file, codepoint <= 0xFFFF ? "\\u%04" PRIx32 : "\\U%08" PRIx32, codepoint); }