Refactor readUTF8Char into charmap_ConvertNext

This commit is contained in:
Rangi42
2025-01-28 00:04:49 -05:00
parent 34a9c8e083
commit e49291b7cf
4 changed files with 23 additions and 38 deletions

View File

@@ -3,13 +3,6 @@
#ifndef RGBDS_UTIL_HPP #ifndef RGBDS_UTIL_HPP
#define RGBDS_UTIL_HPP #define RGBDS_UTIL_HPP
#include <stddef.h>
#include <stdint.h>
#include <vector>
char const *printChar(int c); char const *printChar(int c);
// @return The number of bytes read, or 0 if invalid data was found
size_t readUTF8Char(std::vector<int32_t> *dest, char const *src);
#endif // RGBDS_UTIL_HPP #endif // RGBDS_UTIL_HPP

View File

@@ -11,6 +11,7 @@
#include <unordered_map> #include <unordered_map>
#include <utility> #include <utility>
#include "extern/utf8decoder.hpp"
#include "helpers.hpp" #include "helpers.hpp"
#include "util.hpp" #include "util.hpp"
@@ -224,16 +225,30 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output
matchLen = value.size(); matchLen = value.size();
} else if (inputIdx < input.length()) { // No match found, but there is some input left } else if (inputIdx < input.length()) { // No match found, but there is some input left
int firstChar = input[inputIdx]; size_t codepointLen = 0;
// This will write the codepoint's value to `output`, little-endian // This will write the codepoint's value to `output`, little-endian
size_t codepointLen = readUTF8Char(output, input.data() + inputIdx); for (uint32_t state = 0, codepoint = 0;;) {
if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == 1) {
codepointLen = 0;
break;
}
if (output) {
output->push_back(input[inputIdx + codepointLen]);
}
codepointLen++;
if (state == 0) {
break;
}
}
if (codepointLen == 0) { if (codepointLen == 0) {
error("Input string is not valid UTF-8\n"); error("Input string is not valid UTF-8\n");
} }
// Warn if this character is not mapped but any others are // Warn if this character is not mapped but any others are
if (charmap.nodes.size() > 1) { if (int firstChar = input[inputIdx]; charmap.nodes.size() > 1) {
warning(WARNING_UNMAPPED_CHAR_1, "Unmapped character %s\n", printChar(firstChar)); warning(WARNING_UNMAPPED_CHAR_1, "Unmapped character %s\n", printChar(firstChar));
} else if (charmap.name != DEFAULT_CHARMAP_NAME) { } else if (charmap.name != DEFAULT_CHARMAP_NAME) {
warning( warning(

View File

@@ -2527,10 +2527,10 @@ static size_t strlenUTF8(std::string const &str) {
size_t len = 0; size_t len = 0;
uint32_t state = 0; uint32_t state = 0;
for (uint32_t codep = 0; *ptr; ptr++) { for (uint32_t codepoint = 0; *ptr; ptr++) {
uint8_t byte = *ptr; uint8_t byte = *ptr;
switch (decode(&state, &codep, byte)) { switch (decode(&state, &codepoint, byte)) {
case 1: case 1:
errorInvalidUTF8Byte(byte, "STRLEN"); errorInvalidUTF8Byte(byte, "STRLEN");
state = 0; state = 0;
@@ -2553,12 +2553,12 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
char const *ptr = str.c_str(); char const *ptr = str.c_str();
size_t index = 0; size_t index = 0;
uint32_t state = 0; uint32_t state = 0;
uint32_t codep = 0; uint32_t codepoint = 0;
uint32_t curPos = 1; // RGBASM strings are 1-indexed! uint32_t curPos = 1; // RGBASM strings are 1-indexed!
// Advance to starting position in source string. // Advance to starting position in source string.
while (ptr[index] && curPos < pos) { while (ptr[index] && curPos < pos) {
switch (decode(&state, &codep, ptr[index])) { switch (decode(&state, &codepoint, ptr[index])) {
case 1: case 1:
errorInvalidUTF8Byte(ptr[index], "STRSUB"); errorInvalidUTF8Byte(ptr[index], "STRSUB");
state = 0; state = 0;
@@ -2583,7 +2583,7 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
// Compute the result length in bytes. // Compute the result length in bytes.
while (ptr[index] && curLen < len) { while (ptr[index] && curLen < len) {
switch (decode(&state, &codep, ptr[index])) { switch (decode(&state, &codepoint, ptr[index])) {
case 1: case 1:
errorInvalidUTF8Byte(ptr[index], "STRSUB"); errorInvalidUTF8Byte(ptr[index], "STRSUB");
state = 0; state = 0;

View File

@@ -5,9 +5,6 @@
#include <ctype.h> #include <ctype.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <vector>
#include "extern/utf8decoder.hpp"
char const *printChar(int c) { char const *printChar(int c) {
// "'A'" + '\0': 4 bytes // "'A'" + '\0': 4 bytes
@@ -53,23 +50,3 @@ char const *printChar(int c) {
buf[4] = '\0'; buf[4] = '\0';
return buf; return buf;
} }
size_t readUTF8Char(std::vector<int32_t> *dest, char const *src) {
uint32_t state = 0, codepoint;
size_t i = 0;
for (;;) {
if (decode(&state, &codepoint, src[i]) == 1) {
return 0;
}
if (dest) {
dest->push_back(src[i]);
}
i++;
if (state == 0) {
return i;
}
}
}