Implement multi-value charmaps (#1429)

This commit is contained in:
Sylvie
2024-08-04 17:32:08 -04:00
committed by GitHub
parent 436580a649
commit 9a5b3f0902
16 changed files with 181 additions and 89 deletions

View File

@@ -8,6 +8,7 @@
#include <string.h>
#include <unordered_map>
#include "helpers.hpp"
#include "util.hpp"
#include "asm/warning.hpp"
@@ -16,10 +17,11 @@
// Essentially a tree, where each node stores a single character's worth of info:
// whether there exists a mapping that ends at the current character,
struct CharmapNode {
bool isTerminal; // Whether there exists a mapping that ends here
uint8_t value; // If the above is true, its corresponding value
std::vector<int32_t> value; // The mapped value, if there exists a mapping that ends here
// These MUST be indexes and not pointers, because pointers get invalidated by reallocation!
size_t next[256]; // Indexes of where to go next, 0 = nowhere
size_t next[256]; // Indexes of where to go next, 0 = nowhere
bool isTerminal() const { return !value.empty(); }
};
struct Charmap {
@@ -84,7 +86,7 @@ void charmap_Pop() {
charmapStack.pop();
}
void charmap_Add(std::string const &mapping, uint8_t value) {
void charmap_Add(std::string const &mapping, std::vector<int32_t> &&value) {
Charmap &charmap = *currentCharmap;
size_t nodeIdx = 0;
@@ -106,11 +108,10 @@ void charmap_Add(std::string const &mapping, uint8_t value) {
CharmapNode &node = charmap.nodes[nodeIdx];
if (node.isTerminal)
if (node.isTerminal())
warning(WARNING_CHARMAP_REDEF, "Overriding charmap mapping\n");
node.isTerminal = true;
node.value = value;
std::swap(node.value, value);
}
bool charmap_HasChar(std::string const &input) {
@@ -124,17 +125,17 @@ bool charmap_HasChar(std::string const &input) {
return false;
}
return charmap.nodes[nodeIdx].isTerminal;
return charmap.nodes[nodeIdx].isTerminal();
}
std::vector<uint8_t> charmap_Convert(std::string const &input) {
std::vector<uint8_t> output;
std::vector<int32_t> charmap_Convert(std::string const &input) {
std::vector<int32_t> output;
for (std::string_view inputView = input; charmap_ConvertNext(inputView, &output);)
;
return output;
}
size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output) {
size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output) {
// The goal is to match the longest mapping possible.
// For that, advance through the trie with each character read.
// If that would lead to a dead end, rewind characters until the last match, and output.
@@ -152,7 +153,7 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output
inputIdx++; // Consume that char
if (charmap.nodes[nodeIdx].isTerminal) {
if (charmap.nodes[nodeIdx].isTerminal()) {
matchIdx = nodeIdx; // This node matches, register it
rewindDistance = 0; // If no longer match is found, rewind here
} else {
@@ -166,11 +167,12 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output
size_t matchLen = 0;
if (matchIdx) { // A match was found, use it
std::vector<int32_t> const &value = charmap.nodes[matchIdx].value;
if (output)
output->push_back(charmap.nodes[matchIdx].value);
matchLen = 1;
output->insert(output->end(), RANGE(value));
matchLen = value.size();
} else if (inputIdx < input.length()) { // No match found, but there is some input left
int firstChar = input[inputIdx];
// This will write the codepoint's value to `output`, little-endian