Implement multi-value charmaps (#1429)

This commit is contained in:
Sylvie
2024-08-04 17:32:08 -04:00
committed by GitHub
parent 436580a649
commit 9a5b3f0902
16 changed files with 181 additions and 89 deletions

View File

@@ -8,6 +8,7 @@
#include <string.h>
#include <unordered_map>
#include "helpers.hpp"
#include "util.hpp"
#include "asm/warning.hpp"
@@ -16,10 +17,11 @@
// Essentially a tree, where each node stores a single character's worth of info:
// whether there exists a mapping that ends at the current character,
// One node of the charmap lookup structure.
// NOTE(review): a `bool isTerminal` field and an `isTerminal()` method, plus two declarations
// each of `value` and `next`, appear below. This looks like a diff rendered without its +/-
// markers, so older and newer lines sit side by side; confirm which ones apply.
struct CharmapNode {
bool isTerminal; // Whether there exists a mapping that ends here
uint8_t value; // If the above is true, its corresponding value
std::vector<int32_t> value; // The mapped value, if there exists a mapping that ends here
// This MUST be indexes and not pointers, because pointers get invalidated by reallocation!
size_t next[256]; // Indexes of where to go next, 0 = nowhere
size_t next[256]; // Indexes of where to go next, 0 = nowhere
// A node ends a mapping exactly when its `value` vector is non-empty
bool isTerminal() const { return !value.empty(); }
};
struct Charmap {
@@ -84,7 +86,7 @@ void charmap_Pop() {
charmapStack.pop();
}
void charmap_Add(std::string const &mapping, uint8_t value) {
void charmap_Add(std::string const &mapping, std::vector<int32_t> &&value) {
Charmap &charmap = *currentCharmap;
size_t nodeIdx = 0;
@@ -106,11 +108,10 @@ void charmap_Add(std::string const &mapping, uint8_t value) {
CharmapNode &node = charmap.nodes[nodeIdx];
if (node.isTerminal)
if (node.isTerminal())
warning(WARNING_CHARMAP_REDEF, "Overriding charmap mapping\n");
node.isTerminal = true;
node.value = value;
std::swap(node.value, value);
}
bool charmap_HasChar(std::string const &input) {
@@ -124,17 +125,17 @@ bool charmap_HasChar(std::string const &input) {
return false;
}
return charmap.nodes[nodeIdx].isTerminal;
return charmap.nodes[nodeIdx].isTerminal();
}
// Converts all of `input` by calling `charmap_ConvertNext` repeatedly on a view of it,
// collecting everything that function appends into a single vector, which is returned.
// NOTE(review): the signature and the `output` declaration each appear twice (uint8_t and
// int32_t element types); these look like the before/after lines of a diff shown together.
std::vector<uint8_t> charmap_Convert(std::string const &input) {
std::vector<uint8_t> output;
std::vector<int32_t> charmap_Convert(std::string const &input) {
std::vector<int32_t> output;
// The loop body is intentionally empty: all the work happens in the condition, and the
// loop stops as soon as `charmap_ConvertNext` returns 0.
for (std::string_view inputView = input; charmap_ConvertNext(inputView, &output);)
;
return output;
}
size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output) {
size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output) {
// The goal is to match the longest mapping possible.
// For that, advance through the trie with each character read.
// If that would lead to a dead end, rewind characters until the last match, and output.
@@ -152,7 +153,7 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output
inputIdx++; // Consume that char
if (charmap.nodes[nodeIdx].isTerminal) {
if (charmap.nodes[nodeIdx].isTerminal()) {
matchIdx = nodeIdx; // This node matches, register it
rewindDistance = 0; // If no longer match is found, rewind here
} else {
@@ -166,11 +167,12 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output
size_t matchLen = 0;
if (matchIdx) { // A match was found, use it
std::vector<int32_t> const &value = charmap.nodes[matchIdx].value;
if (output)
output->push_back(charmap.nodes[matchIdx].value);
matchLen = 1;
output->insert(output->end(), RANGE(value));
matchLen = value.size();
} else if (inputIdx < input.length()) { // No match found, but there is some input left
int firstChar = input[inputIdx];
// This will write the codepoint's value to `output`, little-endian

View File

@@ -70,7 +70,7 @@
yy::parser::symbol_type yylex(); // Provided by lexer.cpp
static uint32_t str2int2(std::vector<uint8_t> const &s);
static uint32_t str2int2(std::vector<int32_t> const &s);
static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName);
static size_t strlenUTF8(std::string const &str);
static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len);
@@ -105,7 +105,6 @@
%type <Expression> relocexpr_no_str
%type <int32_t> const
%type <int32_t> const_no_str
%type <int32_t> const_8bit
%type <int32_t> uconst
%type <int32_t> rs_uconst
%type <int32_t> shift_const
@@ -264,6 +263,7 @@
%type <std::vector<Expression>> ds_args
%type <std::vector<std::string>> purge_args
%type <std::vector<int32_t>> charmap_args
%type <ForArgs> for_args
%token Z80_ADC "adc" Z80_ADD "add" Z80_AND "and"
@@ -1083,8 +1083,18 @@ incbin:
;
// Rule for a charmap entry: the POP_CHARMAP token, the mapping string, a comma, and the
// mapped value(s); the semantic action hands the string and value(s) to `charmap_Add`.
// NOTE(review): two rule heads and two actions appear back to back below (a single
// `const_8bit` cast to uint8_t, and a `charmap_args` list moved into the call); these look
// like the before/after lines of a diff shown together.
charmap:
POP_CHARMAP string COMMA const_8bit {
charmap_Add($2, (uint8_t)$4);
POP_CHARMAP string COMMA charmap_args trailing_comma {
charmap_Add($2, std::move($4));
}
;
// Comma-separated list of one or more `const` values, accumulated in order into the rule's
// semantic value (declared as `std::vector<int32_t>` by the `%type` line for `charmap_args`).
charmap_args:
// Base case: a single value starts the list
const {
$$.push_back(std::move($1));
}
// Left-recursive case: take over the list built so far, then append the newest value
| charmap_args COMMA const {
$$ = std::move($1);
$$.push_back(std::move($3));
}
;
@@ -1170,7 +1180,7 @@ constlist_8bit_entry:
sect_RelByte($1, 0);
}
| string {
std::vector<uint8_t> output = charmap_Convert($1);
std::vector<int32_t> output = charmap_Convert($1);
sect_AbsByteString(output);
}
;
@@ -1185,7 +1195,7 @@ constlist_16bit_entry:
sect_RelWord($1, 0);
}
| string {
std::vector<uint8_t> output = charmap_Convert($1);
std::vector<int32_t> output = charmap_Convert($1);
sect_AbsWordString(output);
}
;
@@ -1200,7 +1210,7 @@ constlist_32bit_entry:
sect_RelLong($1, 0);
}
| string {
std::vector<uint8_t> output = charmap_Convert($1);
std::vector<int32_t> output = charmap_Convert($1);
sect_AbsLongString(output);
}
;
@@ -1250,7 +1260,7 @@ relocexpr:
$$ = std::move($1);
}
| string {
std::vector<uint8_t> output = charmap_Convert($1);
std::vector<int32_t> output = charmap_Convert($1);
$$.makeNumber(str2int2(output));
}
;
@@ -1465,12 +1475,6 @@ const_no_str:
}
;
// Reduces a `reloc_8bit` expression to the value returned by its `getConstVal()` call.
const_8bit:
reloc_8bit {
$$ = $1.getConstVal();
}
;
opt_q_arg:
%empty {
$$ = fix_Precision();
@@ -2374,26 +2378,37 @@ void yy::parser::error(std::string const &str) {
::error("%s\n", str.c_str());
}
static uint32_t str2int2(std::vector<uint8_t> const &s) {
static uint32_t str2int2(std::vector<int32_t> const &s) {
uint32_t length = s.size();
if (length == 1) {
// The string is a single character with a single value,
// which can be used directly as a number.
return (uint32_t)s[0];
}
for (int32_t v : s) {
if (!checkNBit(v, 8, "All character units"))
break;
}
if (length > 4)
warning(
WARNING_NUMERIC_STRING_1,
"Treating string as a number ignores first %" PRIu32 " character%s\n",
"Treating string as a number ignores first %" PRIu32 " byte%s\n",
length - 4,
length == 5 ? "" : "s"
);
else if (length > 1)
warning(
WARNING_NUMERIC_STRING_2, "Treating %" PRIu32 "-character string as a number\n", length
WARNING_NUMERIC_STRING_2, "Treating %" PRIu32 "-byte string as a number\n", length
);
uint32_t r = 0;
for (uint32_t i = length < 4 ? 0 : length - 4; i < length; i++) {
r <<= 8;
r |= s[i];
r |= static_cast<uint8_t>(s[i]);
}
return r;

View File

@@ -516,13 +516,22 @@ void Expression::makeCheckRST() {
// Checks that an RPN expression's value fits within N bits (signed or unsigned)
// Does nothing unless `isKnown()` holds; otherwise forwards `value()` and `n` to the free
// function `::checkNBit`, labelling the checked quantity "Expression" in any warning.
void Expression::checkNBit(uint8_t n) const {
if (isKnown())
::checkNBit(value(), n, "Expression");
}
// Checks that `v` fits within `n` bits, whether read as signed or unsigned.
// `name` is the label substituted for `%s` in the warning text.
// Returns true when `v` is within [-(1 << (n - 1)), (1 << n) - 1]; otherwise emits one
// truncation warning and returns false.
// NOTE(review): the `if (isKnown()) {` statement and the `val`-based checks that follow it
// refer to `isKnown()`/`value()` and leave a brace unmatched in this free function; they look
// like the pre-change body of a diff shown together with the `v`-based replacement.
bool checkNBit(int32_t v, uint8_t n, char const *name) {
assume(n != 0); // That doesn't make sense
assume(n < CHAR_BIT * sizeof(int)); // Otherwise `1 << n` is UB
if (isKnown()) {
if (int32_t val = value(); val < -(1 << n) || val >= 1 << n)
warning(WARNING_TRUNCATION_1, "Expression must be %u-bit\n", n);
else if (val < -(1 << (n - 1)))
warning(WARNING_TRUNCATION_2, "Expression must be %u-bit\n", n);
// Outside [-(1 << n), (1 << n) - 1]: reported under WARNING_TRUNCATION_1
if (v < -(1 << n) || v >= 1 << n) {
warning(WARNING_TRUNCATION_1, "%s must be %u-bit\n", name, n);
return false;
}
// Negative, but below the smallest n-bit signed value: reported under WARNING_TRUNCATION_2
if (v < -(1 << (n - 1))) {
warning(WARNING_TRUNCATION_2, "%s must be %u-bit\n", name, n);
return false;
}
return true;
}

View File

@@ -671,34 +671,44 @@ void sect_AbsByte(uint8_t b) {
writebyte(b);
}
// Writes every element of `s` as one byte.
// Returns without writing anything if `checkcodesection()` fails or if `reserveSpace()` cannot
// reserve `s.size()` bytes.
// NOTE(review): two signatures and two write loops (uint8_t and int32_t) appear below; these
// look like the before/after lines of a diff shown together.
void sect_AbsByteString(std::vector<uint8_t> const &s) {
void sect_AbsByteString(std::vector<int32_t> const &s) {
if (!checkcodesection())
return;
if (!reserveSpace(s.size()))
return;
for (uint8_t v : s)
writebyte(v);
// Range-check pass: stops at the first element that fails the 8-bit check, so at most one
// warning is emitted for the whole string
for (int32_t v : s) {
if (!checkNBit(v, 8, "All character units"))
break;
}
// Every element is still written, converted to uint8_t, even if the check above failed
for (int32_t v : s)
writebyte(static_cast<uint8_t>(v));
}
// Writes every element of `s` as one word via `writeword`.
// Returns without writing anything if `checkcodesection()` fails or if `reserveSpace()` cannot
// reserve `s.size() * 2` bytes.
// NOTE(review): two signatures and two write loops (uint8_t and int32_t) appear below; these
// look like the before/after lines of a diff shown together.
void sect_AbsWordString(std::vector<uint8_t> const &s) {
void sect_AbsWordString(std::vector<int32_t> const &s) {
if (!checkcodesection())
return;
if (!reserveSpace(s.size() * 2))
return;
for (uint8_t v : s)
writeword(v);
// Range-check pass: stops at the first element that fails the 16-bit check, so at most one
// warning is emitted for the whole string
for (int32_t v : s) {
if (!checkNBit(v, 16, "All character units"))
break;
}
// Every element is still written, converted to uint16_t, even if the check above failed
for (int32_t v : s)
writeword(static_cast<uint16_t>(v));
}
// Writes every element of `s` as one long via `writelong`.
// Returns without writing anything if `checkcodesection()` fails or if `reserveSpace()` cannot
// reserve `s.size() * 4` bytes. No range check is performed on the elements here.
// NOTE(review): two signatures and two write loops (uint8_t and int32_t) appear below; these
// look like the before/after lines of a diff shown together.
void sect_AbsLongString(std::vector<uint8_t> const &s) {
void sect_AbsLongString(std::vector<int32_t> const &s) {
if (!checkcodesection())
return;
if (!reserveSpace(s.size() * 4))
return;
for (uint8_t v : s)
writelong(v);
// Each element is converted to uint32_t before being written
for (int32_t v : s)
writelong(static_cast<uint32_t>(v));
}
// Skip this many bytes