Implement multi-value charmaps (#1429)

This commit is contained in:
Sylvie
2024-08-04 17:32:08 -04:00
committed by GitHub
parent 436580a649
commit 9a5b3f0902
16 changed files with 181 additions and 89 deletions

View File

@@ -14,9 +14,9 @@ void charmap_New(std::string const &name, std::string const *baseName);
void charmap_Set(std::string const &name);
void charmap_Push();
void charmap_Pop();
void charmap_Add(std::string const &mapping, uint8_t value);
void charmap_Add(std::string const &mapping, std::vector<int32_t> &&value);
bool charmap_HasChar(std::string const &input);
std::vector<uint8_t> charmap_Convert(std::string const &input);
size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output);
std::vector<int32_t> charmap_Convert(std::string const &input);
size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output);
#endif // RGBDS_ASM_CHARMAP_HPP

View File

@@ -63,4 +63,6 @@ private:
uint8_t *reserveSpace(uint32_t size, uint32_t patchSize);
};
// Checks that `v` fits within `n` bits (signed or unsigned), i.e. that it lies between
// -(1 << (n - 1)) and (1 << n) - 1 inclusive; otherwise, emits a truncation warning
// of the form "<name> must be <n>-bit".
// @return `true` if `v` fits, `false` if a warning was emitted
bool checkNBit(int32_t v, uint8_t n, char const *name);
#endif // RGBDS_ASM_RPN_HPP

View File

@@ -84,9 +84,9 @@ void sect_EndUnion();
void sect_CheckUnionClosed();
void sect_AbsByte(uint8_t b);
void sect_AbsByteString(std::vector<uint8_t> const &s);
void sect_AbsWordString(std::vector<uint8_t> const &s);
void sect_AbsLongString(std::vector<uint8_t> const &s);
void sect_AbsByteString(std::vector<int32_t> const &s);
void sect_AbsWordString(std::vector<int32_t> const &s);
void sect_AbsLongString(std::vector<int32_t> const &s);
void sect_Skip(uint32_t skip, bool ds);
void sect_RelByte(Expression &expr, uint32_t pcShift);
void sect_RelBytes(uint32_t n, std::vector<Expression> &exprs);

View File

@@ -12,6 +12,6 @@ char const *printChar(int c);
/*
* @return The number of bytes read, or 0 if invalid data was found
*/
size_t readUTF8Char(std::vector<uint8_t> *dest, char const *src);
size_t readUTF8Char(std::vector<int32_t> *dest, char const *src);
#endif // RGBDS_UTIL_HPP

View File

@@ -474,17 +474,22 @@ with its corresponding argument in
When writing text strings that are meant to be displayed on the Game Boy, the character encoding in the ROM may need to be different than the source file encoding.
For example, the tiles used for uppercase letters may be placed starting at tile index 128, which differs from ASCII starting at 65.
.Pp
Character maps allow mapping strings to arbitrary 8-bit values:
Character maps allow mapping strings to arbitrary sequences of numbers:
.Bd -literal -offset indent
CHARMAP "<LF>", 10
CHARMAP "&iacute", 20
CHARMAP "A", 128
CHARMAP "A", 42
CHARMAP ":)", 39
CHARMAP "<br>", 13, 10
CHARMAP "&euro;", $20ac
.Ed
.Pp
This would result in
.Ql db \(dqAmen<LF>\(dq
.Ql db \(dqAmen :)<br>\(dq
being equivalent to
.Ql db 128, 109, 101, 110, 10 .
.Ql db 42, 109, 101, 110, 32, 39, 13, 10 ,
and
.Ql dw \(dq25&euro;\(dq
being equivalent to
.Ql dw 50, 53, $20ac .
.Pp
Any characters in a string without defined mappings will be copied directly, using the source file's encoding of characters to bytes.
.Pp

View File

@@ -8,6 +8,7 @@
#include <string.h>
#include <unordered_map>
#include "helpers.hpp"
#include "util.hpp"
#include "asm/warning.hpp"
@@ -16,10 +17,11 @@
// Essentially a tree, where each node stores a single character's worth of info:
// whether there exists a mapping that ends at the current character,
struct CharmapNode {
bool isTerminal; // Whether there exists a mapping that ends here
uint8_t value; // If the above is true, its corresponding value
std::vector<int32_t> value; // The mapped value, if there exists a mapping that ends here
// This MUST be indexes and not pointers, because pointers get invalidated by reallocation!
size_t next[256]; // Indexes of where to go next, 0 = nowhere
size_t next[256]; // Indexes of where to go next, 0 = nowhere
bool isTerminal() const { return !value.empty(); }
};
struct Charmap {
@@ -84,7 +86,7 @@ void charmap_Pop() {
charmapStack.pop();
}
void charmap_Add(std::string const &mapping, uint8_t value) {
void charmap_Add(std::string const &mapping, std::vector<int32_t> &&value) {
Charmap &charmap = *currentCharmap;
size_t nodeIdx = 0;
@@ -106,11 +108,10 @@ void charmap_Add(std::string const &mapping, uint8_t value) {
CharmapNode &node = charmap.nodes[nodeIdx];
if (node.isTerminal)
if (node.isTerminal())
warning(WARNING_CHARMAP_REDEF, "Overriding charmap mapping\n");
node.isTerminal = true;
node.value = value;
std::swap(node.value, value);
}
bool charmap_HasChar(std::string const &input) {
@@ -124,17 +125,17 @@ bool charmap_HasChar(std::string const &input) {
return false;
}
return charmap.nodes[nodeIdx].isTerminal;
return charmap.nodes[nodeIdx].isTerminal();
}
std::vector<uint8_t> charmap_Convert(std::string const &input) {
std::vector<uint8_t> output;
std::vector<int32_t> charmap_Convert(std::string const &input) {
std::vector<int32_t> output;
for (std::string_view inputView = input; charmap_ConvertNext(inputView, &output);)
;
return output;
}
size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output) {
size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output) {
// The goal is to match the longest mapping possible.
// For that, advance through the trie with each character read.
// If that would lead to a dead end, rewind characters until the last match, and output.
@@ -152,7 +153,7 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output
inputIdx++; // Consume that char
if (charmap.nodes[nodeIdx].isTerminal) {
if (charmap.nodes[nodeIdx].isTerminal()) {
matchIdx = nodeIdx; // This node matches, register it
rewindDistance = 0; // If no longer match is found, rewind here
} else {
@@ -166,11 +167,12 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output
size_t matchLen = 0;
if (matchIdx) { // A match was found, use it
std::vector<int32_t> const &value = charmap.nodes[matchIdx].value;
if (output)
output->push_back(charmap.nodes[matchIdx].value);
matchLen = 1;
output->insert(output->end(), RANGE(value));
matchLen = value.size();
} else if (inputIdx < input.length()) { // No match found, but there is some input left
int firstChar = input[inputIdx];
// This will write the codepoint's value to `output`, little-endian

View File

@@ -70,7 +70,7 @@
yy::parser::symbol_type yylex(); // Provided by lexer.cpp
static uint32_t str2int2(std::vector<uint8_t> const &s);
static uint32_t str2int2(std::vector<int32_t> const &s);
static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName);
static size_t strlenUTF8(std::string const &str);
static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len);
@@ -105,7 +105,6 @@
%type <Expression> relocexpr_no_str
%type <int32_t> const
%type <int32_t> const_no_str
%type <int32_t> const_8bit
%type <int32_t> uconst
%type <int32_t> rs_uconst
%type <int32_t> shift_const
@@ -264,6 +263,7 @@
%type <std::vector<Expression>> ds_args
%type <std::vector<std::string>> purge_args
%type <std::vector<int32_t>> charmap_args
%type <ForArgs> for_args
%token Z80_ADC "adc" Z80_ADD "add" Z80_AND "and"
@@ -1083,8 +1083,18 @@ incbin:
;
charmap:
POP_CHARMAP string COMMA const_8bit {
charmap_Add($2, (uint8_t)$4);
POP_CHARMAP string COMMA charmap_args trailing_comma {
charmap_Add($2, std::move($4));
}
;
// One or more comma-separated constants, collected in source order into a single vector
charmap_args:
const {
// First value: append it to the (initially empty) result vector
$$.push_back(std::move($1));
}
| charmap_args COMMA const {
// Subsequent value: take over the vector built so far, then append the new constant
$$ = std::move($1);
$$.push_back(std::move($3));
}
;
@@ -1170,7 +1180,7 @@ constlist_8bit_entry:
sect_RelByte($1, 0);
}
| string {
std::vector<uint8_t> output = charmap_Convert($1);
std::vector<int32_t> output = charmap_Convert($1);
sect_AbsByteString(output);
}
;
@@ -1185,7 +1195,7 @@ constlist_16bit_entry:
sect_RelWord($1, 0);
}
| string {
std::vector<uint8_t> output = charmap_Convert($1);
std::vector<int32_t> output = charmap_Convert($1);
sect_AbsWordString(output);
}
;
@@ -1200,7 +1210,7 @@ constlist_32bit_entry:
sect_RelLong($1, 0);
}
| string {
std::vector<uint8_t> output = charmap_Convert($1);
std::vector<int32_t> output = charmap_Convert($1);
sect_AbsLongString(output);
}
;
@@ -1250,7 +1260,7 @@ relocexpr:
$$ = std::move($1);
}
| string {
std::vector<uint8_t> output = charmap_Convert($1);
std::vector<int32_t> output = charmap_Convert($1);
$$.makeNumber(str2int2(output));
}
;
@@ -1465,12 +1475,6 @@ const_no_str:
}
;
const_8bit:
reloc_8bit {
$$ = $1.getConstVal();
}
;
opt_q_arg:
%empty {
$$ = fix_Precision();
@@ -2374,26 +2378,37 @@ void yy::parser::error(std::string const &str) {
::error("%s\n", str.c_str());
}
static uint32_t str2int2(std::vector<uint8_t> const &s) {
static uint32_t str2int2(std::vector<int32_t> const &s) {
uint32_t length = s.size();
if (length == 1) {
// The string is a single character with a single value,
// which can be used directly as a number.
return (uint32_t)s[0];
}
for (int32_t v : s) {
if (!checkNBit(v, 8, "All character units"))
break;
}
if (length > 4)
warning(
WARNING_NUMERIC_STRING_1,
"Treating string as a number ignores first %" PRIu32 " character%s\n",
"Treating string as a number ignores first %" PRIu32 " byte%s\n",
length - 4,
length == 5 ? "" : "s"
);
else if (length > 1)
warning(
WARNING_NUMERIC_STRING_2, "Treating %" PRIu32 "-character string as a number\n", length
WARNING_NUMERIC_STRING_2, "Treating %" PRIu32 "-byte string as a number\n", length
);
uint32_t r = 0;
for (uint32_t i = length < 4 ? 0 : length - 4; i < length; i++) {
r <<= 8;
r |= s[i];
r |= static_cast<uint8_t>(s[i]);
}
return r;

View File

@@ -516,13 +516,22 @@ void Expression::makeCheckRST() {
// Range-checks this RPN expression's value against N bits (signed or unsigned).
// The check is delegated to the free function `::checkNBit`, labelled as "Expression".
void Expression::checkNBit(uint8_t n) const {
	// Only a known value is passed on to be checked
	if (!isKnown())
		return;

	::checkNBit(value(), n, "Expression");
}
bool checkNBit(int32_t v, uint8_t n, char const *name) {
assume(n != 0); // That doesn't make sense
assume(n < CHAR_BIT * sizeof(int)); // Otherwise `1 << n` is UB
if (isKnown()) {
if (int32_t val = value(); val < -(1 << n) || val >= 1 << n)
warning(WARNING_TRUNCATION_1, "Expression must be %u-bit\n", n);
else if (val < -(1 << (n - 1)))
warning(WARNING_TRUNCATION_2, "Expression must be %u-bit\n", n);
if (v < -(1 << n) || v >= 1 << n) {
warning(WARNING_TRUNCATION_1, "%s must be %u-bit\n", name, n);
return false;
}
if (v < -(1 << (n - 1))) {
warning(WARNING_TRUNCATION_2, "%s must be %u-bit\n", name, n);
return false;
}
return true;
}

View File

@@ -671,34 +671,44 @@ void sect_AbsByte(uint8_t b) {
writebyte(b);
}
void sect_AbsByteString(std::vector<uint8_t> const &s) {
void sect_AbsByteString(std::vector<int32_t> const &s) {
if (!checkcodesection())
return;
if (!reserveSpace(s.size()))
return;
for (uint8_t v : s)
writebyte(v);
for (int32_t v : s) {
if (!checkNBit(v, 8, "All character units"))
break;
}
for (int32_t v : s)
writebyte(static_cast<uint8_t>(v));
}
void sect_AbsWordString(std::vector<uint8_t> const &s) {
void sect_AbsWordString(std::vector<int32_t> const &s) {
if (!checkcodesection())
return;
if (!reserveSpace(s.size() * 2))
return;
for (uint8_t v : s)
writeword(v);
for (int32_t v : s) {
if (!checkNBit(v, 16, "All character units"))
break;
}
for (int32_t v : s)
writeword(static_cast<uint16_t>(v));
}
void sect_AbsLongString(std::vector<uint8_t> const &s) {
void sect_AbsLongString(std::vector<int32_t> const &s) {
if (!checkcodesection())
return;
if (!reserveSpace(s.size() * 4))
return;
for (uint8_t v : s)
writelong(v);
for (int32_t v : s)
writelong(static_cast<uint32_t>(v));
}
// Skip this many bytes

View File

@@ -53,7 +53,7 @@ char const *printChar(int c) {
return buf;
}
size_t readUTF8Char(std::vector<uint8_t> *dest, char const *src) {
size_t readUTF8Char(std::vector<int32_t> *dest, char const *src) {
uint32_t state = 0, codepoint;
size_t i = 0;

View File

@@ -1,6 +1,6 @@
warning: db-dw-dl-string.asm(15): [-Wnumeric-string]
Treating 4-character string as a number
Treating 4-byte string as a number
warning: db-dw-dl-string.asm(16): [-Wnumeric-string]
Treating 4-character string as a number
Treating 4-byte string as a number
warning: db-dw-dl-string.asm(17): [-Wnumeric-string]
Treating 4-character string as a number
Treating 4-byte string as a number

View File

@@ -1,15 +1,15 @@
warning: multiple-charmaps.asm(46) -> multiple-charmaps.asm::print_mapped(34): [-Wnumeric-string]
Treating 2-character string as a number
Treating 2-byte string as a number
warning: multiple-charmaps.asm(54) -> multiple-charmaps.asm::print_mapped(34): [-Wnumeric-string]
Treating 2-character string as a number
Treating 2-byte string as a number
warning: multiple-charmaps.asm(73) -> multiple-charmaps.asm::print_mapped(34): [-Wnumeric-string]
Treating 2-character string as a number
Treating 2-byte string as a number
warning: multiple-charmaps.asm(95) -> multiple-charmaps.asm::print_mapped(34): [-Wnumeric-string]
Treating 2-character string as a number
Treating 2-byte string as a number
warning: multiple-charmaps.asm(96) -> multiple-charmaps.asm::print_mapped(34): [-Wnumeric-string]
Treating 2-character string as a number
Treating 2-byte string as a number
warning: multiple-charmaps.asm(104) -> multiple-charmaps.asm::print_mapped(34): [-Wnumeric-string]
Treating 2-character string as a number
Treating 2-byte string as a number
error: multiple-charmaps.asm(106) -> multiple-charmaps.asm::new_(9):
Charmap 'map1' already exists
error: multiple-charmaps.asm(108) -> multiple-charmaps.asm::set_(15):

View File

@@ -0,0 +1,35 @@
section "test", rom0[0]
charmap "a", $61
charmap "b", $62
charmap "c", $63
charmap "啊", $04, $c3
charmap "de", $6564
charmap "fghi", $66, $67, $6968
db "abc啊" ; db $61, $62, $63, $04, $C3
db "abcde" ; db $61, $62, $63, $64 (truncated)
dw "abc啊" ; dw $61, $62, $63, $04, $C3
dw "abcde" ; dw $61, $62, $63, $6564
dw "abcdefghi" ; dw $61, $62, $63, $6564, $66, $67, $6968
dl 0 ; separator
charmap "A", $01234567
charmap "B", $fedcba98
assert "A" == $01234567
assert "B" == $fedcba98
db "AB" ; db $01234567, $fedcba98 (truncated to $67, $98)
dl "AB" ; dl $01234567, $fedcba98
charmap "C", $01, $23, $45, $67
charmap "D", $fe, $dc, $ba, $98
assert "C" == $01234567
assert "D" == $fedcba98
db "CD" ; db $01, $23, $45, $67, $fe, $dc, $ba, $98
dw "CD" ; dw $01, $23, $45, $67, $fe, $dc, $ba, $98
charmap "E", $01, $2345, $6789ab, $cdef
assert "E" == $0145abef
db "E" ; db $01, $2345, $6789ab, $cdef (truncated to $01, $45, $ab, $ef)
dl "E" ; dl $01, $2345, $6789ab, $cdef

View File

@@ -0,0 +1,14 @@
warning: multivalue-charmap.asm(11): [-Wtruncation]
All character units must be 8-bit
warning: multivalue-charmap.asm(22): [-Wtruncation]
All character units must be 8-bit
warning: multivalue-charmap.asm(27): [-Wnumeric-string]
Treating 4-byte string as a number
warning: multivalue-charmap.asm(28): [-Wnumeric-string]
Treating 4-byte string as a number
warning: multivalue-charmap.asm(33): [-Wtruncation]
All character units must be 8-bit
warning: multivalue-charmap.asm(33): [-Wnumeric-string]
Treating 4-byte string as a number
warning: multivalue-charmap.asm(34): [-Wtruncation]
All character units must be 8-bit

Binary file not shown.

View File

@@ -1,38 +1,38 @@
warning: warn-numeric-string.asm(20) -> warn-numeric-string.asm::try(12): [-Wnumeric-string]
Treating string as a number ignores first 1 character
Treating string as a number ignores first 1 byte
warning: warn-numeric-string.asm(20) -> warn-numeric-string.asm::try(13): [-Wnumeric-string]
Treating string as a number ignores first 1 character
Treating string as a number ignores first 1 byte
warning: warn-numeric-string.asm(20) -> warn-numeric-string.asm::try(13): [-Wnumeric-string]
Treating string as a number ignores first 2 characters
Treating string as a number ignores first 2 bytes
warning: warn-numeric-string.asm(22) -> warn-numeric-string.asm::try(12): [-Wnumeric-string]
Treating string as a number ignores first 1 character
Treating string as a number ignores first 1 byte
warning: warn-numeric-string.asm(22) -> warn-numeric-string.asm::try(13): [-Wnumeric-string]
Treating string as a number ignores first 1 character
Treating string as a number ignores first 1 byte
warning: warn-numeric-string.asm(22) -> warn-numeric-string.asm::try(13): [-Wnumeric-string]
Treating string as a number ignores first 2 characters
Treating string as a number ignores first 2 bytes
warning: warn-numeric-string.asm(23) -> warn-numeric-string.asm::try(12): [-Wnumeric-string]
Treating string as a number ignores first 1 character
Treating string as a number ignores first 1 byte
warning: warn-numeric-string.asm(23) -> warn-numeric-string.asm::try(13): [-Wnumeric-string]
Treating string as a number ignores first 1 character
Treating string as a number ignores first 1 byte
warning: warn-numeric-string.asm(23) -> warn-numeric-string.asm::try(13): [-Wnumeric-string]
Treating string as a number ignores first 2 characters
Treating string as a number ignores first 2 bytes
warning: warn-numeric-string.asm(23) -> warn-numeric-string.asm::try(15): [-Wnumeric-string]
Treating 2-character string as a number
Treating 2-byte string as a number
warning: warn-numeric-string.asm(23) -> warn-numeric-string.asm::try(16): [-Wnumeric-string]
Treating 3-character string as a number
Treating 3-byte string as a number
error: warn-numeric-string.asm(24) -> warn-numeric-string.asm::try(12): [-Werror=numeric-string]
Treating string as a number ignores first 1 character
Treating string as a number ignores first 1 byte
error: warn-numeric-string.asm(24) -> warn-numeric-string.asm::try(13): [-Werror=numeric-string]
Treating string as a number ignores first 1 character
Treating string as a number ignores first 1 byte
error: warn-numeric-string.asm(24) -> warn-numeric-string.asm::try(13): [-Werror=numeric-string]
Treating string as a number ignores first 2 characters
Treating string as a number ignores first 2 bytes
error: warn-numeric-string.asm(25) -> warn-numeric-string.asm::try(12): [-Werror=numeric-string]
Treating string as a number ignores first 1 character
Treating string as a number ignores first 1 byte
error: warn-numeric-string.asm(25) -> warn-numeric-string.asm::try(13): [-Werror=numeric-string]
Treating string as a number ignores first 1 character
Treating string as a number ignores first 1 byte
error: warn-numeric-string.asm(25) -> warn-numeric-string.asm::try(13): [-Werror=numeric-string]
Treating string as a number ignores first 2 characters
Treating string as a number ignores first 2 bytes
error: warn-numeric-string.asm(25) -> warn-numeric-string.asm::try(15): [-Werror=numeric-string]
Treating 2-character string as a number
Treating 2-byte string as a number
error: warn-numeric-string.asm(25) -> warn-numeric-string.asm::try(16): [-Werror=numeric-string]
Treating 3-character string as a number
Treating 3-byte string as a number