From 9a5b3f09027150c71b2a2f3f622f8b285a9b2c56 Mon Sep 17 00:00:00 2001 From: Sylvie <35663410+Rangi42@users.noreply.github.com> Date: Sun, 4 Aug 2024 17:32:08 -0400 Subject: [PATCH] Implement multi-value charmaps (#1429) --- include/asm/charmap.hpp | 6 ++-- include/asm/rpn.hpp | 2 ++ include/asm/section.hpp | 6 ++-- include/util.hpp | 2 +- man/rgbasm.5 | 17 ++++++---- src/asm/charmap.cpp | 32 +++++++++-------- src/asm/parser.y | 51 ++++++++++++++++++---------- src/asm/rpn.cpp | 19 ++++++++--- src/asm/section.cpp | 28 ++++++++++----- src/util.cpp | 2 +- test/asm/db-dw-dl-string.err | 6 ++-- test/asm/multiple-charmaps.err | 12 +++---- test/asm/multivalue-charmap.asm | 35 +++++++++++++++++++ test/asm/multivalue-charmap.err | 14 ++++++++ test/asm/multivalue-charmap.out.bin | Bin 0 -> 99 bytes test/asm/warn-numeric-string.err | 38 ++++++++++----------- 16 files changed, 181 insertions(+), 89 deletions(-) create mode 100644 test/asm/multivalue-charmap.asm create mode 100644 test/asm/multivalue-charmap.err create mode 100644 test/asm/multivalue-charmap.out.bin diff --git a/include/asm/charmap.hpp b/include/asm/charmap.hpp index 767ea00a..926a8fb6 100644 --- a/include/asm/charmap.hpp +++ b/include/asm/charmap.hpp @@ -14,9 +14,9 @@ void charmap_New(std::string const &name, std::string const *baseName); void charmap_Set(std::string const &name); void charmap_Push(); void charmap_Pop(); -void charmap_Add(std::string const &mapping, uint8_t value); +void charmap_Add(std::string const &mapping, std::vector &&value); bool charmap_HasChar(std::string const &input); -std::vector charmap_Convert(std::string const &input); -size_t charmap_ConvertNext(std::string_view &input, std::vector *output); +std::vector charmap_Convert(std::string const &input); +size_t charmap_ConvertNext(std::string_view &input, std::vector *output); #endif // RGBDS_ASM_CHARMAP_HPP diff --git a/include/asm/rpn.hpp b/include/asm/rpn.hpp index e457dce2..2955c2bc 100644 --- a/include/asm/rpn.hpp +++ 
b/include/asm/rpn.hpp @@ -63,4 +63,6 @@ private: uint8_t *reserveSpace(uint32_t size, uint32_t patchSize); }; +bool checkNBit(int32_t v, uint8_t n, char const *name); + #endif // RGBDS_ASM_RPN_HPP diff --git a/include/asm/section.hpp b/include/asm/section.hpp index 0f1c5368..66eea114 100644 --- a/include/asm/section.hpp +++ b/include/asm/section.hpp @@ -84,9 +84,9 @@ void sect_EndUnion(); void sect_CheckUnionClosed(); void sect_AbsByte(uint8_t b); -void sect_AbsByteString(std::vector const &s); -void sect_AbsWordString(std::vector const &s); -void sect_AbsLongString(std::vector const &s); +void sect_AbsByteString(std::vector const &s); +void sect_AbsWordString(std::vector const &s); +void sect_AbsLongString(std::vector const &s); void sect_Skip(uint32_t skip, bool ds); void sect_RelByte(Expression &expr, uint32_t pcShift); void sect_RelBytes(uint32_t n, std::vector &exprs); diff --git a/include/util.hpp b/include/util.hpp index 80c76434..b08edc53 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -12,6 +12,6 @@ char const *printChar(int c); /* * @return The number of bytes read, or 0 if invalid data was found */ -size_t readUTF8Char(std::vector *dest, char const *src); +size_t readUTF8Char(std::vector *dest, char const *src); #endif // RGBDS_UTIL_HPP diff --git a/man/rgbasm.5 b/man/rgbasm.5 index 23b6d053..d9afb6a9 100644 --- a/man/rgbasm.5 +++ b/man/rgbasm.5 @@ -474,17 +474,22 @@ with its corresponding argument in When writing text strings that are meant to be displayed on the Game Boy, the character encoding in the ROM may need to be different than the source file encoding. For example, the tiles used for uppercase letters may be placed starting at tile index 128, which differs from ASCII starting at 65. 
.Pp -Character maps allow mapping strings to arbitrary 8-bit values: +Character maps allow mapping strings to arbitrary sequences of numbers: .Bd -literal -offset indent -CHARMAP "<LF>", 10 -CHARMAP "í", 20 -CHARMAP "A", 128 +CHARMAP "A", 42 +CHARMAP ":)", 39 +CHARMAP "<br>", 13, 10 +CHARMAP "€", $20ac .Ed .Pp This would result in -.Ql db \(dqAmen<LF>\(dq +.Ql db \(dqAmen :)<br>\(dq being equivalent to -.Ql db 128, 109, 101, 110, 10 . +.Ql db 42, 109, 101, 110, 32, 39, 13, 10 , +and +.Ql dw \(dq25€\(dq +being equivalent to +.Ql dw 50, 53, $20ac . .Pp Any characters in a string without defined mappings will be copied directly, using the source file's encoding of characters to bytes. .Pp diff --git a/src/asm/charmap.cpp b/src/asm/charmap.cpp index adec7323..978820af 100644 --- a/src/asm/charmap.cpp +++ b/src/asm/charmap.cpp @@ -8,6 +8,7 @@ #include #include +#include "helpers.hpp" #include "util.hpp" #include "asm/warning.hpp" @@ -16,10 +17,11 @@ // Essentially a tree, where each nodes stores a single character's worth of info: // whether there exists a mapping that ends at the current character, struct CharmapNode { - bool isTerminal; // Whether there exists a mapping that ends here - uint8_t value; // If the above is true, its corresponding value + std::vector value; // The mapped value, if there exists a mapping that ends here // This MUST be indexes and not pointers, because pointers get invalidated by reallocation! 
- size_t next[256]; // Indexes of where to go next, 0 = nowhere + size_t next[256]; // Indexes of where to go next, 0 = nowhere + + bool isTerminal() const { return !value.empty(); } }; struct Charmap { @@ -84,7 +86,7 @@ void charmap_Pop() { charmapStack.pop(); } -void charmap_Add(std::string const &mapping, uint8_t value) { +void charmap_Add(std::string const &mapping, std::vector &&value) { Charmap &charmap = *currentCharmap; size_t nodeIdx = 0; @@ -106,11 +108,10 @@ void charmap_Add(std::string const &mapping, uint8_t value) { CharmapNode &node = charmap.nodes[nodeIdx]; - if (node.isTerminal) + if (node.isTerminal()) warning(WARNING_CHARMAP_REDEF, "Overriding charmap mapping\n"); - node.isTerminal = true; - node.value = value; + std::swap(node.value, value); } bool charmap_HasChar(std::string const &input) { @@ -124,17 +125,17 @@ bool charmap_HasChar(std::string const &input) { return false; } - return charmap.nodes[nodeIdx].isTerminal; + return charmap.nodes[nodeIdx].isTerminal(); } -std::vector charmap_Convert(std::string const &input) { - std::vector output; +std::vector charmap_Convert(std::string const &input) { + std::vector output; for (std::string_view inputView = input; charmap_ConvertNext(inputView, &output);) ; return output; } -size_t charmap_ConvertNext(std::string_view &input, std::vector *output) { +size_t charmap_ConvertNext(std::string_view &input, std::vector *output) { // The goal is to match the longest mapping possible. // For that, advance through the trie with each character read. // If that would lead to a dead end, rewind characters until the last match, and output. 
@@ -152,7 +153,7 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector *output inputIdx++; // Consume that char - if (charmap.nodes[nodeIdx].isTerminal) { + if (charmap.nodes[nodeIdx].isTerminal()) { matchIdx = nodeIdx; // This node matches, register it rewindDistance = 0; // If no longer match is found, rewind here } else { @@ -166,11 +167,12 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector *output size_t matchLen = 0; if (matchIdx) { // A match was found, use it + std::vector const &value = charmap.nodes[matchIdx].value; + if (output) - output->push_back(charmap.nodes[matchIdx].value); - - matchLen = 1; + output->insert(output->end(), RANGE(value)); + matchLen = value.size(); } else if (inputIdx < input.length()) { // No match found, but there is some input left int firstChar = input[inputIdx]; // This will write the codepoint's value to `output`, little-endian diff --git a/src/asm/parser.y b/src/asm/parser.y index c1b66f6a..5087ed2d 100644 --- a/src/asm/parser.y +++ b/src/asm/parser.y @@ -70,7 +70,7 @@ yy::parser::symbol_type yylex(); // Provided by lexer.cpp - static uint32_t str2int2(std::vector const &s); + static uint32_t str2int2(std::vector const &s); static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName); static size_t strlenUTF8(std::string const &str); static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len); @@ -105,7 +105,6 @@ %type relocexpr_no_str %type const %type const_no_str -%type const_8bit %type uconst %type rs_uconst %type shift_const @@ -264,6 +263,7 @@ %type > ds_args %type > purge_args +%type > charmap_args %type for_args %token Z80_ADC "adc" Z80_ADD "add" Z80_AND "and" @@ -1083,8 +1083,18 @@ incbin: ; charmap: - POP_CHARMAP string COMMA const_8bit { - charmap_Add($2, (uint8_t)$4); + POP_CHARMAP string COMMA charmap_args trailing_comma { + charmap_Add($2, std::move($4)); + } +; + +charmap_args: + const { + $$.push_back(std::move($1)); + } + | charmap_args COMMA const 
{ + $$ = std::move($1); + $$.push_back(std::move($3)); } ; @@ -1170,7 +1180,7 @@ constlist_8bit_entry: sect_RelByte($1, 0); } | string { - std::vector output = charmap_Convert($1); + std::vector output = charmap_Convert($1); sect_AbsByteString(output); } ; @@ -1185,7 +1195,7 @@ constlist_16bit_entry: sect_RelWord($1, 0); } | string { - std::vector output = charmap_Convert($1); + std::vector output = charmap_Convert($1); sect_AbsWordString(output); } ; @@ -1200,7 +1210,7 @@ constlist_32bit_entry: sect_RelLong($1, 0); } | string { - std::vector output = charmap_Convert($1); + std::vector output = charmap_Convert($1); sect_AbsLongString(output); } ; @@ -1250,7 +1260,7 @@ relocexpr: $$ = std::move($1); } | string { - std::vector output = charmap_Convert($1); + std::vector output = charmap_Convert($1); $$.makeNumber(str2int2(output)); } ; @@ -1465,12 +1475,6 @@ const_no_str: } ; -const_8bit: - reloc_8bit { - $$ = $1.getConstVal(); - } -; - opt_q_arg: %empty { $$ = fix_Precision(); @@ -2374,26 +2378,37 @@ void yy::parser::error(std::string const &str) { ::error("%s\n", str.c_str()); } -static uint32_t str2int2(std::vector const &s) { +static uint32_t str2int2(std::vector const &s) { uint32_t length = s.size(); + if (length == 1) { + // The string is a single character with a single value, + // which can be used directly as a number. + return (uint32_t)s[0]; + } + + for (int32_t v : s) { + if (!checkNBit(v, 8, "All character units")) + break; + } + if (length > 4) warning( WARNING_NUMERIC_STRING_1, - "Treating string as a number ignores first %" PRIu32 " character%s\n", + "Treating string as a number ignores first %" PRIu32 " byte%s\n", length - 4, length == 5 ? "" : "s" ); else if (length > 1) warning( - WARNING_NUMERIC_STRING_2, "Treating %" PRIu32 "-character string as a number\n", length + WARNING_NUMERIC_STRING_2, "Treating %" PRIu32 "-byte string as a number\n", length ); uint32_t r = 0; for (uint32_t i = length < 4 ? 
0 : length - 4; i < length; i++) { r <<= 8; - r |= s[i]; + r |= static_cast(s[i]); } return r; diff --git a/src/asm/rpn.cpp b/src/asm/rpn.cpp index 34a01b42..1242498e 100644 --- a/src/asm/rpn.cpp +++ b/src/asm/rpn.cpp @@ -516,13 +516,22 @@ void Expression::makeCheckRST() { // Checks that an RPN expression's value fits within N bits (signed or unsigned) void Expression::checkNBit(uint8_t n) const { + if (isKnown()) + ::checkNBit(value(), n, "Expression"); +} + +bool checkNBit(int32_t v, uint8_t n, char const *name) { assume(n != 0); // That doesn't make sense assume(n < CHAR_BIT * sizeof(int)); // Otherwise `1 << n` is UB - if (isKnown()) { - if (int32_t val = value(); val < -(1 << n) || val >= 1 << n) - warning(WARNING_TRUNCATION_1, "Expression must be %u-bit\n", n); - else if (val < -(1 << (n - 1))) - warning(WARNING_TRUNCATION_2, "Expression must be %u-bit\n", n); + if (v < -(1 << n) || v >= 1 << n) { + warning(WARNING_TRUNCATION_1, "%s must be %u-bit\n", name, n); + return false; } + if (v < -(1 << (n - 1))) { + warning(WARNING_TRUNCATION_2, "%s must be %u-bit\n", name, n); + return false; + } + + return true; } diff --git a/src/asm/section.cpp b/src/asm/section.cpp index 4100b02f..3fdd9496 100644 --- a/src/asm/section.cpp +++ b/src/asm/section.cpp @@ -671,34 +671,44 @@ void sect_AbsByte(uint8_t b) { writebyte(b); } -void sect_AbsByteString(std::vector const &s) { +void sect_AbsByteString(std::vector const &s) { if (!checkcodesection()) return; if (!reserveSpace(s.size())) return; - for (uint8_t v : s) - writebyte(v); + for (int32_t v : s) { + if (!checkNBit(v, 8, "All character units")) + break; + } + + for (int32_t v : s) + writebyte(static_cast(v)); } -void sect_AbsWordString(std::vector const &s) { +void sect_AbsWordString(std::vector const &s) { if (!checkcodesection()) return; if (!reserveSpace(s.size() * 2)) return; - for (uint8_t v : s) - writeword(v); + for (int32_t v : s) { + if (!checkNBit(v, 16, "All character units")) + break; + } + + for (int32_t v 
: s) + writeword(static_cast(v)); } -void sect_AbsLongString(std::vector const &s) { +void sect_AbsLongString(std::vector const &s) { if (!checkcodesection()) return; if (!reserveSpace(s.size() * 4)) return; - for (uint8_t v : s) - writelong(v); + for (int32_t v : s) + writelong(static_cast(v)); } // Skip this many bytes diff --git a/src/util.cpp b/src/util.cpp index ca1071e7..29bca3c3 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -53,7 +53,7 @@ char const *printChar(int c) { return buf; } -size_t readUTF8Char(std::vector *dest, char const *src) { +size_t readUTF8Char(std::vector *dest, char const *src) { uint32_t state = 0, codepoint; size_t i = 0; diff --git a/test/asm/db-dw-dl-string.err b/test/asm/db-dw-dl-string.err index e03c4e1e..41c6af9b 100644 --- a/test/asm/db-dw-dl-string.err +++ b/test/asm/db-dw-dl-string.err @@ -1,6 +1,6 @@ warning: db-dw-dl-string.asm(15): [-Wnumeric-string] - Treating 4-character string as a number + Treating 4-byte string as a number warning: db-dw-dl-string.asm(16): [-Wnumeric-string] - Treating 4-character string as a number + Treating 4-byte string as a number warning: db-dw-dl-string.asm(17): [-Wnumeric-string] - Treating 4-character string as a number + Treating 4-byte string as a number diff --git a/test/asm/multiple-charmaps.err b/test/asm/multiple-charmaps.err index 663114c8..c4f79d15 100644 --- a/test/asm/multiple-charmaps.err +++ b/test/asm/multiple-charmaps.err @@ -1,15 +1,15 @@ warning: multiple-charmaps.asm(46) -> multiple-charmaps.asm::print_mapped(34): [-Wnumeric-string] - Treating 2-character string as a number + Treating 2-byte string as a number warning: multiple-charmaps.asm(54) -> multiple-charmaps.asm::print_mapped(34): [-Wnumeric-string] - Treating 2-character string as a number + Treating 2-byte string as a number warning: multiple-charmaps.asm(73) -> multiple-charmaps.asm::print_mapped(34): [-Wnumeric-string] - Treating 2-character string as a number + Treating 2-byte string as a number warning: 
multiple-charmaps.asm(95) -> multiple-charmaps.asm::print_mapped(34): [-Wnumeric-string] - Treating 2-character string as a number + Treating 2-byte string as a number warning: multiple-charmaps.asm(96) -> multiple-charmaps.asm::print_mapped(34): [-Wnumeric-string] - Treating 2-character string as a number + Treating 2-byte string as a number warning: multiple-charmaps.asm(104) -> multiple-charmaps.asm::print_mapped(34): [-Wnumeric-string] - Treating 2-character string as a number + Treating 2-byte string as a number error: multiple-charmaps.asm(106) -> multiple-charmaps.asm::new_(9): Charmap 'map1' already exists error: multiple-charmaps.asm(108) -> multiple-charmaps.asm::set_(15): diff --git a/test/asm/multivalue-charmap.asm b/test/asm/multivalue-charmap.asm new file mode 100644 index 00000000..9253c64f --- /dev/null +++ b/test/asm/multivalue-charmap.asm @@ -0,0 +1,35 @@ +section "test", rom0[0] + +charmap "a", $61 +charmap "b", $62 +charmap "c", $63 +charmap "啊", $04, $c3 +charmap "de", $6564 +charmap "fghi", $66, $67, $6968 + +db "abc啊" ; db $61, $62, $63, $04, $C3 +db "abcde" ; db $61, $62, $63, $64 (truncated) +dw "abc啊" ; dw $61, $62, $63, $04, $C3 +dw "abcde" ; dw $61, $62, $63, $6564 +dw "abcdefghi" ; dw $61, $62, $63, $6564, $66, $67, $6968 + +dl 0 ; separator + +charmap "A", $01234567 +charmap "B", $fedcba98 +assert "A" == $01234567 +assert "B" == $fedcba98 +db "AB" ; db $01234567, $fedcba98 (truncated to $67, $98) +dl "AB" ; dl $01234567, $fedcba98 + +charmap "C", $01, $23, $45, $67 +charmap "D", $fe, $dc, $ba, $98 +assert "C" == $01234567 +assert "D" == $fedcba98 +db "CD" ; db $01, $23, $45, $67, $fe, $dc, $ba, $98 +dw "CD" ; dw $01, $23, $45, $67, $fe, $dc, $ba, $98 + +charmap "E", $01, $2345, $6789ab, $cdef +assert "E" == $0145abef +db "E" ; db $01, $2345, $6789ab, $cdef (truncated to $01, $45, $ab, $ef) +dl "E" ; dl $01, $2345, $6789ab, $cdef diff --git a/test/asm/multivalue-charmap.err b/test/asm/multivalue-charmap.err new file mode 100644 index 
00000000..174ce165 --- /dev/null +++ b/test/asm/multivalue-charmap.err @@ -0,0 +1,14 @@ +warning: multivalue-charmap.asm(11): [-Wtruncation] + All character units must be 8-bit +warning: multivalue-charmap.asm(22): [-Wtruncation] + All character units must be 8-bit +warning: multivalue-charmap.asm(27): [-Wnumeric-string] + Treating 4-byte string as a number +warning: multivalue-charmap.asm(28): [-Wnumeric-string] + Treating 4-byte string as a number +warning: multivalue-charmap.asm(33): [-Wtruncation] + All character units must be 8-bit +warning: multivalue-charmap.asm(33): [-Wnumeric-string] + Treating 4-byte string as a number +warning: multivalue-charmap.asm(34): [-Wtruncation] + All character units must be 8-bit diff --git a/test/asm/multivalue-charmap.out.bin b/test/asm/multivalue-charmap.out.bin new file mode 100644 index 0000000000000000000000000000000000000000..fffa4f7d7b0f312212fdd40d1fb8d61bff99f2b1 GIT binary patch literal 99 zcmXwvOAUZ95CvxwoK(vS=?501;GiDrpx2&pP!Dy`(1h?VGs!$3;?{hK$rFfdG(JoF l&jaJUShH;EB-cL0sm)%q;Lz14ok~Wu3|UWgP;O&9@B)}w9drNy literal 0 HcmV?d00001 diff --git a/test/asm/warn-numeric-string.err b/test/asm/warn-numeric-string.err index 7c0d1079..7baa273a 100644 --- a/test/asm/warn-numeric-string.err +++ b/test/asm/warn-numeric-string.err @@ -1,38 +1,38 @@ warning: warn-numeric-string.asm(20) -> warn-numeric-string.asm::try(12): [-Wnumeric-string] - Treating string as a number ignores first 1 character + Treating string as a number ignores first 1 byte warning: warn-numeric-string.asm(20) -> warn-numeric-string.asm::try(13): [-Wnumeric-string] - Treating string as a number ignores first 1 character + Treating string as a number ignores first 1 byte warning: warn-numeric-string.asm(20) -> warn-numeric-string.asm::try(13): [-Wnumeric-string] - Treating string as a number ignores first 2 characters + Treating string as a number ignores first 2 bytes warning: warn-numeric-string.asm(22) -> warn-numeric-string.asm::try(12): [-Wnumeric-string] - 
Treating string as a number ignores first 1 character + Treating string as a number ignores first 1 byte warning: warn-numeric-string.asm(22) -> warn-numeric-string.asm::try(13): [-Wnumeric-string] - Treating string as a number ignores first 1 character + Treating string as a number ignores first 1 byte warning: warn-numeric-string.asm(22) -> warn-numeric-string.asm::try(13): [-Wnumeric-string] - Treating string as a number ignores first 2 characters + Treating string as a number ignores first 2 bytes warning: warn-numeric-string.asm(23) -> warn-numeric-string.asm::try(12): [-Wnumeric-string] - Treating string as a number ignores first 1 character + Treating string as a number ignores first 1 byte warning: warn-numeric-string.asm(23) -> warn-numeric-string.asm::try(13): [-Wnumeric-string] - Treating string as a number ignores first 1 character + Treating string as a number ignores first 1 byte warning: warn-numeric-string.asm(23) -> warn-numeric-string.asm::try(13): [-Wnumeric-string] - Treating string as a number ignores first 2 characters + Treating string as a number ignores first 2 bytes warning: warn-numeric-string.asm(23) -> warn-numeric-string.asm::try(15): [-Wnumeric-string] - Treating 2-character string as a number + Treating 2-byte string as a number warning: warn-numeric-string.asm(23) -> warn-numeric-string.asm::try(16): [-Wnumeric-string] - Treating 3-character string as a number + Treating 3-byte string as a number error: warn-numeric-string.asm(24) -> warn-numeric-string.asm::try(12): [-Werror=numeric-string] - Treating string as a number ignores first 1 character + Treating string as a number ignores first 1 byte error: warn-numeric-string.asm(24) -> warn-numeric-string.asm::try(13): [-Werror=numeric-string] - Treating string as a number ignores first 1 character + Treating string as a number ignores first 1 byte error: warn-numeric-string.asm(24) -> warn-numeric-string.asm::try(13): [-Werror=numeric-string] - Treating string as a number ignores 
first 2 characters + Treating string as a number ignores first 2 bytes error: warn-numeric-string.asm(25) -> warn-numeric-string.asm::try(12): [-Werror=numeric-string] - Treating string as a number ignores first 1 character + Treating string as a number ignores first 1 byte error: warn-numeric-string.asm(25) -> warn-numeric-string.asm::try(13): [-Werror=numeric-string] - Treating string as a number ignores first 1 character + Treating string as a number ignores first 1 byte error: warn-numeric-string.asm(25) -> warn-numeric-string.asm::try(13): [-Werror=numeric-string] - Treating string as a number ignores first 2 characters + Treating string as a number ignores first 2 bytes error: warn-numeric-string.asm(25) -> warn-numeric-string.asm::try(15): [-Werror=numeric-string] - Treating 2-character string as a number + Treating 2-byte string as a number error: warn-numeric-string.asm(25) -> warn-numeric-string.asm::try(16): [-Werror=numeric-string] - Treating 3-character string as a number + Treating 3-byte string as a number