diff --git a/include/asm/charmap.hpp b/include/asm/charmap.hpp index caf708ad..5f61d3df 100644 --- a/include/asm/charmap.hpp +++ b/include/asm/charmap.hpp @@ -5,6 +5,7 @@ #include #include +#include #include #define DEFAULT_CHARMAP_NAME "main" @@ -16,6 +17,6 @@ void charmap_Pop(); void charmap_Add(std::string const &mapping, uint8_t value); bool charmap_HasChar(std::string const &input); void charmap_Convert(std::string const &input, std::vector &output); -size_t charmap_ConvertNext(char const *&input, std::vector *output); +size_t charmap_ConvertNext(std::string_view &input, std::vector *output); #endif // RGBDS_ASM_CHARMAP_HPP diff --git a/man/rgbasm.5 b/man/rgbasm.5 index 07f9cef6..680563e3 100644 --- a/man/rgbasm.5 +++ b/man/rgbasm.5 @@ -417,6 +417,7 @@ There are a number of escape sequences you can use within a string: .It Ql \en Ta Newline ($0A) .It Ql \er Ta Carriage return ($0D) .It Ql \et Ta Tab ($09) +.It Ql \e0 Ta Null ($00) .It Qo \e1 Qc \[en] Qo \e9 Qc Ta Macro argument Pq Only in the body of a macro; see Sx Invoking macros .It Ql \e# Ta All Dv _NARG No macro arguments, separated by commas Pq Only in the body of a macro .It Ql \e@ Ta Label name suffix Pq Only in the body of a macro or a Ic REPT No block @@ -792,7 +793,7 @@ RAMLocation: ret \&.string - db "Hello World!", 0 + db "Hello World!\e0" \&.end ENDL .Ed diff --git a/src/asm/charmap.cpp b/src/asm/charmap.cpp index 638048d7..e0d4ee43 100644 --- a/src/asm/charmap.cpp +++ b/src/asm/charmap.cpp @@ -19,7 +19,7 @@ struct CharmapNode { bool isTerminal; // Whether there exists a mapping that ends here uint8_t value; // If the above is true, its corresponding value // This MUST be indexes and not pointers, because pointers get invalidated by reallocation! - size_t next[255]; // Indexes of where to go next, 0 = nowhere + size_t next[256]; // Indexes of where to go next, 0 = nowhere }; struct Charmap { @@ -89,7 +89,7 @@ void charmap_Add(std::string const &mapping, uint8_t value) { size_t nodeIdx = 0; for (char c : mapping) { - size_t &nextIdxRef = charmap.nodes[nodeIdx].next[(uint8_t)c - 1]; + size_t &nextIdxRef = charmap.nodes[nodeIdx].next[(uint8_t)c]; size_t nextIdx = nextIdxRef; if (!nextIdx) { @@ -118,7 +118,7 @@ bool charmap_HasChar(std::string const &input) { size_t nodeIdx = 0; for (char c : input) { - nodeIdx = charmap.nodes[nodeIdx].next[(uint8_t)c - 1]; + nodeIdx = charmap.nodes[nodeIdx].next[(uint8_t)c]; if (!nodeIdx) return false; @@ -128,12 +128,12 @@ bool charmap_HasChar(std::string const &input) { } void charmap_Convert(std::string const &input, std::vector &output) { - char const *ptr = input.c_str(); - while (charmap_ConvertNext(ptr, &output)) + std::string_view inputView = input; + while (charmap_ConvertNext(inputView, &output)) ; } -size_t charmap_ConvertNext(char const *&input, std::vector *output) { +size_t charmap_ConvertNext(std::string_view &input, std::vector *output) { // The goal is to match the longest mapping possible. // For that, advance through the trie with each character read. // If that would lead to a dead end, rewind characters until the last match, and output. @@ -141,14 +141,15 @@ size_t charmap_ConvertNext(char const *&input, std::vector *output) { Charmap const &charmap = *currentCharmap; size_t matchIdx = 0; size_t rewindDistance = 0; + size_t inputIdx = 0; - for (size_t nodeIdx = 0; *input;) { - nodeIdx = charmap.nodes[nodeIdx].next[(uint8_t)*input - 1]; + for (size_t nodeIdx = 0; inputIdx < input.length();) { + nodeIdx = charmap.nodes[nodeIdx].next[(uint8_t)input[inputIdx]]; if (!nodeIdx) break; - input++; // Consume that char + inputIdx++; // Consume that char if (charmap.nodes[nodeIdx].isTerminal) { matchIdx = nodeIdx; // This node matches, register it @@ -160,25 +161,23 @@ size_t charmap_ConvertNext(char const *&input, std::vector *output) { // We are at a dead end (either because we reached the end of input, or of the trie), // so rewind up to the last match, and output. - input -= rewindDistance; // This will rewind all the way if no match found + inputIdx -= rewindDistance; // This will rewind all the way if no match found + size_t matchLen = 0; if (matchIdx) { // A match was found, use it if (output) output->push_back(charmap.nodes[matchIdx].value); - return 1; + matchLen = 1; - } else if (*input) { // No match found, but there is some input left - int firstChar = *input; + } else if (inputIdx < input.length()) { // No match found, but there is some input left + int firstChar = input[inputIdx]; // This will write the codepoint's value to `output`, little-endian - size_t codepointLen = readUTF8Char(output, input); + size_t codepointLen = readUTF8Char(output, input.data() + inputIdx); if (codepointLen == 0) error("Input string is not valid UTF-8\n"); - // OK because UTF-8 has no NUL in multi-byte chars - input += codepointLen; - // Warn if this character is not mapped but any others are if (charmap.nodes.size() > 1) warning(WARNING_UNMAPPED_CHAR_1, "Unmapped character %s\n", printChar(firstChar)); @@ -189,9 +188,10 @@ size_t charmap_ConvertNext(char const *&input, std::vector *output) { printChar(firstChar) ); - return codepointLen; - - } else { // End of input - return 0; + inputIdx += codepointLen; + matchLen = codepointLen; } + + input = input.substr(inputIdx); + return matchLen; } diff --git a/src/asm/lexer.cpp b/src/asm/lexer.cpp index 3ca0728c..d58d04e5 100644 --- a/src/asm/lexer.cpp +++ b/src/asm/lexer.cpp @@ -566,7 +566,7 @@ void lexer_CheckRecursionDepth() { } static bool isMacroChar(char c) { - return c == '@' || c == '#' || c == '<' || (c >= '0' && c <= '9'); + return c == '@' || c == '#' || c == '<' || (c > '0' && c <= '9'); } // forward declarations for readBracketedMacroArgNum @@ -1245,6 +1245,9 @@ static void appendEscapedString(std::string &str, std::string const &escape) { case '\t': str += "\\t"; break; + case '\0': + str += "\\0"; + break; } } } @@ -1325,6 +1328,10 @@ static std::string readString(bool raw) { c = '\t'; shiftChar(); break; + case '0': + c = '\0'; + shiftChar(); + break; // Line continuation case ' ': @@ -1336,7 +1343,6 @@ static std::string readString(bool raw) { // Macro arg case '@': case '#': - case '0': case '1': case '2': case '3': @@ -1453,6 +1459,7 @@ static void appendStringLiteral(std::string &str, bool raw) { case 'n': case 'r': case 't': + case '0': // Return that character unchanged str += '\\'; shiftChar(); @@ -1468,7 +1475,6 @@ static void appendStringLiteral(std::string &str, bool raw) { // Macro arg case '@': case '#': - case '0': case '1': case '2': case '3': @@ -1916,6 +1922,9 @@ backslash: case 't': c = '\t'; break; + case '0': + c = '\0'; + break; case ' ': case '\r': diff --git a/src/asm/parser.y b/src/asm/parser.y index b2d4bd3b..55044bfc 100644 --- a/src/asm/parser.y +++ b/src/asm/parser.y @@ -1096,7 +1096,8 @@ print_expr: printf("$%" PRIX32, $1); } | string { - fputs($1.c_str(), stdout); + // Allow printing NUL characters + fwrite($1.data(), 1, $1.length(), stdout); } ; @@ -2436,33 +2437,34 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len } static size_t charlenUTF8(std::string const &str) { - char const *ptr = str.c_str(); + std::string_view view = str; size_t len; - for (len = 0; charmap_ConvertNext(ptr, nullptr); len++) + for (len = 0; charmap_ConvertNext(view, nullptr); len++) ; return len; } static std::string charsubUTF8(std::string const &str, uint32_t pos) { - char const *ptr = str.c_str(); + std::string_view view = str; size_t charLen = 1; // Advance to starting position in source string. for (uint32_t curPos = 1; charLen && curPos < pos; curPos++) - charLen = charmap_ConvertNext(ptr, nullptr); + charLen = charmap_ConvertNext(view, nullptr); - char const *start = ptr; + std::string_view start = view; - if (!charmap_ConvertNext(ptr, nullptr)) + if (!charmap_ConvertNext(view, nullptr)) warning( WARNING_BUILTIN_ARG, "CHARSUB: Position %" PRIu32 " is past the end of the string\n", pos ); - return std::string(start, ptr - start); + start = start.substr(0, start.length() - view.length()); + return std::string(start); } static uint32_t adjustNegativePos(int32_t pos, size_t len, char const *functionName) { diff --git a/src/util.cpp b/src/util.cpp index daf0c08e..722af449 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -36,6 +36,9 @@ char const *printChar(int c) { case '\t': buf[2] = 't'; break; + case '\0': + buf[2] = '0'; + break; default: // Print as hex buf[0] = '0'; diff --git a/test/asm/null-character.asm b/test/asm/null-character.asm new file mode 100644 index 00000000..4b9e2fd2 --- /dev/null +++ b/test/asm/null-character.asm @@ -0,0 +1,12 @@ +MACRO echo + print "\#" +ENDM + ; '\0' can be printed like any other character + print "hello\0world\0" + echo left\0right\0 + +SECTION "test", ROM0 + ; '\0' can be included in ROM like any other character + db "foo\0bar", 0 + charmap "a\0b", $42 + db "a\0b\0" diff --git a/test/asm/null-character.err b/test/asm/null-character.err new file mode 100644 index 00000000..c6b74622 --- /dev/null +++ b/test/asm/null-character.err @@ -0,0 +1,2 @@ +warning: null-character.asm(12): [-Wunmapped-char] + Unmapped character '\0' diff --git a/test/asm/null-character.out b/test/asm/null-character.out new file mode 100644 index 00000000..077f756c Binary files /dev/null and b/test/asm/null-character.out differ diff --git a/test/asm/null-character.out.bin b/test/asm/null-character.out.bin new file mode 100644 index 00000000..f482bf95 Binary files /dev/null and b/test/asm/null-character.out.bin differ diff --git a/test/asm/null-in-macro.err b/test/asm/null-in-macro.err index 789176ce..927dc560 100644 --- a/test/asm/null-in-macro.err +++ b/test/asm/null-in-macro.err @@ -1,3 +1,3 @@ error: null-in-macro.asm(4) -> null-in-macro.asm::foo(2): - Unknown character 0x00 + Unknown character '\0' error: Assembly aborted (1 error)! diff --git a/test/asm/null-outside-string.asm b/test/asm/null-outside-string.asm new file mode 100644 index 00000000..8748ba7b --- /dev/null +++ b/test/asm/null-outside-string.asm @@ -0,0 +1,7 @@ +SECTION "test", ROM0 + ; '\0' is not special here; it's lexed as a line continuation... + DEF foo\0bar EQU 42 + db foo\0bar + ; ...just like any other non-whitespace character + DEF spam\Xeggs EQU 69 + db spam\Xeggs diff --git a/test/asm/null-outside-string.err b/test/asm/null-outside-string.err new file mode 100644 index 00000000..f5261a8c --- /dev/null +++ b/test/asm/null-outside-string.err @@ -0,0 +1,17 @@ +error: null-outside-string.asm(3): + Begun line continuation, but encountered character '0' +error: null-outside-string.asm(3): + syntax error, unexpected number +error: null-outside-string.asm(4): + Begun line continuation, but encountered character '0' +error: null-outside-string.asm(4): + syntax error, unexpected number +error: null-outside-string.asm(6): + Begun line continuation, but encountered character 'X' +error: null-outside-string.asm(6): + syntax error, unexpected identifier +error: null-outside-string.asm(7): + Begun line continuation, but encountered character 'X' +error: null-outside-string.asm(7): + syntax error, unexpected identifier +error: Assembly aborted (8 errors)! diff --git a/test/asm/symbol-invalid-macro-arg.asm b/test/asm/symbol-invalid-macro-arg.asm index 80b64702..71d11f78 100644 --- a/test/asm/symbol-invalid-macro-arg.asm +++ b/test/asm/symbol-invalid-macro-arg.asm @@ -1,2 +1,2 @@ -def x\0 = 10 +def x\<0> = 10 println x diff --git a/test/asm/symbol-invalid-macro-arg.err b/test/asm/symbol-invalid-macro-arg.err index 02c1d8b2..ec8922ad 100644 --- a/test/asm/symbol-invalid-macro-arg.err +++ b/test/asm/symbol-invalid-macro-arg.err @@ -1,3 +1,3 @@ error: symbol-invalid-macro-arg.asm(1): - Invalid macro argument '\0' + Invalid bracketed macro argument '\<0>' error: Assembly aborted (1 error)!