Allow NUL characters in strings (#1405)

This commit is contained in:
Sylvie
2024-06-18 14:26:18 -04:00
committed by GitHub
parent 9cc595b2cc
commit 623c3f662c
15 changed files with 91 additions and 37 deletions

View File

@@ -5,6 +5,7 @@
#include <stdint.h> #include <stdint.h>
#include <string> #include <string>
#include <string_view>
#include <vector> #include <vector>
#define DEFAULT_CHARMAP_NAME "main" #define DEFAULT_CHARMAP_NAME "main"
@@ -16,6 +17,6 @@ void charmap_Pop();
void charmap_Add(std::string const &mapping, uint8_t value); void charmap_Add(std::string const &mapping, uint8_t value);
bool charmap_HasChar(std::string const &input); bool charmap_HasChar(std::string const &input);
void charmap_Convert(std::string const &input, std::vector<uint8_t> &output); void charmap_Convert(std::string const &input, std::vector<uint8_t> &output);
size_t charmap_ConvertNext(char const *&input, std::vector<uint8_t> *output); size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output);
#endif // RGBDS_ASM_CHARMAP_HPP #endif // RGBDS_ASM_CHARMAP_HPP

View File

@@ -417,6 +417,7 @@ There are a number of escape sequences you can use within a string:
.It Ql \en Ta Newline ($0A) .It Ql \en Ta Newline ($0A)
.It Ql \er Ta Carriage return ($0D) .It Ql \er Ta Carriage return ($0D)
.It Ql \et Ta Tab ($09) .It Ql \et Ta Tab ($09)
.It Ql \e0 Ta Null ($00)
.It Qo \e1 Qc \[en] Qo \e9 Qc Ta Macro argument Pq Only in the body of a macro; see Sx Invoking macros .It Qo \e1 Qc \[en] Qo \e9 Qc Ta Macro argument Pq Only in the body of a macro; see Sx Invoking macros
.It Ql \e# Ta All Dv _NARG No macro arguments, separated by commas Pq Only in the body of a macro .It Ql \e# Ta All Dv _NARG No macro arguments, separated by commas Pq Only in the body of a macro
.It Ql \e@ Ta Label name suffix Pq Only in the body of a macro or a Ic REPT No block .It Ql \e@ Ta Label name suffix Pq Only in the body of a macro or a Ic REPT No block
@@ -792,7 +793,7 @@ RAMLocation:
ret ret
\&.string \&.string
db "Hello World!", 0 db "Hello World!\e0"
\&.end \&.end
ENDL ENDL
.Ed .Ed

View File

@@ -19,7 +19,7 @@ struct CharmapNode {
bool isTerminal; // Whether there exists a mapping that ends here bool isTerminal; // Whether there exists a mapping that ends here
uint8_t value; // If the above is true, its corresponding value uint8_t value; // If the above is true, its corresponding value
// This MUST be indexes and not pointers, because pointers get invalidated by reallocation! // This MUST be indexes and not pointers, because pointers get invalidated by reallocation!
size_t next[255]; // Indexes of where to go next, 0 = nowhere size_t next[256]; // Indexes of where to go next, 0 = nowhere
}; };
struct Charmap { struct Charmap {
@@ -89,7 +89,7 @@ void charmap_Add(std::string const &mapping, uint8_t value) {
size_t nodeIdx = 0; size_t nodeIdx = 0;
for (char c : mapping) { for (char c : mapping) {
size_t &nextIdxRef = charmap.nodes[nodeIdx].next[(uint8_t)c - 1]; size_t &nextIdxRef = charmap.nodes[nodeIdx].next[(uint8_t)c];
size_t nextIdx = nextIdxRef; size_t nextIdx = nextIdxRef;
if (!nextIdx) { if (!nextIdx) {
@@ -118,7 +118,7 @@ bool charmap_HasChar(std::string const &input) {
size_t nodeIdx = 0; size_t nodeIdx = 0;
for (char c : input) { for (char c : input) {
nodeIdx = charmap.nodes[nodeIdx].next[(uint8_t)c - 1]; nodeIdx = charmap.nodes[nodeIdx].next[(uint8_t)c];
if (!nodeIdx) if (!nodeIdx)
return false; return false;
@@ -128,12 +128,12 @@ bool charmap_HasChar(std::string const &input) {
} }
void charmap_Convert(std::string const &input, std::vector<uint8_t> &output) { void charmap_Convert(std::string const &input, std::vector<uint8_t> &output) {
char const *ptr = input.c_str(); std::string_view inputView = input;
while (charmap_ConvertNext(ptr, &output)) while (charmap_ConvertNext(inputView, &output))
; ;
} }
size_t charmap_ConvertNext(char const *&input, std::vector<uint8_t> *output) { size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output) {
// The goal is to match the longest mapping possible. // The goal is to match the longest mapping possible.
// For that, advance through the trie with each character read. // For that, advance through the trie with each character read.
// If that would lead to a dead end, rewind characters until the last match, and output. // If that would lead to a dead end, rewind characters until the last match, and output.
@@ -141,14 +141,15 @@ size_t charmap_ConvertNext(char const *&input, std::vector<uint8_t> *output) {
Charmap const &charmap = *currentCharmap; Charmap const &charmap = *currentCharmap;
size_t matchIdx = 0; size_t matchIdx = 0;
size_t rewindDistance = 0; size_t rewindDistance = 0;
size_t inputIdx = 0;
for (size_t nodeIdx = 0; *input;) { for (size_t nodeIdx = 0; inputIdx < input.length();) {
nodeIdx = charmap.nodes[nodeIdx].next[(uint8_t)*input - 1]; nodeIdx = charmap.nodes[nodeIdx].next[(uint8_t)input[inputIdx]];
if (!nodeIdx) if (!nodeIdx)
break; break;
input++; // Consume that char inputIdx++; // Consume that char
if (charmap.nodes[nodeIdx].isTerminal) { if (charmap.nodes[nodeIdx].isTerminal) {
matchIdx = nodeIdx; // This node matches, register it matchIdx = nodeIdx; // This node matches, register it
@@ -160,25 +161,23 @@ size_t charmap_ConvertNext(char const *&input, std::vector<uint8_t> *output) {
// We are at a dead end (either because we reached the end of input, or of the trie), // We are at a dead end (either because we reached the end of input, or of the trie),
// so rewind up to the last match, and output. // so rewind up to the last match, and output.
input -= rewindDistance; // This will rewind all the way if no match found inputIdx -= rewindDistance; // This will rewind all the way if no match found
size_t matchLen = 0;
if (matchIdx) { // A match was found, use it if (matchIdx) { // A match was found, use it
if (output) if (output)
output->push_back(charmap.nodes[matchIdx].value); output->push_back(charmap.nodes[matchIdx].value);
return 1; matchLen = 1;
} else if (*input) { // No match found, but there is some input left } else if (inputIdx < input.length()) { // No match found, but there is some input left
int firstChar = *input; int firstChar = input[inputIdx];
// This will write the codepoint's value to `output`, little-endian // This will write the codepoint's value to `output`, little-endian
size_t codepointLen = readUTF8Char(output, input); size_t codepointLen = readUTF8Char(output, input.data() + inputIdx);
if (codepointLen == 0) if (codepointLen == 0)
error("Input string is not valid UTF-8\n"); error("Input string is not valid UTF-8\n");
// OK because UTF-8 has no NUL in multi-byte chars
input += codepointLen;
// Warn if this character is not mapped but any others are // Warn if this character is not mapped but any others are
if (charmap.nodes.size() > 1) if (charmap.nodes.size() > 1)
warning(WARNING_UNMAPPED_CHAR_1, "Unmapped character %s\n", printChar(firstChar)); warning(WARNING_UNMAPPED_CHAR_1, "Unmapped character %s\n", printChar(firstChar));
@@ -189,9 +188,10 @@ size_t charmap_ConvertNext(char const *&input, std::vector<uint8_t> *output) {
printChar(firstChar) printChar(firstChar)
); );
return codepointLen; inputIdx += codepointLen;
matchLen = codepointLen;
} else { // End of input
return 0;
} }
input = input.substr(inputIdx);
return matchLen;
} }

View File

@@ -566,7 +566,7 @@ void lexer_CheckRecursionDepth() {
} }
static bool isMacroChar(char c) { static bool isMacroChar(char c) {
return c == '@' || c == '#' || c == '<' || (c >= '0' && c <= '9'); return c == '@' || c == '#' || c == '<' || (c > '0' && c <= '9');
} }
// forward declarations for readBracketedMacroArgNum // forward declarations for readBracketedMacroArgNum
@@ -1245,6 +1245,9 @@ static void appendEscapedString(std::string &str, std::string const &escape) {
case '\t': case '\t':
str += "\\t"; str += "\\t";
break; break;
case '\0':
str += "\\0";
break;
} }
} }
} }
@@ -1325,6 +1328,10 @@ static std::string readString(bool raw) {
c = '\t'; c = '\t';
shiftChar(); shiftChar();
break; break;
case '0':
c = '\0';
shiftChar();
break;
// Line continuation // Line continuation
case ' ': case ' ':
@@ -1336,7 +1343,6 @@ static std::string readString(bool raw) {
// Macro arg // Macro arg
case '@': case '@':
case '#': case '#':
case '0':
case '1': case '1':
case '2': case '2':
case '3': case '3':
@@ -1453,6 +1459,7 @@ static void appendStringLiteral(std::string &str, bool raw) {
case 'n': case 'n':
case 'r': case 'r':
case 't': case 't':
case '0':
// Return that character unchanged // Return that character unchanged
str += '\\'; str += '\\';
shiftChar(); shiftChar();
@@ -1468,7 +1475,6 @@ static void appendStringLiteral(std::string &str, bool raw) {
// Macro arg // Macro arg
case '@': case '@':
case '#': case '#':
case '0':
case '1': case '1':
case '2': case '2':
case '3': case '3':
@@ -1916,6 +1922,9 @@ backslash:
case 't': case 't':
c = '\t'; c = '\t';
break; break;
case '0':
c = '\0';
break;
case ' ': case ' ':
case '\r': case '\r':

View File

@@ -1096,7 +1096,8 @@ print_expr:
printf("$%" PRIX32, $1); printf("$%" PRIX32, $1);
} }
| string { | string {
fputs($1.c_str(), stdout); // Allow printing NUL characters
fwrite($1.data(), 1, $1.length(), stdout);
} }
; ;
@@ -2436,33 +2437,34 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
} }
static size_t charlenUTF8(std::string const &str) { static size_t charlenUTF8(std::string const &str) {
char const *ptr = str.c_str(); std::string_view view = str;
size_t len; size_t len;
for (len = 0; charmap_ConvertNext(ptr, nullptr); len++) for (len = 0; charmap_ConvertNext(view, nullptr); len++)
; ;
return len; return len;
} }
static std::string charsubUTF8(std::string const &str, uint32_t pos) { static std::string charsubUTF8(std::string const &str, uint32_t pos) {
char const *ptr = str.c_str(); std::string_view view = str;
size_t charLen = 1; size_t charLen = 1;
// Advance to starting position in source string. // Advance to starting position in source string.
for (uint32_t curPos = 1; charLen && curPos < pos; curPos++) for (uint32_t curPos = 1; charLen && curPos < pos; curPos++)
charLen = charmap_ConvertNext(ptr, nullptr); charLen = charmap_ConvertNext(view, nullptr);
char const *start = ptr; std::string_view start = view;
if (!charmap_ConvertNext(ptr, nullptr)) if (!charmap_ConvertNext(view, nullptr))
warning( warning(
WARNING_BUILTIN_ARG, WARNING_BUILTIN_ARG,
"CHARSUB: Position %" PRIu32 " is past the end of the string\n", "CHARSUB: Position %" PRIu32 " is past the end of the string\n",
pos pos
); );
return std::string(start, ptr - start); start = start.substr(0, start.length() - view.length());
return std::string(start);
} }
static uint32_t adjustNegativePos(int32_t pos, size_t len, char const *functionName) { static uint32_t adjustNegativePos(int32_t pos, size_t len, char const *functionName) {

View File

@@ -36,6 +36,9 @@ char const *printChar(int c) {
case '\t': case '\t':
buf[2] = 't'; buf[2] = 't';
break; break;
case '\0':
buf[2] = '0';
break;
default: // Print as hex default: // Print as hex
buf[0] = '0'; buf[0] = '0';

View File

@@ -0,0 +1,12 @@
MACRO echo
print "\#"
ENDM
; '\0' can be printed like any other character
print "hello\0world\0"
echo left\0right\0
SECTION "test", ROM0
; '\0' can be included in ROM like any other character
db "foo\0bar", 0
charmap "a\0b", $42
db "a\0b\0"

View File

@@ -0,0 +1,2 @@
warning: null-character.asm(12): [-Wunmapped-char]
Unmapped character '\0'

BIN
test/asm/null-character.out Normal file

Binary file not shown.

Binary file not shown.

View File

@@ -1,3 +1,3 @@
error: null-in-macro.asm(4) -> null-in-macro.asm::foo(2): error: null-in-macro.asm(4) -> null-in-macro.asm::foo(2):
Unknown character 0x00 Unknown character '\0'
error: Assembly aborted (1 error)! error: Assembly aborted (1 error)!

View File

@@ -0,0 +1,7 @@
SECTION "test", ROM0
; '\0' is not special here; it's lexed as a line continuation...
DEF foo\0bar EQU 42
db foo\0bar
; ...just like any other non-whitespace character
DEF spam\Xeggs EQU 69
db spam\Xeggs

View File

@@ -0,0 +1,17 @@
error: null-outside-string.asm(3):
Begun line continuation, but encountered character '0'
error: null-outside-string.asm(3):
syntax error, unexpected number
error: null-outside-string.asm(4):
Begun line continuation, but encountered character '0'
error: null-outside-string.asm(4):
syntax error, unexpected number
error: null-outside-string.asm(6):
Begun line continuation, but encountered character 'X'
error: null-outside-string.asm(6):
syntax error, unexpected identifier
error: null-outside-string.asm(7):
Begun line continuation, but encountered character 'X'
error: null-outside-string.asm(7):
syntax error, unexpected identifier
error: Assembly aborted (8 errors)!

View File

@@ -1,2 +1,2 @@
def x\0 = 10 def x\<0> = 10
println x println x

View File

@@ -1,3 +1,3 @@
error: symbol-invalid-macro-arg.asm(1): error: symbol-invalid-macro-arg.asm(1):
Invalid macro argument '\0' Invalid bracketed macro argument '\<0>'
error: Assembly aborted (1 error)! error: Assembly aborted (1 error)!