mirror of
https://github.com/gbdev/rgbds.git
synced 2025-11-20 10:12:06 +00:00
Allow NUL characters in strings (#1405)
This commit is contained in:
@@ -5,6 +5,7 @@
|
|||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <string_view>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#define DEFAULT_CHARMAP_NAME "main"
|
#define DEFAULT_CHARMAP_NAME "main"
|
||||||
@@ -16,6 +17,6 @@ void charmap_Pop();
|
|||||||
void charmap_Add(std::string const &mapping, uint8_t value);
|
void charmap_Add(std::string const &mapping, uint8_t value);
|
||||||
bool charmap_HasChar(std::string const &input);
|
bool charmap_HasChar(std::string const &input);
|
||||||
void charmap_Convert(std::string const &input, std::vector<uint8_t> &output);
|
void charmap_Convert(std::string const &input, std::vector<uint8_t> &output);
|
||||||
size_t charmap_ConvertNext(char const *&input, std::vector<uint8_t> *output);
|
size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output);
|
||||||
|
|
||||||
#endif // RGBDS_ASM_CHARMAP_HPP
|
#endif // RGBDS_ASM_CHARMAP_HPP
|
||||||
|
|||||||
@@ -417,6 +417,7 @@ There are a number of escape sequences you can use within a string:
|
|||||||
.It Ql \en Ta Newline ($0A)
|
.It Ql \en Ta Newline ($0A)
|
||||||
.It Ql \er Ta Carriage return ($0D)
|
.It Ql \er Ta Carriage return ($0D)
|
||||||
.It Ql \et Ta Tab ($09)
|
.It Ql \et Ta Tab ($09)
|
||||||
|
.It Ql \e0 Ta Null ($00)
|
||||||
.It Qo \e1 Qc \[en] Qo \e9 Qc Ta Macro argument Pq Only in the body of a macro; see Sx Invoking macros
|
.It Qo \e1 Qc \[en] Qo \e9 Qc Ta Macro argument Pq Only in the body of a macro; see Sx Invoking macros
|
||||||
.It Ql \e# Ta All Dv _NARG No macro arguments, separated by commas Pq Only in the body of a macro
|
.It Ql \e# Ta All Dv _NARG No macro arguments, separated by commas Pq Only in the body of a macro
|
||||||
.It Ql \e@ Ta Label name suffix Pq Only in the body of a macro or a Ic REPT No block
|
.It Ql \e@ Ta Label name suffix Pq Only in the body of a macro or a Ic REPT No block
|
||||||
@@ -792,7 +793,7 @@ RAMLocation:
|
|||||||
ret
|
ret
|
||||||
|
|
||||||
\&.string
|
\&.string
|
||||||
db "Hello World!", 0
|
db "Hello World!\e0"
|
||||||
\&.end
|
\&.end
|
||||||
ENDL
|
ENDL
|
||||||
.Ed
|
.Ed
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ struct CharmapNode {
|
|||||||
bool isTerminal; // Whether there exists a mapping that ends here
|
bool isTerminal; // Whether there exists a mapping that ends here
|
||||||
uint8_t value; // If the above is true, its corresponding value
|
uint8_t value; // If the above is true, its corresponding value
|
||||||
// This MUST be indexes and not pointers, because pointers get invalidated by reallocation!
|
// This MUST be indexes and not pointers, because pointers get invalidated by reallocation!
|
||||||
size_t next[255]; // Indexes of where to go next, 0 = nowhere
|
size_t next[256]; // Indexes of where to go next, 0 = nowhere
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Charmap {
|
struct Charmap {
|
||||||
@@ -89,7 +89,7 @@ void charmap_Add(std::string const &mapping, uint8_t value) {
|
|||||||
size_t nodeIdx = 0;
|
size_t nodeIdx = 0;
|
||||||
|
|
||||||
for (char c : mapping) {
|
for (char c : mapping) {
|
||||||
size_t &nextIdxRef = charmap.nodes[nodeIdx].next[(uint8_t)c - 1];
|
size_t &nextIdxRef = charmap.nodes[nodeIdx].next[(uint8_t)c];
|
||||||
size_t nextIdx = nextIdxRef;
|
size_t nextIdx = nextIdxRef;
|
||||||
|
|
||||||
if (!nextIdx) {
|
if (!nextIdx) {
|
||||||
@@ -118,7 +118,7 @@ bool charmap_HasChar(std::string const &input) {
|
|||||||
size_t nodeIdx = 0;
|
size_t nodeIdx = 0;
|
||||||
|
|
||||||
for (char c : input) {
|
for (char c : input) {
|
||||||
nodeIdx = charmap.nodes[nodeIdx].next[(uint8_t)c - 1];
|
nodeIdx = charmap.nodes[nodeIdx].next[(uint8_t)c];
|
||||||
|
|
||||||
if (!nodeIdx)
|
if (!nodeIdx)
|
||||||
return false;
|
return false;
|
||||||
@@ -128,12 +128,12 @@ bool charmap_HasChar(std::string const &input) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void charmap_Convert(std::string const &input, std::vector<uint8_t> &output) {
|
void charmap_Convert(std::string const &input, std::vector<uint8_t> &output) {
|
||||||
char const *ptr = input.c_str();
|
std::string_view inputView = input;
|
||||||
while (charmap_ConvertNext(ptr, &output))
|
while (charmap_ConvertNext(inputView, &output))
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t charmap_ConvertNext(char const *&input, std::vector<uint8_t> *output) {
|
size_t charmap_ConvertNext(std::string_view &input, std::vector<uint8_t> *output) {
|
||||||
// The goal is to match the longest mapping possible.
|
// The goal is to match the longest mapping possible.
|
||||||
// For that, advance through the trie with each character read.
|
// For that, advance through the trie with each character read.
|
||||||
// If that would lead to a dead end, rewind characters until the last match, and output.
|
// If that would lead to a dead end, rewind characters until the last match, and output.
|
||||||
@@ -141,14 +141,15 @@ size_t charmap_ConvertNext(char const *&input, std::vector<uint8_t> *output) {
|
|||||||
Charmap const &charmap = *currentCharmap;
|
Charmap const &charmap = *currentCharmap;
|
||||||
size_t matchIdx = 0;
|
size_t matchIdx = 0;
|
||||||
size_t rewindDistance = 0;
|
size_t rewindDistance = 0;
|
||||||
|
size_t inputIdx = 0;
|
||||||
|
|
||||||
for (size_t nodeIdx = 0; *input;) {
|
for (size_t nodeIdx = 0; inputIdx < input.length();) {
|
||||||
nodeIdx = charmap.nodes[nodeIdx].next[(uint8_t)*input - 1];
|
nodeIdx = charmap.nodes[nodeIdx].next[(uint8_t)input[inputIdx]];
|
||||||
|
|
||||||
if (!nodeIdx)
|
if (!nodeIdx)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
input++; // Consume that char
|
inputIdx++; // Consume that char
|
||||||
|
|
||||||
if (charmap.nodes[nodeIdx].isTerminal) {
|
if (charmap.nodes[nodeIdx].isTerminal) {
|
||||||
matchIdx = nodeIdx; // This node matches, register it
|
matchIdx = nodeIdx; // This node matches, register it
|
||||||
@@ -160,25 +161,23 @@ size_t charmap_ConvertNext(char const *&input, std::vector<uint8_t> *output) {
|
|||||||
|
|
||||||
// We are at a dead end (either because we reached the end of input, or of the trie),
|
// We are at a dead end (either because we reached the end of input, or of the trie),
|
||||||
// so rewind up to the last match, and output.
|
// so rewind up to the last match, and output.
|
||||||
input -= rewindDistance; // This will rewind all the way if no match found
|
inputIdx -= rewindDistance; // This will rewind all the way if no match found
|
||||||
|
|
||||||
|
size_t matchLen = 0;
|
||||||
if (matchIdx) { // A match was found, use it
|
if (matchIdx) { // A match was found, use it
|
||||||
if (output)
|
if (output)
|
||||||
output->push_back(charmap.nodes[matchIdx].value);
|
output->push_back(charmap.nodes[matchIdx].value);
|
||||||
|
|
||||||
return 1;
|
matchLen = 1;
|
||||||
|
|
||||||
} else if (*input) { // No match found, but there is some input left
|
} else if (inputIdx < input.length()) { // No match found, but there is some input left
|
||||||
int firstChar = *input;
|
int firstChar = input[inputIdx];
|
||||||
// This will write the codepoint's value to `output`, little-endian
|
// This will write the codepoint's value to `output`, little-endian
|
||||||
size_t codepointLen = readUTF8Char(output, input);
|
size_t codepointLen = readUTF8Char(output, input.data() + inputIdx);
|
||||||
|
|
||||||
if (codepointLen == 0)
|
if (codepointLen == 0)
|
||||||
error("Input string is not valid UTF-8\n");
|
error("Input string is not valid UTF-8\n");
|
||||||
|
|
||||||
// OK because UTF-8 has no NUL in multi-byte chars
|
|
||||||
input += codepointLen;
|
|
||||||
|
|
||||||
// Warn if this character is not mapped but any others are
|
// Warn if this character is not mapped but any others are
|
||||||
if (charmap.nodes.size() > 1)
|
if (charmap.nodes.size() > 1)
|
||||||
warning(WARNING_UNMAPPED_CHAR_1, "Unmapped character %s\n", printChar(firstChar));
|
warning(WARNING_UNMAPPED_CHAR_1, "Unmapped character %s\n", printChar(firstChar));
|
||||||
@@ -189,9 +188,10 @@ size_t charmap_ConvertNext(char const *&input, std::vector<uint8_t> *output) {
|
|||||||
printChar(firstChar)
|
printChar(firstChar)
|
||||||
);
|
);
|
||||||
|
|
||||||
return codepointLen;
|
inputIdx += codepointLen;
|
||||||
|
matchLen = codepointLen;
|
||||||
|
}
|
||||||
|
|
||||||
} else { // End of input
|
input = input.substr(inputIdx);
|
||||||
return 0;
|
return matchLen;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -566,7 +566,7 @@ void lexer_CheckRecursionDepth() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static bool isMacroChar(char c) {
|
static bool isMacroChar(char c) {
|
||||||
return c == '@' || c == '#' || c == '<' || (c >= '0' && c <= '9');
|
return c == '@' || c == '#' || c == '<' || (c > '0' && c <= '9');
|
||||||
}
|
}
|
||||||
|
|
||||||
// forward declarations for readBracketedMacroArgNum
|
// forward declarations for readBracketedMacroArgNum
|
||||||
@@ -1245,6 +1245,9 @@ static void appendEscapedString(std::string &str, std::string const &escape) {
|
|||||||
case '\t':
|
case '\t':
|
||||||
str += "\\t";
|
str += "\\t";
|
||||||
break;
|
break;
|
||||||
|
case '\0':
|
||||||
|
str += "\\0";
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1325,6 +1328,10 @@ static std::string readString(bool raw) {
|
|||||||
c = '\t';
|
c = '\t';
|
||||||
shiftChar();
|
shiftChar();
|
||||||
break;
|
break;
|
||||||
|
case '0':
|
||||||
|
c = '\0';
|
||||||
|
shiftChar();
|
||||||
|
break;
|
||||||
|
|
||||||
// Line continuation
|
// Line continuation
|
||||||
case ' ':
|
case ' ':
|
||||||
@@ -1336,7 +1343,6 @@ static std::string readString(bool raw) {
|
|||||||
// Macro arg
|
// Macro arg
|
||||||
case '@':
|
case '@':
|
||||||
case '#':
|
case '#':
|
||||||
case '0':
|
|
||||||
case '1':
|
case '1':
|
||||||
case '2':
|
case '2':
|
||||||
case '3':
|
case '3':
|
||||||
@@ -1453,6 +1459,7 @@ static void appendStringLiteral(std::string &str, bool raw) {
|
|||||||
case 'n':
|
case 'n':
|
||||||
case 'r':
|
case 'r':
|
||||||
case 't':
|
case 't':
|
||||||
|
case '0':
|
||||||
// Return that character unchanged
|
// Return that character unchanged
|
||||||
str += '\\';
|
str += '\\';
|
||||||
shiftChar();
|
shiftChar();
|
||||||
@@ -1468,7 +1475,6 @@ static void appendStringLiteral(std::string &str, bool raw) {
|
|||||||
// Macro arg
|
// Macro arg
|
||||||
case '@':
|
case '@':
|
||||||
case '#':
|
case '#':
|
||||||
case '0':
|
|
||||||
case '1':
|
case '1':
|
||||||
case '2':
|
case '2':
|
||||||
case '3':
|
case '3':
|
||||||
@@ -1916,6 +1922,9 @@ backslash:
|
|||||||
case 't':
|
case 't':
|
||||||
c = '\t';
|
c = '\t';
|
||||||
break;
|
break;
|
||||||
|
case '0':
|
||||||
|
c = '\0';
|
||||||
|
break;
|
||||||
|
|
||||||
case ' ':
|
case ' ':
|
||||||
case '\r':
|
case '\r':
|
||||||
|
|||||||
@@ -1096,7 +1096,8 @@ print_expr:
|
|||||||
printf("$%" PRIX32, $1);
|
printf("$%" PRIX32, $1);
|
||||||
}
|
}
|
||||||
| string {
|
| string {
|
||||||
fputs($1.c_str(), stdout);
|
// Allow printing NUL characters
|
||||||
|
fwrite($1.data(), 1, $1.length(), stdout);
|
||||||
}
|
}
|
||||||
;
|
;
|
||||||
|
|
||||||
@@ -2436,33 +2437,34 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
|
|||||||
}
|
}
|
||||||
|
|
||||||
static size_t charlenUTF8(std::string const &str) {
|
static size_t charlenUTF8(std::string const &str) {
|
||||||
char const *ptr = str.c_str();
|
std::string_view view = str;
|
||||||
size_t len;
|
size_t len;
|
||||||
|
|
||||||
for (len = 0; charmap_ConvertNext(ptr, nullptr); len++)
|
for (len = 0; charmap_ConvertNext(view, nullptr); len++)
|
||||||
;
|
;
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string charsubUTF8(std::string const &str, uint32_t pos) {
|
static std::string charsubUTF8(std::string const &str, uint32_t pos) {
|
||||||
char const *ptr = str.c_str();
|
std::string_view view = str;
|
||||||
size_t charLen = 1;
|
size_t charLen = 1;
|
||||||
|
|
||||||
// Advance to starting position in source string.
|
// Advance to starting position in source string.
|
||||||
for (uint32_t curPos = 1; charLen && curPos < pos; curPos++)
|
for (uint32_t curPos = 1; charLen && curPos < pos; curPos++)
|
||||||
charLen = charmap_ConvertNext(ptr, nullptr);
|
charLen = charmap_ConvertNext(view, nullptr);
|
||||||
|
|
||||||
char const *start = ptr;
|
std::string_view start = view;
|
||||||
|
|
||||||
if (!charmap_ConvertNext(ptr, nullptr))
|
if (!charmap_ConvertNext(view, nullptr))
|
||||||
warning(
|
warning(
|
||||||
WARNING_BUILTIN_ARG,
|
WARNING_BUILTIN_ARG,
|
||||||
"CHARSUB: Position %" PRIu32 " is past the end of the string\n",
|
"CHARSUB: Position %" PRIu32 " is past the end of the string\n",
|
||||||
pos
|
pos
|
||||||
);
|
);
|
||||||
|
|
||||||
return std::string(start, ptr - start);
|
start = start.substr(0, start.length() - view.length());
|
||||||
|
return std::string(start);
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint32_t adjustNegativePos(int32_t pos, size_t len, char const *functionName) {
|
static uint32_t adjustNegativePos(int32_t pos, size_t len, char const *functionName) {
|
||||||
|
|||||||
@@ -36,6 +36,9 @@ char const *printChar(int c) {
|
|||||||
case '\t':
|
case '\t':
|
||||||
buf[2] = 't';
|
buf[2] = 't';
|
||||||
break;
|
break;
|
||||||
|
case '\0':
|
||||||
|
buf[2] = '0';
|
||||||
|
break;
|
||||||
|
|
||||||
default: // Print as hex
|
default: // Print as hex
|
||||||
buf[0] = '0';
|
buf[0] = '0';
|
||||||
|
|||||||
12
test/asm/null-character.asm
Normal file
12
test/asm/null-character.asm
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
MACRO echo
|
||||||
|
print "\#"
|
||||||
|
ENDM
|
||||||
|
; '\0' can be printed like any other character
|
||||||
|
print "hello\0world\0"
|
||||||
|
echo left\0right\0
|
||||||
|
|
||||||
|
SECTION "test", ROM0
|
||||||
|
; '\0' can be included in ROM like any other character
|
||||||
|
db "foo\0bar", 0
|
||||||
|
charmap "a\0b", $42
|
||||||
|
db "a\0b\0"
|
||||||
2
test/asm/null-character.err
Normal file
2
test/asm/null-character.err
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
warning: null-character.asm(12): [-Wunmapped-char]
|
||||||
|
Unmapped character '\0'
|
||||||
BIN
test/asm/null-character.out
Normal file
BIN
test/asm/null-character.out
Normal file
Binary file not shown.
BIN
test/asm/null-character.out.bin
Normal file
BIN
test/asm/null-character.out.bin
Normal file
Binary file not shown.
@@ -1,3 +1,3 @@
|
|||||||
error: null-in-macro.asm(4) -> null-in-macro.asm::foo(2):
|
error: null-in-macro.asm(4) -> null-in-macro.asm::foo(2):
|
||||||
Unknown character 0x00
|
Unknown character '\0'
|
||||||
error: Assembly aborted (1 error)!
|
error: Assembly aborted (1 error)!
|
||||||
|
|||||||
7
test/asm/null-outside-string.asm
Normal file
7
test/asm/null-outside-string.asm
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
SECTION "test", ROM0
|
||||||
|
; '\0' is not special here; it's lexed as a line continuation...
|
||||||
|
DEF foo\0bar EQU 42
|
||||||
|
db foo\0bar
|
||||||
|
; ...just like any other non-whitespace character
|
||||||
|
DEF spam\Xeggs EQU 69
|
||||||
|
db spam\Xeggs
|
||||||
17
test/asm/null-outside-string.err
Normal file
17
test/asm/null-outside-string.err
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
error: null-outside-string.asm(3):
|
||||||
|
Begun line continuation, but encountered character '0'
|
||||||
|
error: null-outside-string.asm(3):
|
||||||
|
syntax error, unexpected number
|
||||||
|
error: null-outside-string.asm(4):
|
||||||
|
Begun line continuation, but encountered character '0'
|
||||||
|
error: null-outside-string.asm(4):
|
||||||
|
syntax error, unexpected number
|
||||||
|
error: null-outside-string.asm(6):
|
||||||
|
Begun line continuation, but encountered character 'X'
|
||||||
|
error: null-outside-string.asm(6):
|
||||||
|
syntax error, unexpected identifier
|
||||||
|
error: null-outside-string.asm(7):
|
||||||
|
Begun line continuation, but encountered character 'X'
|
||||||
|
error: null-outside-string.asm(7):
|
||||||
|
syntax error, unexpected identifier
|
||||||
|
error: Assembly aborted (8 errors)!
|
||||||
@@ -1,2 +1,2 @@
|
|||||||
def x\0 = 10
|
def x\<0> = 10
|
||||||
println x
|
println x
|
||||||
|
|||||||
@@ -1,3 +1,3 @@
|
|||||||
error: symbol-invalid-macro-arg.asm(1):
|
error: symbol-invalid-macro-arg.asm(1):
|
||||||
Invalid macro argument '\0'
|
Invalid bracketed macro argument '\<0>'
|
||||||
error: Assembly aborted (1 error)!
|
error: Assembly aborted (1 error)!
|
||||||
|
|||||||
Reference in New Issue
Block a user