Implement 'character' literals (#1747)

This commit is contained in:
Rangi
2025-07-15 13:08:50 -04:00
committed by GitHub
parent b6d77fbb9e
commit 1fecf80659
8 changed files with 124 additions and 11 deletions

View File

@@ -280,7 +280,7 @@ There are a number of numeric formats.
.It Binary Ta Li % , 0b , 0B Ta 01 .It Binary Ta Li % , 0b , 0B Ta 01
.It Fixed-point Ta none Ta 01234.56789 .It Fixed-point Ta none Ta 01234.56789
.It Precise fixed-point Ta none Ta 12.34q8 .It Precise fixed-point Ta none Ta 12.34q8
.It Character constant Ta none Ta \(dqABYZ\(dq .It Character constant Ta none Ta 'ABYZ'
.It Game Boy graphics Ta Li \` Ta 0123 .It Game Boy graphics Ta Li \` Ta 0123
.El .El
.Pp .Pp
@@ -293,11 +293,14 @@ or
The "character constant" form yields the value the character maps to in the current charmap. The "character constant" form yields the value the character maps to in the current charmap.
For example, by default For example, by default
.Pq refer to Xr ascii 7 .Pq refer to Xr ascii 7
.Sq \(dqA\(dq .Sq 'A'
yields 65. yields 65.
A character constant must represent a single value, so it cannot contain more than one charmap unit (a single character, or a multi-character sequence mapped by one charmap entry), nor a unit which maps to multiple values.
See See
.Sx Character maps .Sx Character maps
for information on charmaps. for information on charmaps, and
.Sx String expressions
for information on escape characters allowed in character constants.
.Pp .Pp
The last one, Game Boy graphics, is quite interesting and useful. The last one, Game Boy graphics, is quite interesting and useful.
After the backtick, 8 digits between 0 and 3 are expected, corresponding to pixel values. After the backtick, 8 digits between 0 and 3 are expected, corresponding to pixel values.
@@ -538,7 +541,8 @@ There are a number of escape sequences you can use within a string:
.Bl -column -offset indent "Sequence" .Bl -column -offset indent "Sequence"
.It Sy Sequence Ta Sy Meaning .It Sy Sequence Ta Sy Meaning
.It Ql \e\e Ta Backslash Pq escapes the escape character itself .It Ql \e\e Ta Backslash Pq escapes the escape character itself
.It Ql \e" Ta Double quote Pq does not terminate the string .It Ql \e" Ta Double quote Pq does not terminate a string
.It Ql \e' Ta Single quote Pq does not terminate a character literal
.It Ql \e{ Ta Open curly brace Pq does not start interpolation .It Ql \e{ Ta Open curly brace Pq does not start interpolation
.It Ql \e} Ta Close curly brace Pq does not end interpolation .It Ql \e} Ta Close curly brace Pq does not end interpolation
.It Ql \en Ta Newline Pq ASCII $0A .It Ql \en Ta Newline Pq ASCII $0A

View File

@@ -1428,6 +1428,7 @@ static void appendExpandedString(std::string &str, std::string const &expanded)
break; break;
case '\\': case '\\':
case '"': case '"':
case '\'':
case '{': case '{':
str += '\\'; str += '\\';
[[fallthrough]]; [[fallthrough]];
@@ -1448,6 +1449,7 @@ static void appendCharInLiteral(std::string &str, int c) {
// Character escape // Character escape
case '\\': case '\\':
case '"': case '"':
case '\'':
case '{': case '{':
case '}': case '}':
if (rawMode) { if (rawMode) {
@@ -1518,7 +1520,7 @@ static void appendCharInLiteral(std::string &str, int c) {
break; break;
case '{': // Symbol interpolation case '{': // Symbol interpolation
// We'll be exiting the string scope, so re-enable expansions // We'll be exiting the string/character scope, so re-enable expansions
// (Not interpolations, since they're handled by the function itself...) // (Not interpolations, since they're handled by the function itself...)
lexerState->disableMacroArgs = false; lexerState->disableMacroArgs = false;
if (auto interpolation = readInterpolation(0); interpolation) { if (auto interpolation = readInterpolation(0); interpolation) {
@@ -1614,6 +1616,42 @@ static void readString(std::string &str, bool rawString) {
} }
} }
// Reads the rest of a character literal into `str`; a pared-down sibling of `readString`.
//
// The caller has already consumed the opening single quote. Characters are pulled
// one at a time until the closing single quote is consumed. In raw lexer mode the
// surrounding quotes are kept in `str`; otherwise only the contents are appended.
//
// If EOF, '\r' or '\n' is seen before the closing quote, "Unterminated character"
// is reported and the function returns without calling `shiftChar` on that character.
static void readCharacter(std::string &str) {
	// Held for the whole function body.
	// NOTE(review): presumably restores expansions when it goes out of scope -- confirm against `Defer`.
	Defer restoreExpansions = scopedDisableExpansions();

	bool const inRawMode = lexerState->mode == LEXER_RAW;

	// The opening quote was eaten by the caller; raw mode wants it back in the output
	if (inRawMode) {
		str += '\'';
	}

	// Keep going for as long as the next character does not end the line or the input
	for (int ch = peek(); ch != EOF && ch != '\r' && ch != '\n'; ch = peek()) {
		// This character belongs to the literal, so it can be consumed
		shiftChar();

		if (ch != '\'') {
			// Ordinary or special character: let the shared helper deal with it
			appendCharInLiteral(str, ch);
			continue;
		}

		// Closing quote: the literal is complete
		if (inRawMode) {
			str += '\'';
		}
		return;
	}

	// Fell out of the loop on EOF or a line ending: the literal was never closed
	error("Unterminated character");
}
// Lexer core // Lexer core
static Token yylex_SKIP_TO_ENDC(); // forward declaration for yylex_NORMAL static Token yylex_SKIP_TO_ENDC(); // forward declaration for yylex_NORMAL
@@ -1896,7 +1934,7 @@ static Token yylex_NORMAL() {
case '`': // Gfx constant case '`': // Gfx constant
return Token(T_(NUMBER), readGfxConstant()); return Token(T_(NUMBER), readGfxConstant());
// Handle strings // Handle string and character literals
case '"': { case '"': {
std::string str; std::string str;
@@ -1904,6 +1942,12 @@ static Token yylex_NORMAL() {
return Token(T_(STRING), str); return Token(T_(STRING), str);
} }
case '\'': {
std::string chr;
readCharacter(chr);
return Token(T_(CHARACTER), chr);
}
// Handle newlines and EOF // Handle newlines and EOF
case '\r': case '\r':
@@ -2036,6 +2080,11 @@ static Token yylex_RAW() {
readString(str, false); readString(str, false);
break; break;
case '\'': // Character literals inside macro args
shiftChar();
readCharacter(str);
break;
case '#': // Raw string literals inside macro args case '#': // Raw string literals inside macro args
str += c; str += c;
shiftChar(); shiftChar();
@@ -2093,6 +2142,7 @@ backslash:
case ')': case ')':
case '\\': // Escapes shared with string literals case '\\': // Escapes shared with string literals
case '"': case '"':
case '\'':
case '{': case '{':
case '}': case '}':
break; break;

View File

@@ -339,6 +339,7 @@
// Literals // Literals
%token <int32_t> NUMBER "number" %token <int32_t> NUMBER "number"
%token <std::string> STRING "string" %token <std::string> STRING "string"
%token <std::string> CHARACTER "character"
%token <std::string> SYMBOL "symbol" %token <std::string> SYMBOL "symbol"
%token <std::string> LABEL "label" %token <std::string> LABEL "label"
%token <std::string> LOCAL "local label" %token <std::string> LOCAL "local label"
@@ -1415,6 +1416,15 @@ relocexpr_no_str:
NUMBER { NUMBER {
$$.makeNumber($1); $$.makeNumber($1);
} }
| CHARACTER {
std::vector<int32_t> output = charmap_Convert($1);
if (output.size() == 1) {
$$.makeNumber(static_cast<uint32_t>(output[0]));
} else {
::error("Character literals must be a single charmap unit");
$$.makeNumber(0);
}
}
| OP_LOGICNOT relocexpr %prec NEG { | OP_LOGICNOT relocexpr %prec NEG {
$$.makeUnaryOp(RPN_LOGNOT, std::move($2)); $$.makeUnaryOp(RPN_LOGNOT, std::move($2));
} }

View File

@@ -293,7 +293,7 @@ yy::parser::symbol_type yylex() {
c = '\r'; c = '\r';
} else if (c == 't') { } else if (c == 't') {
c = '\t'; c = '\t';
} else if (c != '\\' && c != '"') { } else if (c != '\\' && c != '"' && c != '\'') {
scriptError(context, "Cannot escape character %s", printChar(c)); scriptError(context, "Cannot escape character %s", printChar(c));
} }
context.file.sbumpc(); context.file.sbumpc();

View File

@@ -0,0 +1,34 @@
def s equs "d"
charmap "A", 1
charmap "B", 2
charmap "c{s}e", 3
charmap "F", 4, 5, 6
charmap "'", 42
charmap "\"", 1234
charmap "\n\r\t\0", 1337
charmap "',\",\\", 99
MACRO char
assert (\1) == (\2)
ENDM
char 'A', 1
char 'B', 2
char 'c{s}e', 3
char '\'', 42
char '"', 1234
char '\n\r\t\0', 1337
char '\',",\\', 99
char charval("c{s}e", 0), 'c{s}e'
def v equs "\n\r\t\0"
def x = '{v}'
char x, '\n\r\t\0'
; errors
char '?', $3f ; ASCII
char 'F', 0
char 'ABF', 0
char '\n\r\t', 0

View File

@@ -0,0 +1,15 @@
warning: character-literals.asm(31) -> character-literals.asm::char(13): [-Wunmapped-char]
Unmapped character '?'
error: character-literals.asm(32) -> character-literals.asm::char(13):
Character literals must be a single charmap unit
error: character-literals.asm(33) -> character-literals.asm::char(13):
Character literals must be a single charmap unit
warning: character-literals.asm(34) -> character-literals.asm::char(13): [-Wunmapped-char]
Unmapped character '\n'
warning: character-literals.asm(34) -> character-literals.asm::char(13): [-Wunmapped-char]
Unmapped character '\r'
warning: character-literals.asm(34) -> character-literals.asm::char(13): [-Wunmapped-char]
Unmapped character '\t'
error: character-literals.asm(34) -> character-literals.asm::char(13):
Character literals must be a single charmap unit
Assembly aborted with 3 errors!

View File

@@ -1,3 +1,3 @@
assert 1 +# 1 == 2 assert 1 +# 1 == 2
assert 2 '?* 2 == 4 assert 2 ?<EFBFBD>* 2 == 4
assert 3 **?''?##?? 3 == 27 assert 3 **?<EFBFBD>?##?? 3 == 27

View File

@@ -1,9 +1,9 @@
error: garbage_sequence.asm(1): error: garbage_sequence.asm(1):
Unknown character '#' Unknown character '#'
error: garbage_sequence.asm(2): error: garbage_sequence.asm(2):
Unknown characters ''', '?' Unknown characters '?', 0xFF
error: garbage_sequence.asm(3): error: garbage_sequence.asm(3):
Unknown characters '?', ''', ''', '?' Unknown characters '?', 0xFF, '?'
error: garbage_sequence.asm(3): error: garbage_sequence.asm(3):
Unknown character '#' Unknown character '#'
error: garbage_sequence.asm(3): error: garbage_sequence.asm(3):