From d049ffc0f0a508f9d48d27e850692f54b6c062f8 Mon Sep 17 00:00:00 2001 From: Rangi <35663410+Rangi42@users.noreply.github.com> Date: Tue, 16 Feb 2021 19:44:25 -0800 Subject: [PATCH] Handle string literals within macro arguments (#685) Fixes #683 and #691 The lexer's raw mode for reading macro args already attempted to handle semicolons inside string literals, versus outside ones which start comments. This change reuses the same function for reading string literals in normal and raw modes, also handling: - Commas in strings versus between macro args - Character escapes - {Interpolations} and \1-\9 args inside vs. outside strings - Multi-line string literals Macro args now allow escaping '\,', '\"', and '\\'. A consistent model for expanding macro args and interpolations, within macro args, string literals, and normal context: - "{S}" should always equal the contents of S - "\1" should always act like quoting the value of \1 --- src/asm/lexer.c | 282 ++++++++++++++++++++++++------- src/asm/rgbasm.5 | 16 +- test/asm/macro-arg-in-string.asm | 12 +- test/asm/macro-arg-in-string.err | 2 +- test/asm/macro-arg-in-string.out | 7 +- test/asm/multi-line-strings.asm | 3 +- test/asm/multi-line-strings.err | 5 +- test/asm/multi-line-strings.out | 6 +- test/asm/quine.asm | 14 ++ test/asm/quine.err | 0 test/asm/quine.out | 14 ++ test/asm/quine2.asm | 4 + test/asm/quine2.err | 0 test/asm/quine2.out | 4 + test/asm/raw-macro-args.asm | 38 +++++ test/asm/raw-macro-args.err | 9 + test/asm/raw-macro-args.out | 23 +++ 17 files changed, 360 insertions(+), 79 deletions(-) create mode 100644 test/asm/quine.asm create mode 100644 test/asm/quine.err create mode 100644 test/asm/quine.out create mode 100644 test/asm/quine2.asm create mode 100644 test/asm/quine2.err create mode 100644 test/asm/quine2.out create mode 100644 test/asm/raw-macro-args.asm create mode 100644 test/asm/raw-macro-args.err create mode 100644 test/asm/raw-macro-args.out diff --git a/src/asm/lexer.c b/src/asm/lexer.c index 
e07553d3..493b2fea 100644 --- a/src/asm/lexer.c +++ b/src/asm/lexer.c @@ -845,7 +845,6 @@ restart: } } else if (c == '{' && !lexerState->disableInterpolation) { /* If character is an open brace, do symbol interpolation */ - lexerState->macroArgScanDistance++; shiftChars(1); char const *ptr = readInterpolation(); @@ -1247,7 +1246,7 @@ static void readGfxConstant(void) yylval.nConstValue = bp1 << 8 | bp0; } -/* Function to read identifiers & keywords */ +/* Functions to read identifiers & keywords */ static bool startsIdentifier(int c) { @@ -1373,51 +1372,39 @@ static char const *readInterpolation(void) return NULL; } -static int appendMacroArg(char const *str, int i) +#define append_yylval_tzString(c) do { \ + if (i < sizeof(yylval.tzString)) \ + yylval.tzString[i++] = (c); \ +} while (0) + +static size_t appendEscapedSubstring(char const *str, size_t i) { - while (*str && i < sizeof(yylval.tzString)) { + /* Copy one extra to flag overflow */ + while (*str) { int c = *str++; - if (c != '\\') { - yylval.tzString[i++] = c; - continue; - } - - c = *str++; - + /* Escape characters that need escaping */ switch (c) { - case '\\': /* Return that character unchanged */ + case '\\': case '"': case '{': - case '}': + append_yylval_tzString('\\'); break; - case 'n': - c = '\n'; + case '\n': + append_yylval_tzString('\\'); + c = 'n'; break; - case 'r': - c = '\r'; + case '\r': + append_yylval_tzString('\\'); + c = 'r'; break; - case 't': - c = '\t'; - break; - - case '\0': /* Can't really print that one */ - error("Illegal character escape at end of macro arg\n"); - yylval.tzString[i++] = '\\'; - break; - - /* - * Line continuations and macro args were already - * handled while reading the macro args, so '\@', - * '\#', and '\0'-'\9' should not occur here. 
- */ - - default: - error("Illegal character escape '%s'\n", print(c)); - c = '\\'; + case '\t': + append_yylval_tzString('\\'); + c = 't'; break; } - yylval.tzString[i++] = c; + + append_yylval_tzString(c); } return i; @@ -1479,10 +1466,11 @@ static void readString(void) case '\\': // Character escape or macro arg c = peek(0); switch (c) { - case '\\': // Return that character unchanged + case '\\': case '"': case '{': case '}': + // Return that character unchanged shiftChars(1); break; case 'n': @@ -1521,16 +1509,18 @@ static void readString(void) shiftChars(1); char const *str = readMacroArg(c); - i = appendMacroArg(str, i); + while (*str) + append_yylval_tzString(*str++); continue; // Do not copy an additional character case EOF: // Can't really print that one error("Illegal character escape at end of input\n"); c = '\\'; break; + default: error("Illegal character escape '%s'\n", print(c)); - c = '\\'; + shiftChars(1); break; } break; @@ -1542,16 +1532,15 @@ static void readString(void) char const *ptr = readInterpolation(); if (ptr) - while (*ptr && i < sizeof(yylval.tzString)) - yylval.tzString[i++] = *ptr++; + while (*ptr) + append_yylval_tzString(*ptr++); lexerState->disableMacroArgs = true; continue; // Do not copy an additional character // Regular characters will just get copied } - if (i < sizeof(yylval.tzString)) // Copy one extra to flag overflow - yylval.tzString[i++] = c; + append_yylval_tzString(c); } finish: @@ -1566,6 +1555,155 @@ finish: lexerState->disableInterpolation = false; } +static size_t appendStringLiteral(size_t i) +{ + dbgPrint("Reading string\n"); + lexerState->disableMacroArgs = true; + lexerState->disableInterpolation = true; + + bool multiline = false; + + // We reach this function after reading a single quote, but we also support triple quotes + append_yylval_tzString('"'); + if (peek(0) == '"') { + append_yylval_tzString('"'); + shiftChars(1); + if (peek(0) == '"') { + // """ begins a multi-line string + 
append_yylval_tzString('"'); + shiftChars(1); + multiline = true; + } else { + // "" is an empty string, skip the loop + goto finish; + } + } + + for (;;) { + int c = peek(0); + + // '\r', '\n' or EOF ends a single-line string early + if (c == EOF || (!multiline && (c == '\r' || c == '\n'))) { + error("Unterminated string\n"); + break; + } + + // We'll be staying in the string, so we can safely consume the char + shiftChars(1); + + // Handle '\r' or '\n' (in multiline strings only, already handled above otherwise) + if (c == '\r' || c == '\n') { + /* Handle CRLF before nextLine() since shiftChars updates colNo */ + if (c == '\r' && peek(0) == '\n') + shiftChars(1); + nextLine(); + c = '\n'; + } + + switch (c) { + case '"': + if (multiline) { + // Only """ ends a multi-line string + if (peek(0) != '"' || peek(1) != '"') + break; + append_yylval_tzString('"'); + append_yylval_tzString('"'); + shiftChars(2); + } + append_yylval_tzString('"'); + goto finish; + + case '\\': // Character escape or macro arg + c = peek(0); + switch (c) { + // Character escape + case '\\': + case '"': + case '{': + case '}': + case 'n': + case 'r': + case 't': + // Return that character unchanged + append_yylval_tzString('\\'); + shiftChars(1); + break; + + // Line continuation + case ' ': + case '\r': + case '\n': + readLineContinuation(); + continue; + + // Macro arg + case '@': + case '#': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + shiftChars(1); + char const *str = readMacroArg(c); + + i = appendEscapedSubstring(str, i); + continue; // Do not copy an additional character + + case EOF: // Can't really print that one + error("Illegal character escape at end of input\n"); + c = '\\'; + break; + + case ',': /* `\,` inside a macro arg string literal */ + warning(WARNING_OBSOLETE, + "`\\,` is deprecated inside strings\n"); + shiftChars(1); + break; + + default: + error("Illegal character escape '%s'\n", print(c)); 
+ shiftChars(1); + break; + } + break; + + case '{': // Symbol interpolation + // We'll be exiting the string scope, so re-enable expansions + // (Not interpolations, since they're handled by the function itself...) + lexerState->disableMacroArgs = false; + char const *ptr = readInterpolation(); + + if (ptr) + i = appendEscapedSubstring(ptr, i); + lexerState->disableMacroArgs = true; + continue; // Do not copy an additional character + + // Regular characters will just get copied + } + + append_yylval_tzString(c); + } + +finish: + if (i == sizeof(yylval.tzString)) { + i--; + warning(WARNING_LONG_STR, "String constant too long\n"); + } + yylval.tzString[i] = '\0'; + + dbgPrint("Read string \"%s\"\n", yylval.tzString); + lexerState->disableMacroArgs = false; + lexerState->disableInterpolation = false; + + return i; +} + /* Function to report one character's worth of garbage bytes */ static char const *reportGarbageChar(unsigned char firstByte) @@ -1835,6 +1973,7 @@ static int yylex_NORMAL(void) case EOF: error("Illegal character escape at end of input\n"); break; + default: shiftChars(1); error("Illegal character escape '%s'\n", print(c)); @@ -1886,9 +2025,8 @@ static int yylex_RAW(void) dbgPrint("Lexing in raw mode, line=%" PRIu32 ", col=%" PRIu32 "\n", lexer_GetLineNo(), lexer_GetColNo()); - /* This is essentially a modified `readString` */ + /* This is essentially a modified `appendStringLiteral` */ size_t i = 0; - bool insideString = false; /* Trim left of string... 
*/ while (isWhitespace(peek(0))) @@ -1898,18 +2036,16 @@ static int yylex_RAW(void) int c = peek(0); switch (c) { - case '"': - insideString = !insideString; - /* Other than that, just process quotes normally */ + case '"': /* String literals inside macro args */ + shiftChars(1); + i = appendStringLiteral(i); break; case ';': /* Comments inside macro args */ - if (insideString) - break; discardComment(); c = peek(0); /* fallthrough */ - case ',': + case ',': /* End of macro arg */ case '\r': case '\n': case EOF: @@ -1939,16 +2075,30 @@ static int yylex_RAW(void) return T_STRING; case '\\': /* Character escape */ - c = peek(1); + shiftChars(1); /* Shift the backslash */ + c = peek(0); + switch (c) { - case ',': - shiftChars(1); + case ',': /* Escape `\,` only inside a macro arg */ + case '\\': /* Escapes shared with string literals */ + case '"': + case '{': + case '}': + break; + + case 'n': + c = '\n'; + break; + case 'r': + c = '\r'; + break; + case 't': + c = '\t'; break; case ' ': case '\r': case '\n': - shiftChars(1); /* Shift the backslash */ readLineContinuation(); continue; @@ -1956,20 +2106,28 @@ static int yylex_RAW(void) error("Illegal character escape at end of input\n"); c = '\\'; break; - default: /* Pass the rest as-is */ - c = '\\'; + + /* + * Macro args were already handled by peek, so '\@', + * '\#', and '\0'-'\9' should not occur here. + */ + + default: + error("Illegal character escape '%s'\n", print(c)); break; } - break; + /* fallthrough */ - /* Regular characters will just get copied */ + default: /* Regular characters will just get copied */ + append_yylval_tzString(c); + shiftChars(1); + break; } - if (i < sizeof(yylval.tzString)) /* Copy one extra to flag overflow */ - yylval.tzString[i++] = c; - shiftChars(1); } } +#undef append_yylval_tzString + /* * This function uses the fact that `if`, etc. constructs are only valid when * there's nothing before them on their lines. 
This enables filtering diff --git a/src/asm/rgbasm.5 b/src/asm/rgbasm.5 index 5718a13f..e0956f2a 100644 --- a/src/asm/rgbasm.5 +++ b/src/asm/rgbasm.5 @@ -235,7 +235,6 @@ There are a number of escape sequences you can use within a string: .It Sy String Ta Sy Meaning .It Ql \[rs]\[rs] Ta Produces a backslash .It Ql \[rs]" Ta Produces a double quote without terminating -.It Ql \[rs], Ta Comma .It Ql \[rs]{ Ta Curly bracket left .It Ql \[rs]} Ta Curly bracket right .It Ql \[rs]n Ta Newline ($0A) @@ -1088,6 +1087,10 @@ definition ENDM .Ed .El +.Pp +Macro arguments support all the escape sequences of strings, as well as +.Ql \[rs], +to escape commas, since those otherwise separate arguments. .Ss Exporting and importing symbols Importing and exporting of symbols is a feature that is very useful when your project spans many source files and, for example, you need to jump to a routine defined in another file. .Pp @@ -1462,16 +1465,13 @@ PrintMacro: MACRO ENDM PrintMacro STRCAT("Hello "\[rs], \[rs] - "world\[rs]\[rs]n") + "world\[rs]n") .Ed .Pp The comma needs to be escaped to avoid it being treated as separating the macro's arguments. -The backslash -.Sq \[rs] -.Pq from Sq \[rs]n -also needs to be escaped because of the way -.Nm -processes macro arguments. +The backslash in +.Ql \[rs]n +does not need to be escaped because string literals also work as usual inside macro arguments. .Pp In reality, up to 256 arguments can be passed to a macro, but you can only use the first 9 like this. 
If you want to use the rest, you need to use the diff --git a/test/asm/macro-arg-in-string.asm b/test/asm/macro-arg-in-string.asm index a1d93fa2..4982fb2f 100644 --- a/test/asm/macro-arg-in-string.asm +++ b/test/asm/macro-arg-in-string.asm @@ -1,9 +1,12 @@ print1: MACRO + if _NARG == 2 + assert !STRCMP("\1", \2) + endc PRINTLN "\1" ENDM print1 John "Danger" Smith - print1 \\A\nB + print1 \\\\A\\nB\n, "\\\\A\\nB\n" print1 C\ D print1 E\!F ; illegal character escape @@ -15,3 +18,10 @@ ENDM s EQUS "hello" iprint s + +symprint: MACRO + PRINTLN {\1} +ENDM + +hello EQUS "\"goodbye\"" + symprint s diff --git a/test/asm/macro-arg-in-string.err b/test/asm/macro-arg-in-string.err index 058cc43d..1b16204e 100644 --- a/test/asm/macro-arg-in-string.err +++ b/test/asm/macro-arg-in-string.err @@ -1,3 +1,3 @@ -ERROR: macro-arg-in-string.asm(9) -> macro-arg-in-string.asm::print1(2): +ERROR: macro-arg-in-string.asm(12): Illegal character escape '!' error: Assembly aborted (1 errors)! diff --git a/test/asm/macro-arg-in-string.out b/test/asm/macro-arg-in-string.out index 5fd0e1d5..3e47777f 100644 --- a/test/asm/macro-arg-in-string.out +++ b/test/asm/macro-arg-in-string.out @@ -1,6 +1,7 @@ John "Danger" Smith -\A -B +\\A\nB + CD -E\F +E!F hello +goodbye diff --git a/test/asm/multi-line-strings.asm b/test/asm/multi-line-strings.asm index e39635db..fd6b9fe4 100644 --- a/test/asm/multi-line-strings.asm +++ b/test/asm/multi-line-strings.asm @@ -21,7 +21,8 @@ printarg: MACRO ENDM printarg " - printarg """ + printarg """multi-line +string argument""" EMPTY1 EQUS "" EMPTY2 EQUS "\ ; comment diff --git a/test/asm/multi-line-strings.err b/test/asm/multi-line-strings.err index 9f139531..47a604f4 100644 --- a/test/asm/multi-line-strings.err +++ b/test/asm/multi-line-strings.err @@ -1,2 +1,5 @@ -warning: multi-line-strings.asm(34): [-Wuser] +ERROR: multi-line-strings.asm(23): + Unterminated string +warning: multi-line-strings.asm(35): [-Wuser] check the line number +error: Assembly aborted (1 
errors)! diff --git a/test/asm/multi-line-strings.out b/test/asm/multi-line-strings.out index ef7d1180..c88ad8dc 100644 --- a/test/asm/multi-line-strings.out +++ b/test/asm/multi-line-strings.out @@ -8,6 +8,8 @@ The multi-line string can contain: ! arg <"> arg (") -arg <"""> -arg (""") +arg <"""multi-line +string argument"""> +arg ("""multi-line +string argument""") () diff --git a/test/asm/quine.asm b/test/asm/quine.asm new file mode 100644 index 00000000..f70a31af --- /dev/null +++ b/test/asm/quine.asm @@ -0,0 +1,14 @@ +R:MACRO +REPT _NARG +PRINT STRSUB("\n\"\\ ENRST1ABCDFGHIMOPU_n#()+,:>",\1+1,1) +SHIFT +ENDR +ENDM +N:MACRO + R \# +REPT _NARG +PRINT"\1",STRSUB("\n,",(_NARG>1)+1,1) +SHIFT +ENDR +ENDM + N 6,29,18,10,12,6,19,0,6,4,20,8,3,22,5,10,6,15,0,20,6,17,5,8,3,7,8,6,7,21,11,25,1,2,23,2,1,2,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,1,28,2,9,27,9,28,9,26,0,7,16,17,14,8,0,4,5,13,6,0,4,5,13,18,0,5,29,18,10,12,6,19,0,3,6,3,2,24,0,6,4,20,8,3,22,5,10,6,15,0,20,6,17,5,8,1,2,9,1,28,7,8,6,7,21,11,25,1,2,23,28,1,28,25,22,5,10,6,15,30,9,26,27,9,28,9,26,0,7,16,17,14,8,0,4,5,13,6,0,4,5,13,18,0,3,5,3 diff --git a/test/asm/quine.err b/test/asm/quine.err new file mode 100644 index 00000000..e69de29b diff --git a/test/asm/quine.out b/test/asm/quine.out new file mode 100644 index 00000000..f70a31af --- /dev/null +++ b/test/asm/quine.out @@ -0,0 +1,14 @@ +R:MACRO +REPT _NARG +PRINT STRSUB("\n\"\\ ENRST1ABCDFGHIMOPU_n#()+,:>",\1+1,1) +SHIFT +ENDR +ENDM +N:MACRO + R \# +REPT _NARG +PRINT"\1",STRSUB("\n,",(_NARG>1)+1,1) +SHIFT +ENDR +ENDM + N 
6,29,18,10,12,6,19,0,6,4,20,8,3,22,5,10,6,15,0,20,6,17,5,8,3,7,8,6,7,21,11,25,1,2,23,2,1,2,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,1,28,2,9,27,9,28,9,26,0,7,16,17,14,8,0,4,5,13,6,0,4,5,13,18,0,5,29,18,10,12,6,19,0,3,6,3,2,24,0,6,4,20,8,3,22,5,10,6,15,0,20,6,17,5,8,1,2,9,1,28,7,8,6,7,21,11,25,1,2,23,28,1,28,25,22,5,10,6,15,30,9,26,27,9,28,9,26,0,7,16,17,14,8,0,4,5,13,6,0,4,5,13,18,0,3,5,3 diff --git a/test/asm/quine2.asm b/test/asm/quine2.asm new file mode 100644 index 00000000..89d5f1dd --- /dev/null +++ b/test/asm/quine2.asm @@ -0,0 +1,4 @@ +q: macro + println \1,"\1" +endm + q "q: macro\n\tprintln \\1,\"\\1\"\nendm\n\tq " diff --git a/test/asm/quine2.err b/test/asm/quine2.err new file mode 100644 index 00000000..e69de29b diff --git a/test/asm/quine2.out b/test/asm/quine2.out new file mode 100644 index 00000000..89d5f1dd --- /dev/null +++ b/test/asm/quine2.out @@ -0,0 +1,4 @@ +q: macro + println \1,"\1" +endm + q "q: macro\n\tprintln \\1,\"\\1\"\nendm\n\tq " diff --git a/test/asm/raw-macro-args.asm b/test/asm/raw-macro-args.asm new file mode 100644 index 00000000..a7ca23a3 --- /dev/null +++ b/test/asm/raw-macro-args.asm @@ -0,0 +1,38 @@ +printargs: MACRO + rept _NARG + println \1 + shift + endr +ENDM + +printlit: MACRO + rept _NARG + println "\1" + shift + endr +ENDM + +NUM EQU 42 +STR EQUS "str\"ing" + + printargs NUM + printargs "{d:NUM}" + printargs "{STR}", 16 ; comment 1 + printargs "\"literal \\\"\\\\\\\"\"" + printargs "literal \"\\\"", \ ; comment 2 +"""multi-"line" + ""string"" arg""" + printargs MUL(2.0\, 3.0) + printargs "unclosed + + printlit NUM + printlit "{d:NUM}" + printlit "{STR}", 16 ; comment 3 + printlit "\"literal \\\"\\\\\\\"\"" + printlit "literal \"\\\"", \ ; comment 4 +"""multi-"line" + ""string"" arg""" + printlit MUL(2.0\, 3.0) + printlit this\n is\, \{not\} a\\n syntax\" error + printlit "unclosed + printlit """EOF \ No newline at end of file diff --git a/test/asm/raw-macro-args.err 
b/test/asm/raw-macro-args.err new file mode 100644 index 00000000..3a68773e --- /dev/null +++ b/test/asm/raw-macro-args.err @@ -0,0 +1,9 @@ +ERROR: raw-macro-args.asm(26): + Unterminated string +ERROR: raw-macro-args.asm(26) -> raw-macro-args.asm::printargs(2) -> raw-macro-args.asm::printargs::REPT~1(3): + Unterminated string +ERROR: raw-macro-args.asm(37): + Unterminated string +ERROR: raw-macro-args.asm(38): + Unterminated string +error: Assembly aborted (4 errors)! diff --git a/test/asm/raw-macro-args.out b/test/asm/raw-macro-args.out new file mode 100644 index 00000000..277a7ddd --- /dev/null +++ b/test/asm/raw-macro-args.out @@ -0,0 +1,23 @@ +$2A +42 +str"ing +$10 +"literal \"\\\"" +literal "\" +multi-"line" + ""string"" arg +$60000 +unclosed +NUM +"42" +"str\"ing" +16 +"\"literal \\\"\\\\\\\"\"" +"literal \"\\\"" +"""multi-"line" + ""string"" arg""" +MUL(2.0, 3.0) +this + is, {not} a\n syntax" error +"unclosed +"""EOF