parser: keep string aliases as the user wrote them

Currently our scanner decodes all the escapes in the strings, and we
later re-escape the strings when we emit them.

This is troublesome, as we do not respect the user input.  For
instance, when the user writes in UTF-8, we destroy her string when we
write it back.

And this shows everywhere: in the reports we show the escaped string
instead of the actual alias:

    0 $accept: . exp $end
    1 exp: . exp "\342\212\225" exp
    2    | . exp "+" exp
    3    | . exp "+" exp
    4    | . "number"
    5    | . "\303\221\303\271\341\271\203\303\251\342\204\235\303\264"

    "number"                                                    shift, and go to state 1
    "\303\221\303\271\341\271\203\303\251\342\204\235\303\264"  shift, and go to state 2

This commit preserves the user's exact spelling of the string aliases,
instead of interpreting the escapes and then re-escaping.  The report
now shows:

    0 $accept: . exp $end
    1 exp: . exp "⊕" exp
    2    | . exp "+" exp
    3    | . exp "+" exp
    4    | . "number"
    5    | . "Ñùṃéℝô"

    "number"  shift, and go to state 1
    "Ñùṃéℝô"  shift, and go to state 2

Likewise, the XML (and therefore HTML) outputs are fixed.

* src/scan-gram.l (STRING, TSTRING): Do not interpret the escapes in
the resulting string.
* src/parse-gram.y (unquote, parser_init, parser_free, unquote_free)
(handle_defines, handle_language, obstack_for_unquote): New.
Use them to unquote where needed.
* tests/regression.at, tests/report.at: Update.
2026-03-14 06:43:03 +00:00 · 2020-06-13 08:46:58 +02:00
parent 5d5e1df1dc
commit 5855da4722
7 changed files with 266 additions and 129 deletions
--- a/src/scan-gram.l
+++ b/src/scan-gram.l
@@ -88,16 +88,15 @@ static boundary scanner_cursor;
  do {                                                          \
    verify (UCHAR_MAX < ULONG_MAX);                             \
    long c = Char;                                              \
-    if (0 < c && c <= UCHAR_MAX)                                \
-      STRING_1GROW (c);                                         \
+    bool valid = 0 < c && c <= UCHAR_MAX;                       \
+    if (!valid)                                                 \
+      complain (loc, complaint,                                 \
+                _("invalid number after \\-escape: %s"),        \
+                yytext + 1);                                    \
+    if (YY_START == SC_ESCAPED_CHARACTER)                       \
+      STRING_1GROW (valid ? c : '?');                           \
    else                                                        \
-      {                                                         \
-        complain (loc, complaint,                               \
-                  _("invalid number after \\-escape: %s"),      \
-                  yytext + 1);                                  \
-        /* Avoid additional errors about empty char literal. */ \
-        STRING_1GROW ('?');                                     \
-      }                                                         \
+      STRING_GROW ();                                           \
  } while (0)


@@ -337,8 +336,8 @@ eqopt    ({sp}=)?
  "'"         token_start = loc->start; BEGIN SC_ESCAPED_CHARACTER;

  /* Strings. */
-  "\""        token_start = loc->start; BEGIN SC_ESCAPED_STRING;
-  "_(\""      token_start = loc->start; BEGIN SC_ESCAPED_TSTRING;
+  "\""        token_start = loc->start; STRING_1GROW ('"'); BEGIN SC_ESCAPED_STRING;
+  "_(\""      token_start = loc->start; STRING_1GROW ('"'); BEGIN SC_ESCAPED_TSTRING;

  /* Prologue. */
  "%{"        code_start = loc->start; BEGIN SC_PROLOGUE;
@@ -559,6 +558,7 @@ eqopt    ({sp}=)?
 <SC_ESCAPED_STRING>
 {
  "\"" {
+    STRING_1GROW ('"');
    STRING_FINISH ();
    BEGIN INITIAL;
    loc->start = token_start;
@@ -571,6 +571,7 @@ eqopt    ({sp}=)?
 <SC_ESCAPED_TSTRING>
 {
  "\")" {
+    STRING_1GROW ('"');
    STRING_FINISH ();
    BEGIN INITIAL;
    loc->start = token_start;
@@ -664,16 +665,16 @@ eqopt    ({sp}=)?
    STRING_GROW_ESCAPE (strtol (yytext + 2, NULL, 16));
  }

-  \\a   STRING_1GROW ('\a');
-  \\b   STRING_1GROW ('\b');
-  \\f   STRING_1GROW ('\f');
-  \\n   STRING_1GROW ('\n');
-  \\r   STRING_1GROW ('\r');
-  \\t   STRING_1GROW ('\t');
-  \\v   STRING_1GROW ('\v');
+  \\a   STRING_GROW_ESCAPE ('\a');
+  \\b   STRING_GROW_ESCAPE ('\b');
+  \\f   STRING_GROW_ESCAPE ('\f');
+  \\n   STRING_GROW_ESCAPE ('\n');
+  \\r   STRING_GROW_ESCAPE ('\r');
+  \\t   STRING_GROW_ESCAPE ('\t');
+  \\v   STRING_GROW_ESCAPE ('\v');

  /* \\[\"\'?\\] would be shorter, but it confuses xgettext.  */
-  \\("\""|"'"|"?"|"\\")  STRING_1GROW (yytext[1]);
+  \\("\""|"'"|"?"|"\\")  STRING_GROW_ESCAPE (yytext[1]);

  \\(u|U[0-9abcdefABCDEF]{4})[0-9abcdefABCDEF]{4} {
    STRING_GROW_ESCAPE (convert_ucn_to_byte (yytext));
@@ -1024,7 +1025,7 @@ gram_scanner_open (const char *gram)


 void
-gram_scanner_close ()
+gram_scanner_close (void)
 {
  xfclose (gram_in);
  /* Reclaim Flex's buffers.  */
@@ -1032,7 +1033,6 @@ gram_scanner_close ()
 }


-
 void
 gram_scanner_free (void)
 {