reports: the column width differs from the byte count

From

    "number"          shift, and go to state 1
    "Ñùṃéℝô"  shift, and go to state 2

to

    "number"  shift, and go to state 1
    "Ñùṃéℝô"  shift, and go to state 2

* src/print.c: Use mbswidth, not strlen, to compute visual columns.
* tests/report.at: Adjust.
2026-06-08 08:42:35 +00:00 · 2020-06-13 11:09:53 +02:00
parent efbcadeca7
commit 251e1b137f
3 changed files with 42 additions and 18 deletions
@@ -27,6 +27,23 @@ GNU Bison NEWS
  header.  This is disabled when the generated header is `y.tab.h`, to
  comply with Automake's ylwrap.

+*** String aliases are faithfully propagated
+
+  Bison used to interpret user strings (i.e., decoding backslash escapes)
+  when reading them, and to escape them (i.e., issue non-printable
+  characters as backslash escapes, taking the locale into account) when
+  outputting them.  As a consequence non-ASCII strings (say in UTF-8) ended
+  up "ciphered" as sequences of backslash escapes.  This happened not only
+  in the generated sources (where the compiler will reinterpret them), but
+  also in all the generated reports (text, xml, html, dot, etc.).  Reports
+  were therefore not readable when string aliases were not pure ASCII.
+  Worse yet: the output depended on the user's locale.
+
+  Now Bison faithfully treats the string aliases exactly the way the user
+  spelled them.  This fixes all the aforementioned problems.  However, now,
+  string aliases semantically equivalent but syntactically different (e.g.,
+  "A", "\x41", "\101") are considered to be different.
+
 ** New features

 *** File prefix mapping
@@ -4205,7 +4222,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 LocalWords:  yysymbol yytnamerr yyreport ctx ARGMAX yysyntax stderr LPAREN
 LocalWords:  symrec yypcontext TOKENMAX yyexpected YYEMPTY yypstate YYEOF
 LocalWords:  autocompletion bistromathic submessages Cayuela lexcalc hoc
- LocalWords:  yytoken YYUNDEF YYerror
+ LocalWords:  yytoken YYUNDEF YYerror basename Automake's UTF ifdef ffile
+ LocalWords:  gotos readline

 Local Variables:
 ispell-dictionary: "american"
@@ -22,6 +22,7 @@
 #include "system.h"

 #include <bitset.h>
+#include <mbswidth.h>

 #include "closure.h"
 #include "conflicts.h"
@@ -49,7 +50,7 @@ static bitset no_reduce_set;
 static void
 max_length (size_t *width, const char *str)
 {
-  size_t len = strlen (str);
+  size_t len = mbswidth (str, 0);
  if (len > *width)
    *width = len;
 }
@@ -130,7 +131,7 @@ print_transitions (state *s, FILE *out, bool display_transitions_p)
        state *s1 = trans->states[i];

        fprintf (out, "    %s", tag);
-        for (int j = width - strlen (tag); j > 0; --j)
+        for (int j = width - mbswidth (tag, 0); j > 0; --j)
          fputc (' ', out);
        if (display_transitions_p)
          fprintf (out, _("shift, and go to state %d\n"), s1->number);
@@ -168,7 +169,7 @@ print_errs (FILE *out, state *s)
      {
        const char *tag = errp->symbols[i]->tag;
        fprintf (out, "    %s", tag);
-        for (int j = width - strlen (tag); j > 0; --j)
+        for (int j = width - mbswidth (tag, 0); j > 0; --j)
          fputc (' ', out);
        fputs (_("error (nonassociative)\n"), out);
      }
@@ -187,7 +188,7 @@ print_reduction (FILE *out, size_t width,
                 rule *r, bool enabled)
 {
  fprintf (out, "    %s", lookahead_token);
-  for (int j = width - strlen (lookahead_token); j > 0; --j)
+  for (int j = width - mbswidth (lookahead_token, 0); j > 0; --j)
    fputc (' ', out);
  if (!enabled)
    fputc ('[', out);
@@ -232,7 +233,7 @@ print_reductions (FILE *out, state *s)
  /* Compute the width of the lookahead token column.  */
  size_t width = 0;
  if (default_reduction)
-    width = strlen (_("$default"));
+    width = mbswidth (_("$default"), 0);

  if (reds->lookahead_tokens)
    for (int i = 0; i < ntokens; i++)
@@ -404,7 +405,7 @@ print_nonterminal_symbols (FILE *out)
            break;
        }

-      int column = 4 + strlen (tag);
+      int column = 4 + mbswidth (tag, 0);
      fprintf (out, "%4s%s", "", tag);
      if (symbols[i]->content->type_name)
        column += fprintf (out, " <%s>",
@@ -1150,6 +1150,11 @@ AT_SETUP([Reports with conflicts])

 AT_KEYWORDS([report])

+# We need UTF-8 support for correct screen-width computation of UTF-8
+# characters.  Skip the test if not available.
+locale=`locale -a | $EGREP '^en_US\.(UTF-8|utf8)$' | sed 1q`
+AT_SKIP_IF([test x == x"$locale"])
+
 AT_BISON_OPTION_PUSHDEFS
 AT_DATA([input.y],
 [[%left "+"
@@ -1162,7 +1167,7 @@ exp
 | "Ñùṃéℝô"
 ]])

-AT_BISON_CHECK([-o input.cc -rall --graph=input.gv --xml input.y], [], [],
+AT_CHECK([LC_ALL="$locale" $5 bison -fno-caret -o input.cc -rall --graph=input.gv --xml input.y], [], [],
 [[input.y: warning: 3 shift/reduce conflicts [-Wconflicts-sr]
 input.y: warning: 3 reduce/reduce conflicts [-Wconflicts-rr]
 input.y: warning: rerun with option '-Wcounterexamples' to generate conflict counterexamples [-Wother]
@@ -1219,7 +1224,7 @@ State 0
    4    | . "number"
    5    | . "Ñùṃéℝô"

-    "number"          shift, and go to state 1
+    "number"  shift, and go to state 1
    "Ñùṃéℝô"  shift, and go to state 2

    exp  go to state 3
@@ -1246,9 +1251,9 @@ State 3
    2    | exp . "+" exp
    3    | exp . "+" exp

-    $end   shift, and go to state 4
-    "+"    shift, and go to state 5
-    "⊕"  shift, and go to state 6
+    $end  shift, and go to state 4
+    "+"   shift, and go to state 5
+    "⊕"   shift, and go to state 6


 State 4
@@ -1268,7 +1273,7 @@ State 5
    4    | . "number"
    5    | . "Ñùṃéℝô"

-    "number"          shift, and go to state 1
+    "number"  shift, and go to state 1
    "Ñùṃéℝô"  shift, and go to state 2

    exp  go to state 7
@@ -1283,7 +1288,7 @@ State 6
    4    | . "number"
    5    | . "Ñùṃéℝô"

-    "number"          shift, and go to state 1
+    "number"  shift, and go to state 1
    "Ñùṃéℝô"  shift, and go to state 2

    exp  go to state 8
@@ -1303,8 +1308,8 @@ State 7
    $end      [reduce using rule 3 (exp)]
    "+"       reduce using rule 2 (exp)
    "+"       [reduce using rule 3 (exp)]
-    "⊕"     [reduce using rule 2 (exp)]
-    "⊕"     [reduce using rule 3 (exp)]
+    "⊕"       [reduce using rule 2 (exp)]
+    "⊕"       [reduce using rule 3 (exp)]
    $default  reduce using rule 2 (exp)

    Conflict between rule 2 and token "+" resolved as reduce (%left "+").
@@ -1317,11 +1322,11 @@ State 8
    2    | exp . "+" exp
    3    | exp . "+" exp

-    "+"    shift, and go to state 5
+    "+"  shift, and go to state 5
    "⊕"  shift, and go to state 6

    "+"       [reduce using rule 1 (exp)]
-    "⊕"     [reduce using rule 1 (exp)]
+    "⊕"       [reduce using rule 1 (exp)]
    $default  reduce using rule 1 (exp)
 ]])