From 251e1b137fd752be2907703c83de8fa52fd137f2 Mon Sep 17 00:00:00 2001
From: Akim Demaille <akim.demaille@gmail.com>
Date: Sat, 13 Jun 2020 11:09:53 +0200
Subject: [PATCH] reports: the column width differs from the byte count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

From

    "number"          shift, and go to state 1
    "Ñùṃéℝô"  shift, and go to state 2

to

    "number"  shift, and go to state 1
    "Ñùṃéℝô"  shift, and go to state 2

* src/print.c: Use mbswidth, not strlen, to compute visual columns.
* tests/report.at: Adjust.
---
 NEWS            | 20 +++++++++++++++++++-
 src/print.c     | 13 +++++++------
 tests/report.at | 27 ++++++++++++++++-----------
 3 files changed, 42 insertions(+), 18 deletions(-)

diff --git a/NEWS b/NEWS
index 1cec2252..1584d49e 100644
--- a/NEWS
+++ b/NEWS
@@ -27,6 +27,23 @@ GNU Bison NEWS
   header.  This is disabled when the generated header is `y.tab.h`, to
   comply with Automake's ylwrap.
 
+*** String aliases are faithfully propagated
+
+  Bison used to interpret user strings (i.e., decoding backslash escapes)
+  when reading them, and to escape them (i.e., issuing non-printable
+  characters as backslash escapes, taking the locale into account) when
+  outputting them.  As a consequence non-ASCII strings (say in UTF-8) ended
+  up "ciphered" as sequences of backslash escapes.  This happened not only
+  in the generated sources (where the compiler will reinterpret them), but
+  also in all the generated reports (text, xml, html, dot, etc.).  Reports
+  were therefore not readable when string aliases were not pure ASCII.
+  Worse yet: the output depended on the user's locale.
+
+  Now Bison faithfully treats the string aliases exactly the way the user
+  spelled them.  This fixes all the aforementioned problems.  However, now,
+  string aliases semantically equivalent but syntactically different (e.g.,
+  "A", "\x41", "\101") are considered to be different.
+
 ** New features
 
 *** File prefix mapping
@@ -4205,7 +4222,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  LocalWords:  yysymbol yytnamerr yyreport ctx ARGMAX yysyntax stderr LPAREN
  LocalWords:  symrec yypcontext TOKENMAX yyexpected YYEMPTY yypstate YYEOF
  LocalWords:  autocompletion bistromathic submessages Cayuela lexcalc hoc
- LocalWords:  yytoken YYUNDEF YYerror
+ LocalWords:  yytoken YYUNDEF YYerror basename Automake's UTF ifdef ffile
+ LocalWords:  gotos readline
 
 Local Variables:
 ispell-dictionary: "american"
diff --git a/src/print.c b/src/print.c
index 1da0f9dd..91b44cb3 100644
--- a/src/print.c
+++ b/src/print.c
@@ -22,6 +22,7 @@
 #include "system.h"
 
 #include <bitset.h>
+#include <mbswidth.h>
 
 #include "closure.h"
 #include "conflicts.h"
@@ -49,7 +50,7 @@ static bitset no_reduce_set;
 static void
 max_length (size_t *width, const char *str)
 {
-  size_t len = strlen (str);
+  size_t len = mbswidth (str, 0);
   if (len > *width)
     *width = len;
 }
@@ -130,7 +131,7 @@ print_transitions (state *s, FILE *out, bool display_transitions_p)
         state *s1 = trans->states[i];
 
         fprintf (out, "    %s", tag);
-        for (int j = width - strlen (tag); j > 0; --j)
+        for (int j = width - mbswidth (tag, 0); j > 0; --j)
           fputc (' ', out);
         if (display_transitions_p)
           fprintf (out, _("shift, and go to state %d\n"), s1->number);
@@ -168,7 +169,7 @@ print_errs (FILE *out, state *s)
       {
         const char *tag = errp->symbols[i]->tag;
         fprintf (out, "    %s", tag);
-        for (int j = width - strlen (tag); j > 0; --j)
+        for (int j = width - mbswidth (tag, 0); j > 0; --j)
           fputc (' ', out);
         fputs (_("error (nonassociative)\n"), out);
       }
@@ -187,7 +188,7 @@ print_reduction (FILE *out, size_t width,
                  rule *r, bool enabled)
 {
   fprintf (out, "    %s", lookahead_token);
-  for (int j = width - strlen (lookahead_token); j > 0; --j)
+  for (int j = width - mbswidth (lookahead_token, 0); j > 0; --j)
     fputc (' ', out);
   if (!enabled)
     fputc ('[', out);
@@ -232,7 +233,7 @@ print_reductions (FILE *out, state *s)
   /* Compute the width of the lookahead token column.  */
   size_t width = 0;
   if (default_reduction)
-    width = strlen (_("$default"));
+    width = mbswidth (_("$default"), 0);
 
   if (reds->lookahead_tokens)
     for (int i = 0; i < ntokens; i++)
@@ -404,7 +405,7 @@ print_nonterminal_symbols (FILE *out)
             break;
         }
 
-      int column = 4 + strlen (tag);
+      int column = 4 + mbswidth (tag, 0);
       fprintf (out, "%4s%s", "", tag);
       if (symbols[i]->content->type_name)
         column += fprintf (out, " <%s>",
diff --git a/tests/report.at b/tests/report.at
index 27d4e7b8..21708070 100644
--- a/tests/report.at
+++ b/tests/report.at
@@ -1150,6 +1150,11 @@ AT_SETUP([Reports with conflicts])
 
 AT_KEYWORDS([report])
 
+# Screen widths of UTF-8 strings are only computed correctly in a UTF-8
+# locale: skip if none is installed.  Use =, test == is not portable.
+locale=`locale -a | $EGREP '^en_US\.(UTF-8|utf8)$' | sed 1q`
+AT_SKIP_IF([test x = x"$locale"])
+
 AT_BISON_OPTION_PUSHDEFS
 AT_DATA([input.y],
 [[%left "+"
@@ -1162,7 +1167,7 @@ exp
 | "Ñùṃéℝô"
 ]])
 
-AT_BISON_CHECK([-o input.cc -rall --graph=input.gv --xml input.y], [], [],
+AT_CHECK([LC_ALL="$locale" $5 bison -fno-caret -o input.cc -rall --graph=input.gv --xml input.y], [], [],
 [[input.y: warning: 3 shift/reduce conflicts [-Wconflicts-sr]
 input.y: warning: 3 reduce/reduce conflicts [-Wconflicts-rr]
 input.y: warning: rerun with option '-Wcounterexamples' to generate conflict counterexamples [-Wother]
@@ -1219,7 +1224,7 @@ State 0
     4    | . "number"
     5    | . "Ñùṃéℝô"
 
-    "number"          shift, and go to state 1
+    "number"  shift, and go to state 1
     "Ñùṃéℝô"  shift, and go to state 2
 
     exp  go to state 3
@@ -1246,9 +1251,9 @@ State 3
     2    | exp . "+" exp
     3    | exp . "+" exp
 
-    $end   shift, and go to state 4
-    "+"    shift, and go to state 5
-    "⊕"  shift, and go to state 6
+    $end  shift, and go to state 4
+    "+"   shift, and go to state 5
+    "⊕"   shift, and go to state 6
 
 
 State 4
@@ -1268,7 +1273,7 @@ State 5
     4    | . "number"
     5    | . "Ñùṃéℝô"
 
-    "number"          shift, and go to state 1
+    "number"  shift, and go to state 1
     "Ñùṃéℝô"  shift, and go to state 2
 
     exp  go to state 7
@@ -1283,7 +1288,7 @@ State 6
     4    | . "number"
     5    | . "Ñùṃéℝô"
 
-    "number"          shift, and go to state 1
+    "number"  shift, and go to state 1
     "Ñùṃéℝô"  shift, and go to state 2
 
     exp  go to state 8
@@ -1303,8 +1308,8 @@ State 7
     $end      [reduce using rule 3 (exp)]
     "+"       reduce using rule 2 (exp)
     "+"       [reduce using rule 3 (exp)]
-    "⊕"     [reduce using rule 2 (exp)]
-    "⊕"     [reduce using rule 3 (exp)]
+    "⊕"       [reduce using rule 2 (exp)]
+    "⊕"       [reduce using rule 3 (exp)]
     $default  reduce using rule 2 (exp)
 
     Conflict between rule 2 and token "+" resolved as reduce (%left "+").
@@ -1317,11 +1322,11 @@ State 8
     2    | exp . "+" exp
     3    | exp . "+" exp
 
-    "+"    shift, and go to state 5
+    "+"  shift, and go to state 5
     "⊕"  shift, and go to state 6
 
     "+"       [reduce using rule 1 (exp)]
-    "⊕"     [reduce using rule 1 (exp)]
+    "⊕"       [reduce using rule 1 (exp)]
     $default  reduce using rule 1 (exp)
 ]])