diagnostics: fix the handling of multibyte characters

This is a pity: effort was invested in correctly computing the number of screen columns consumed by multibyte characters, but the routines that do so were fed single bytes at a time... As a consequence, Bison never displayed locations correctly when the input contained multibyte characters. * src/scan-gram.l (mbchar): New. Use it instead of '.' in the catch-all clause. * tests/diagnostics.at (Tabulations): Enhance into... (Tabulations and multibyte characters): this.
2026-07-25 15:30:32 +00:00 · 2019-04-23 18:29:10 +02:00
parent 6b6c3de2ae
commit afe7dfd3b9
2 changed files with 52 additions and 12 deletions
@@ -135,11 +135,13 @@ static void unexpected_newline (boundary, char const *);
 %x SC_BRACKETED_ID SC_RETURN_BRACKETED_ID

 letter    [.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_]
-notletter [^.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_]{-}[%\{]
 id        {letter}({letter}|[-0-9])*
 int       [0-9]+
 xint      0[xX][0-9abcdefABCDEF]+

+ /* One UTF-8 encoded Unicode code point (pattern from Flex's documentation; its "\x\90" typo is corrected to "\x90" here).  */
+mbchar    [\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF]([\x80-\xBF]{2})|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF]([\x80-\xBF]{2})|[\xF1-\xF3]([\x80-\xBF]{3})|\xF4[\x80-\x8F]([\x80-\xBF]{2})
+
 /* Zero or more instances of backslash-newline.  Following GCC, allow
   white space between the backslash and the newline.  */
 splice   (\\[ \f\t\v]*\n)*
@@ -790,8 +792,16 @@ eqopt    ({sp}=)?
  | By default, grow the string obstack with the input.  |
  `-----------------------------------------------------*/

-<SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PREDICATE,SC_PROLOGUE,SC_EPILOGUE,SC_STRING,SC_CHARACTER,SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>. |
-  <SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PREDICATE,SC_PROLOGUE,SC_EPILOGUE>\n    STRING_GROW;
+<SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PREDICATE,SC_PROLOGUE,SC_EPILOGUE,SC_STRING,SC_CHARACTER,SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>
+{
+  /* Accept multibyte characters in one block instead of byte after
+     byte, so that add_column_width and mbsnwidth can compute the
+     correct screen width.

+     Add a fallback "|." so that non-UTF-8 input is still accepted
+     and does not jam the scanner.  */
+  {mbchar}|.   STRING_GROW;
+}

 %%

@@ -17,22 +17,31 @@

 AT_BANNER([[Diagnostics.]])

+
 # AT_TEST([TITLE], [GRAMMAR], [OUTPUT-WITH-STYLE])
+# ------------------------------------------------
 m4_pushdef([AT_TEST],
 [
 AT_SETUP([$1])
 AT_KEYWORDS([diagnostics])

+# We need UTF-8 support for correct screen-width computation of UTF-8
+# characters.  Skip the test if not available.
+AT_SKIP_IF([! locale -a | grep '^en_US.UTF-8$'])
+
 AT_BISON_OPTION_PUSHDEFS

 AT_DATA_GRAMMAR([[input.y]], [$2])

 AT_DATA([experr], [$3])
-AT_BISON_CHECK([-fcaret --style=debug -Wall input.y], [], [], [experr])
+# Cannot use AT_BISON_CHECK easily as we need to change the
+# environment.
+# FIXME: Enhance AT_BISON_CHECK.
+AT_CHECK([LC_ALL=en_US.UTF-8 bison -fcaret --style=debug -Wall input.y], [], [], [experr])

-# When no style, same messages, except the style.
+# When no style, same messages, but without style.
 AT_CHECK([perl -pi -e 's{</?\w+>}{}g' experr])
-AT_BISON_CHECK([-fcaret -Wall input.y], [], [], [experr])
+AT_CHECK([LC_ALL=en_US.UTF-8 bison -fcaret -Wall input.y], [], [], [experr])

 AT_BISON_OPTION_POPDEFS

@@ -106,18 +115,24 @@ input.y:17.2: <warning>warning:</warning> empty rule without %empty [<warning>-W
 ]])


-## ------------- ##
-## Tabulations.  ##
-## ------------- ##
+## -------------------------------------- ##
+## Tabulations and multibyte characters.  ##
+## -------------------------------------- ##

-# Make sure we treat tabulations as eight spaces.
+# Make sure we treat tabulations as eight spaces, and that multibyte
+# characters have correct width.

-AT_TEST([[Tabulations]],
+AT_TEST([[Tabulations and multibyte characters]],
 [[%%
-exp: a b c
+exp: a b c d e f g h
 a: {		}
 b: {            }
 c: {------------}
+d: {éééééééééééé}
+e: {∇⃗×𝐸⃗ = -∂𝐵⃗/∂t}
+f: {	42	}
+g: {	"฿¥$€₦"	}
+h: {	🐃	}
 ]],
 [[input.y:11.4-17: <warning>warning:</warning> empty rule without %empty [<warning>-Wempty-rule</warning>]
 a: <warning>{		}</warning>
@@ -128,6 +143,21 @@ input.y:12.4-17: <warning>warning:</warning> empty rule without %empty [<warning
 input.y:13.4-17: <warning>warning:</warning> empty rule without %empty [<warning>-Wempty-rule</warning>]
 c: <warning>{------------}</warning>
    <warning>^~~~~~~~~~~~~~</warning>
+input.y:14.4-17: <warning>warning:</warning> empty rule without %empty [<warning>-Wempty-rule</warning>]
+ d: <warning>{éééééééééééé}</warning>
+    <warning>^~~~~~~~~~~~~~</warning>
+input.y:15.4-17: <warning>warning:</warning> empty rule without %empty [<warning>-Wempty-rule</warning>]
+ e: <warning>{∇⃗×𝐸⃗ = -∂𝐵⃗/∂t}</warning>
+    <warning>^~~~~~~~~~~~~~</warning>
+input.y:16.4-17: <warning>warning:</warning> empty rule without %empty [<warning>-Wempty-rule</warning>]
+ f: <warning>{	42	}</warning>
+    <warning>^~~~~~~~~~~~~~</warning>
+input.y:17.4-17: <warning>warning:</warning> empty rule without %empty [<warning>-Wempty-rule</warning>]
+ g: <warning>{	"฿¥$€₦"	}</warning>
+    <warning>^~~~~~~~~~~~~~</warning>
+input.y:18.4-17: <warning>warning:</warning> empty rule without %empty [<warning>-Wempty-rule</warning>]
+ h: <warning>{	🐃	}</warning>
+    <warning>^~~~~~~~~~~~~~</warning>
 ]])