From afe7dfd3b97bfbc7165d9035ba1473096ca4570e Mon Sep 17 00:00:00 2001 From: Akim Demaille Date: Sun, 21 Apr 2019 08:56:49 +0200 Subject: [PATCH] diagnostics: fix the handling of multibyte characters This is a pity: efforts were invested in computing correctly the number of screen columns consumed by multibyte characters, but the routines that do that were fed by single-byte inputs... As a consequence Bison never displayed correctly locations when there are multibyte characters. * src/scan-gram.l (mbchar): New. Use it instead of . in the catch-all clause. * tests/diagnostics.at (Tabulations): Enhance into... (Tabulations and multibyte characters): this. --- src/scan-gram.l | 16 ++++++++++++--- tests/diagnostics.at | 48 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/src/scan-gram.l b/src/scan-gram.l index c69b1b5d..96bb3ba0 100644 --- a/src/scan-gram.l +++ b/src/scan-gram.l @@ -135,11 +135,13 @@ static void unexpected_newline (boundary, char const *); %x SC_BRACKETED_ID SC_RETURN_BRACKETED_ID letter [.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_] -notletter [^.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_]{-}[%\{] id {letter}({letter}|[-0-9])* int [0-9]+ xint 0[xX][0-9abcdefABCDEF]+ + /* UTF-8 Encoded Unicode Code Point, from Flex's documentation. After \xF0 the second byte is \x90-\xBF (RFC 3629), which excludes overlong encodings. */ +mbchar [\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF]([\x80-\xBF]{2})|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF]([\x80-\xBF]{2})|[\xF1-\xF3]([\x80-\xBF]{3})|\xF4[\x80-\x8F]([\x80-\xBF]{2}) + /* Zero or more instances of backslash-newline. Following GCC, allow white space between the backslash and the newline. */ splice (\\[ \f\t\v]*\n)* @@ -790,8 +792,16 @@ eqopt ({sp}=)? | By default, grow the string obstack with the input. | `-----------------------------------------------------*/ -. 
| - \n STRING_GROW; + +{ + /* Accept multibyte characters in one block instead of byte after + byte, so that add_column_width and mbsnwidth can compute correct + screen width. + + Add a fallthrough "|." so that non UTF-8 input is still accepted + and does not jam the scanner. */ + {mbchar}|. STRING_GROW; +} %% diff --git a/tests/diagnostics.at b/tests/diagnostics.at index 606b0373..2b49debf 100644 --- a/tests/diagnostics.at +++ b/tests/diagnostics.at @@ -17,22 +17,31 @@ AT_BANNER([[Diagnostics.]]) + # AT_TEST([TITLE], [GRAMMAR], [OUTPUT-WITH-STYLE]) +# ------------------------------------------------ m4_pushdef([AT_TEST], [ AT_SETUP([$1]) AT_KEYWORDS([diagnostics]) +# We need UTF-8 support for correct screen-width computation of UTF-8 +# characters. Skip the test if 'locale -a' lists no en_US UTF-8 locale (spelled UTF-8 or utf8, in any case). +AT_SKIP_IF([! locale -a | grep -i '^en_US\.utf-*8$']) + AT_BISON_OPTION_PUSHDEFS AT_DATA_GRAMMAR([[input.y]], [$2]) AT_DATA([experr], [$3]) -AT_BISON_CHECK([-fcaret --style=debug -Wall input.y], [], [], [experr]) +# Cannot use AT_BISON_CHECK easily as we need to change the +# environment. +# FIXME: Enhance AT_BISON_CHECK. +AT_CHECK([LC_ALL=en_US.UTF-8 bison -fcaret --style=debug -Wall input.y], [], [], [experr]) -# When no style, same messages, except the style. +# When no style, same messages, but without style. AT_CHECK([perl -pi -e 's{}{}g' experr]) -AT_BISON_CHECK([-fcaret -Wall input.y], [], [], [experr]) +AT_CHECK([LC_ALL=en_US.UTF-8 bison -fcaret -Wall input.y], [], [], [experr]) AT_BISON_OPTION_POPDEFS @@ -106,18 +115,24 @@ input.y:17.2: warning: empty rule without %empty [-W ]]) -## ------------- ## -## Tabulations. ## -## ------------- ## +## -------------------------------------- ## +## Tabulations and multibyte characters. ## +## -------------------------------------- ## -# Make sure we treat tabulations as eight spaces. +# Make sure we treat tabulations as eight spaces, and that multibyte +# characters have correct width. 
-AT_TEST([[Tabulations]], +AT_TEST([[Tabulations and multibyte characters]], [[%% -exp: a b c +exp: a b c d e f g h a: { } b: { } c: {------------} +d: {éééééééééééé} +e: {∇⃗×𝐸⃗ = -∂𝐵⃗/∂t} +f: { 42 } +g: { "฿¥$€₦" } +h: { 🐃 } ]], [[input.y:11.4-17: warning: empty rule without %empty [-Wempty-rule] a: { } @@ -128,6 +143,21 @@ input.y:12.4-17: warning: empty rule without %empty [warning: empty rule without %empty [-Wempty-rule] c: {------------} ^~~~~~~~~~~~~~ +input.y:14.4-17: warning: empty rule without %empty [-Wempty-rule] + d: {éééééééééééé} + ^~~~~~~~~~~~~~ +input.y:15.4-17: warning: empty rule without %empty [-Wempty-rule] + e: {∇⃗×𝐸⃗ = -∂𝐵⃗/∂t} + ^~~~~~~~~~~~~~ +input.y:16.4-17: warning: empty rule without %empty [-Wempty-rule] + f: { 42 } + ^~~~~~~~~~~~~~ +input.y:17.4-17: warning: empty rule without %empty [-Wempty-rule] + g: { "฿¥$€₦" } + ^~~~~~~~~~~~~~ +input.y:18.4-17: warning: empty rule without %empty [-Wempty-rule] + h: { 🐃 } + ^~~~~~~~~~~~~~ ]])