diagnostics: beware of zero-width characters

Currently we rely on the (visual) width of characters to decide where
to open and close the styling of the quoted lines.  This breaks when
we deal with zero-width characters: we cannot just rely on (visual)
columns, we need to know whether we are before, inside, or after the
highlighted portion.

* src/location.c (location_caret): col_end: no longer add 1, "regular"
characters have a width of 1, only 0-width characters have 0-width.
opened: replace with 'state', a three-valued enum.
Don't reopen the style if we already did.
* tests/diagnostics.at (Zero-width characters): New.
This commit is contained in:
Akim Demaille
2020-03-07 12:59:09 +01:00
parent e21ff47f5d
commit b638603477
3 changed files with 45 additions and 23 deletions

View File

@@ -421,12 +421,14 @@ location_caret (location loc, const char *style, FILE *out)
{
/* The last column to highlight. Only the first line of
multiline locations are quoted, in which case the ending
column is the end of line. Single point locations (with
equal boundaries) denote the character that they
follow. */
int col_end
column is the end of line.
We used to work with byte offsets, and that was much
easier. However, we went back to using (visual) columns to
support truncating of long lines. */
const int col_end
= loc.start.line == loc.end.line
? loc.end.column + (loc.start.column == loc.end.column)
? loc.end.column
: caret_info.line_len;
/* Quote the file (at most the first line in the case of
multiline locations). */
@@ -436,24 +438,28 @@ location_caret (location loc, const char *style, FILE *out)
expected (maybe the file was changed since the scanner
ran), we might reach the end before we actually saw the
opening column. */
bool opened = false;
enum { before, inside, after } state = before;
while (!mb_iseof (c) && !mb_iseq (c, '\n'))
{
if (caret_info.pos.column == loc.start.column)
// We might have already opened (and even closed!) the
// style and yet have the equality of the columns if we
// just saw zero-width characters.
if (state == before
&& caret_info.pos.column == loc.start.column)
{
begin_use_class (style, out);
opened = true;
state = inside;
}
if (skip < caret_info.pos.column)
mb_putc (c, out);
boundary_compute (&caret_info.pos, mb_ptr (c), mb_len (c));
caret_getc (c);
if (opened
if (state == inside
&& (caret_info.pos.column == col_end
|| width < caret_info.pos.column - skip))
{
end_use_class (style, out);
opened = false;
state = after;
}
if (width < caret_info.pos.column - skip)
{
@@ -461,11 +467,11 @@ location_caret (location loc, const char *style, FILE *out)
break;
}
}
// The line is shorter than expected.
if (opened)
if (state == inside)
{
// The line is shorter than expected.
end_use_class (style, out);
opened = false;
state = after;
}
putc ('\n', out);
}

View File

@@ -42,16 +42,14 @@ typedef struct
/* If positive, the column (starting at 1) just after the boundary.
This is neither a byte count, nor a character count; it is a
column count. If this is INT_MAX, the column number has
(visual) column count. If this is INT_MAX, the column number has
overflowed.
Meaningless and not displayed if nonpositive.
*/
Meaningless and not displayed if nonpositive. */
int column;
/* If nonnegative, the byte number (starting at 0) in the current line.
Never displayed, used when printing error messages with colors to
know where colors start and end. */
/* If nonnegative, the byte number (starting at 0) in the current
line. Not displayed (unless --trace=location). */
int byte;
} boundary;

View File

@@ -37,15 +37,15 @@ AT_BISON_OPTION_PUSHDEFS
AT_DATA_GRAMMAR([[input.y]], [$2])
AT_DATA([experr], [$4])
# For some reason, literal ^M in the input are removed and don't end up
# in `input.y`.  So use the two-character sequence ^M to represent it,
# and let Perl insert real CR characters.
if grep '\^M' input.y >/dev/null; then
AT_PERL_REQUIRE([-pi -e 's{\^M}{\r}gx' input.y])
if $EGREP ['\^M|\\[0-9][0-9][0-9]'] input.y experr >/dev/null; then
AT_PERL_REQUIRE([-pi -e 's{\^M}{\r}g;s{\\(\d{3}|.)}{$v = $[]1; $v =~ /\A\d+\z/ ? chr($v) : $v}ge' input.y experr])
fi
AT_DATA([experr], [$4])
AT_CHECK([LC_ALL="$locale" $5 bison -fcaret --color=debug -Wall input.y], [$3], [], [experr])
# When no style, same messages, but without style.
@@ -193,6 +193,24 @@ input.y:12.8-10: previous declaration
]])
## ----------------------- ##
## Zero-width characters. ##
## ----------------------- ##
# We used to open the styling twice for characters that have zero
# width on display (e.g., \005).
AT_TEST([[Zero-width characters]],
[[%%
exp: an\005error.
]],
[1],
[[input.y:10.8: <error>error:</error> invalid character: '\\005'
10 | exp: an<error>\005</error>error.
| <error>^</error>
]])
## -------------------------------------- ##
## Tabulations and multibyte characters. ##
## -------------------------------------- ##