[packages/grep] - rel 2; from upstream: be less picky about encoding errors

arekm arekm at pld-linux.org
Tue Jan 5 22:31:30 CET 2016


commit fccf3f8906a337debeee952c732415d24536730d
Author: Arkadiusz Miśkiewicz <arekm at maven.pl>
Date:   Tue Jan 5 22:31:16 2016 +0100

    - rel 2; from upstream: be less picky about encoding errors

 grep-enc-errors.patch | 918 ++++++++++++++++++++++++++++++++++++++++++++++++++
 grep.spec             |   6 +-
 2 files changed, 923 insertions(+), 1 deletion(-)
---
diff --git a/grep.spec b/grep.spec
index 5455cc5..23ddf46 100644
--- a/grep.spec
+++ b/grep.spec
@@ -15,7 +15,7 @@ Summary(tr.UTF-8):	Dosyalarda katar arama aracı
 Summary(uk.UTF-8):	Утиліти пошуку по шаблонам GNU grep
 Name:		grep
 Version:	2.22
-Release:	1
+Release:	2
 Epoch:		2
 License:	GPL v3+
 Group:		Applications/Text
@@ -25,6 +25,7 @@ Source1:	http://www.mif.pg.gda.pl/homepages/ankry/man-PLD/%{name}-non-english-ma
 # Source1-md5:	1b5e726d0bee53e898531de4a76ad290
 Patch0:		%{name}-info.patch
 Patch1:		%{name}-pl.po-update.patch
+Patch2:		grep-enc-errors.patch
 URL:		http://www.gnu.org/software/grep/grep.html
 BuildRequires:	autoconf >= 2.59
 BuildRequires:	automake >= 1:1.11
@@ -90,6 +91,9 @@ kullanılır.
 %patch0 -p1
 %patch1 -p1
 
+%patch2 -p1
+chmod +x tests/encoding-error tests/unibyte-binary
+
 %{__rm} po/stamp-po
 
 %build
diff --git a/grep-enc-errors.patch b/grep-enc-errors.patch
new file mode 100644
index 0000000..212740a
--- /dev/null
+++ b/grep-enc-errors.patch
@@ -0,0 +1,918 @@
+From 8521001643bc6a28c760552824eaea5ecee0aa8c Mon Sep 17 00:00:00 2001
+From: Paul Eggert <eggert at cs.ucla.edu>
+Date: Thu, 31 Dec 2015 03:10:14 +0000
+Subject: grep: be less picky about encoding errors
+
+This fixes a longstanding problem introduced in grep 2.21,
+which is overly picky about binary files.
+* NEWS:
+* doc/grep.texi (File and Directory Selection): Document this.
+* src/grep.c (input_textbin, textbin_is_binary, buffer_textbin)
+(file_textbin):
+Remove.  All uses removed.
+(encoding_error_output): New static var.
+(buf_has_encoding_errors, buf_has_nulls, file_must_have_nulls):
+New functions, which reuse bits
+and pieces of the removed functions.
+(lastout, print_line_head, print_line_middle, print_line_tail, prline)
+(prpending, prtext, grepbuf):
+Avoid use of const, now that we have
+functions that require modifying a sentinel.
+(print_line_head): New arg LEN.  All uses changed.
+(print_line_head, print_line_tail):
+Return indicator whether the output line was printed.
+All uses changed.
+(print_line_middle): Exit early on encoding error.
+(grep): Use new method for determining whether file is binary.
+* src/grep.h (enum textbin, TEXTBIN_BINARY, TEXTBIN_UNKNOWN)
+(TEXTBIN_TEXT, input_textbin): Remove decls.  All uses removed.
+* src/pcresearch.c (Pexecute): Remove multiline optimization,
+since the main program no longer checks for encoding errors on input.
+* tests/encoding-error: New file.
+* tests/Makefile.am (TESTS): Add it.
+---
+diff --git a/doc/grep.texi b/doc/grep.texi
+index 76c7f46..58e7f48 100644
+--- a/doc/grep.texi
++++ b/doc/grep.texi
+@@ -596,13 +596,13 @@ If a file's allocation metadata,
+ or if its data read before a line is selected for output,
+ indicate that the file contains binary data,
+ assume that the file is of type @var{type}.
+-Non-text bytes indicate binary data; these are either data bytes
+-improperly encoded for the current locale, or null bytes when the
++Non-text bytes indicate binary data; these are either output bytes that are
++improperly encoded for the current locale, or null input bytes when the
+ @option{-z} (@option{--null-data}) option is not given (@pxref{Other
+ Options}).
+ 
+-By default, @var{type} is @samp{binary},
+-and @command{grep} normally outputs either
++By default, @var{type} is @samp{binary}, and when @command{grep}
++discovers that a file is binary it normally outputs either
+ a one-line message saying that a binary file matches,
+ or no message if there is no match.
+ When processing binary data, @command{grep} may treat non-text bytes
+@@ -611,7 +611,8 @@ not match a null byte, as the null byte might be treated as a line
+ terminator even without the @option{-z} (@option{--null-data}) option.
+ 
+ If @var{type} is @samp{without-match},
+-@command{grep} assumes that a binary file does not match;
++when @command{grep} discovers that a file is binary
++it assumes that the rest of the file does not match;
+ this is equivalent to the @option{-I} option.
+ 
+ If @var{type} is @samp{text},
+diff --git a/src/grep.c b/src/grep.c
+index 19ba208..e059a46 100644
+--- a/src/grep.c
++++ b/src/grep.c
+@@ -377,7 +377,6 @@ bool match_icase;
+ bool match_words;
+ bool match_lines;
+ char eolbyte;
+-enum textbin input_textbin;
+ 
+ static char const *matcher;
+ 
+@@ -389,6 +388,10 @@ static bool omit_dot_slash;
+ static bool errseen;
+ static bool write_error_seen;
+ 
++/* True if output from the current input file has been suppressed
++   because an output line had an encoding error.  */
++static bool encoding_error_output;
++
+ enum directories_type
+   {
+     READ_DIRECTORIES = 2,
+@@ -481,12 +484,6 @@ clean_up_stdout (void)
+     close_stdout ();
+ }
+ 
+-static bool
+-textbin_is_binary (enum textbin textbin)
+-{
+-  return textbin < TEXTBIN_UNKNOWN;
+-}
+-
+ /* The high-order bit of a byte.  */
+ enum { HIBYTE = 0x80 };
+ 
+@@ -551,58 +548,60 @@ skip_easy_bytes (char const *buf)
+   return p;
+ }
+ 
+-/* Return the text type of data in BUF, of size SIZE.
++/* Return true if BUF, of size SIZE, has an encoding error.
+    BUF must be followed by at least sizeof (uword) bytes,
+-   which may be arbitrarily written to or read from.  */
+-static enum textbin
+-buffer_textbin (char *buf, size_t size)
++   the first of which may be modified.  */
++static bool
++buf_has_encoding_errors (char *buf, size_t size)
+ {
+-  if (eolbyte && memchr (buf, '\0', size))
+-    return TEXTBIN_BINARY;
++  if (MB_CUR_MAX <= 1)
++    return false;
+ 
+-  if (1 < MB_CUR_MAX)
+-    {
+-      mbstate_t mbs = { 0 };
+-      size_t clen;
+-      char const *p;
++  mbstate_t mbs = { 0 };
++  size_t clen;
+ 
+-      buf[size] = -1;
+-      for (p = buf; (p = skip_easy_bytes (p)) < buf + size; p += clen)
+-        {
+-          clen = mbrlen (p, buf + size - p, &mbs);
+-          if ((size_t) -2 <= clen)
+-            return clen == (size_t) -2 ? TEXTBIN_UNKNOWN : TEXTBIN_BINARY;
+-        }
++  buf[size] = -1;
++  for (char const *p = buf; (p = skip_easy_bytes (p)) < buf + size; p += clen)
++    {
++      clen = mbrlen (p, buf + size - p, &mbs);
++      if ((size_t) -2 <= clen)
++        return true;
+     }
+ 
+-  return TEXTBIN_TEXT;
++  return false;
+ }
+ 
+-/* Return the text type of a file.  BUF, of size SIZE, is the initial
+-   buffer read from the file with descriptor FD and status ST.
+-   BUF must be followed by at least sizeof (uword) bytes,
++
++/* Return true if BUF, of size SIZE, has a null byte.
++   BUF must be followed by at least one byte,
+    which may be arbitrarily written to or read from.  */
+-static enum textbin
+-file_textbin (char *buf, size_t size, int fd, struct stat const *st)
++static bool
++buf_has_nulls (char *buf, size_t size)
+ {
+-  enum textbin textbin = buffer_textbin (buf, size);
+-  if (textbin_is_binary (textbin))
+-    return textbin;
++  buf[size] = 0;
++  return strlen (buf) != size;
++}
+ 
++/* Return true if a file is known to contain null bytes.
++   SIZE bytes have already been read from the file
++   with descriptor FD and status ST.  */
++static bool
++file_must_have_nulls (size_t size, int fd, struct stat const *st)
++{
+   if (usable_st_size (st))
+     {
+       if (st->st_size <= size)
+-        return textbin == TEXTBIN_UNKNOWN ? TEXTBIN_BINARY : textbin;
++        return false;
+ 
+       /* If the file has holes, it must contain a null byte somewhere.  */
+-      if (SEEK_HOLE != SEEK_SET && eolbyte)
++      if (SEEK_HOLE != SEEK_SET)
+         {
+           off_t cur = size;
+           if (O_BINARY || fd == STDIN_FILENO)
+             {
+               cur = lseek (fd, 0, SEEK_CUR);
+               if (cur < 0)
+-                return TEXTBIN_UNKNOWN;
++                return false;
+             }
+ 
+           /* Look for a hole after the current location.  */
+@@ -612,12 +611,12 @@ file_textbin (char *buf, size_t size, int fd, struct stat const *st)
+               if (lseek (fd, cur, SEEK_SET) < 0)
+                 suppressible_error (filename, errno);
+               if (hole_start < st->st_size)
+-                return TEXTBIN_BINARY;
++                return true;
+             }
+         }
+     }
+ 
+-  return TEXTBIN_UNKNOWN;
++  return false;
+ }
+ 
+ /* Convert STR to a nonnegative integer, storing the result in *OUT.
+@@ -899,7 +898,7 @@ static char *label = NULL;      /* Fake filename for stdin */
+ /* Internal variables to keep track of byte count, context, etc. */
+ static uintmax_t totalcc;	/* Total character count before bufbeg. */
+ static char const *lastnl;	/* Pointer after last newline counted. */
+-static char const *lastout;	/* Pointer after last character output;
++static char *lastout;		/* Pointer after last character output;
+                                    NULL if no character has been output
+                                    or if it's conceptually before bufbeg. */
+ static intmax_t outleft;	/* Maximum number of lines to be output.  */
+@@ -971,10 +970,31 @@ print_offset (uintmax_t pos, int min_width, const char *color)
+   pr_sgr_end_if (color);
+ }
+ 
+-/* Print a whole line head (filename, line, byte).  */
+-static void
+-print_line_head (char const *beg, char const *lim, char sep)
++/* Print a whole line head (filename, line, byte).  The output data
++   starts at BEG and contains LEN bytes; it is followed by at least
++   sizeof (uword) bytes, the first of which may be temporarily modified.
++   The output data comes from what is perhaps a larger input line that
++   goes until LIM, where LIM[-1] is an end-of-line byte.  Use SEP as
++   the separator on output.
++
++   Return true unless the line was suppressed due to an encoding error.  */
++
++static bool
++print_line_head (char *beg, size_t len, char const *lim, char sep)
+ {
++  bool encoding_errors = false;
++  if (binary_files != TEXT_BINARY_FILES)
++    {
++      char ch = beg[len];
++      encoding_errors = buf_has_encoding_errors (beg, len);
++      beg[len] = ch;
++    }
++  if (encoding_errors)
++    {
++      encoding_error_output = done_on_match = out_quiet = true;
++      return false;
++    }
++
+   bool pending_sep = false;
+ 
+   if (out_file)
+@@ -1021,22 +1041,27 @@ print_line_head (char const *beg, char const *lim, char sep)
+ 
+       print_sep (sep);
+     }
++
++  return true;
+ }
+ 
+-static const char *
+-print_line_middle (const char *beg, const char *lim,
++static char *
++print_line_middle (char *beg, char *lim,
+                    const char *line_color, const char *match_color)
+ {
+   size_t match_size;
+   size_t match_offset;
+-  const char *cur = beg;
+-  const char *mid = NULL;
+-
+-  while (cur < lim
+-         && ((match_offset = execute (beg, lim - beg, &match_size, cur))
+-             != (size_t) -1))
++  char *cur = beg;
++  char *mid = NULL;
++  char *b;
++
++  for (cur = beg;
++       (cur < lim
++        && ((match_offset = execute (beg, lim - beg, &match_size, cur))
++            != (size_t) -1));
++       cur = b + match_size)
+     {
+-      char const *b = beg + match_offset;
++      b = beg + match_offset;
+ 
+       /* Avoid matching the empty line at the end of the buffer. */
+       if (b == lim)
+@@ -1056,8 +1081,11 @@ print_line_middle (const char *beg, const char *lim,
+           /* This function is called on a matching line only,
+              but is it selected or rejected/context?  */
+           if (only_matching)
+-            print_line_head (b, lim, (out_invert ? SEP_CHAR_REJECTED
+-                                      : SEP_CHAR_SELECTED));
++            {
++              char sep = out_invert ? SEP_CHAR_REJECTED : SEP_CHAR_SELECTED;
++              if (! print_line_head (b, match_size, lim, sep))
++                return NULL;
++            }
+           else
+             {
+               pr_sgr_start (line_color);
+@@ -1075,7 +1103,6 @@ print_line_middle (const char *beg, const char *lim,
+           if (only_matching)
+             fputs ("\n", stdout);
+         }
+-      cur = b + match_size;
+     }
+ 
+   if (only_matching)
+@@ -1086,8 +1113,8 @@ print_line_middle (const char *beg, const char *lim,
+   return cur;
+ }
+ 
+-static const char *
+-print_line_tail (const char *beg, const char *lim, const char *line_color)
++static char *
++print_line_tail (char *beg, const char *lim, const char *line_color)
+ {
+   size_t eol_size;
+   size_t tail_size;
+@@ -1108,14 +1135,15 @@ print_line_tail (const char *beg, const char *lim, const char *line_color)
+ }
+ 
+ static void
+-prline (char const *beg, char const *lim, char sep)
++prline (char *beg, char *lim, char sep)
+ {
+   bool matching;
+   const char *line_color;
+   const char *match_color;
+ 
+   if (!only_matching)
+-    print_line_head (beg, lim, sep);
++    if (! print_line_head (beg, lim - beg - 1, lim, sep))
++      return;
+ 
+   matching = (sep == SEP_CHAR_SELECTED) ^ out_invert;
+ 
+@@ -1135,7 +1163,11 @@ prline (char const *beg, char const *lim, char sep)
+     {
+       /* We already know that non-matching lines have no match (to colorize). */
+       if (matching && (only_matching || *match_color))
+-        beg = print_line_middle (beg, lim, line_color, match_color);
++        {
++          beg = print_line_middle (beg, lim, line_color, match_color);
++          if (! beg)
++            return;
++        }
+ 
+       if (!only_matching && *line_color)
+         {
+@@ -1169,7 +1201,7 @@ prpending (char const *lim)
+     lastout = bufbeg;
+   while (pending > 0 && lastout < lim)
+     {
+-      char const *nl = memchr (lastout, eolbyte, lim - lastout);
++      char *nl = memchr (lastout, eolbyte, lim - lastout);
+       size_t match_size;
+       --pending;
+       if (outleft
+@@ -1184,7 +1216,7 @@ prpending (char const *lim)
+ 
+ /* Output the lines between BEG and LIM.  Deal with context.  */
+ static void
+-prtext (char const *beg, char const *lim)
++prtext (char *beg, char *lim)
+ {
+   static bool used;	/* Avoid printing SEP_STR_GROUP before any output.  */
+   char eol = eolbyte;
+@@ -1192,7 +1224,7 @@ prtext (char const *beg, char const *lim)
+   if (!out_quiet && pending > 0)
+     prpending (beg);
+ 
+-  char const *p = beg;
++  char *p = beg;
+ 
+   if (!out_quiet)
+     {
+@@ -1218,7 +1250,7 @@ prtext (char const *beg, char const *lim)
+ 
+       while (p < beg)
+         {
+-          char const *nl = memchr (p, eol, beg - p);
++          char *nl = memchr (p, eol, beg - p);
+           nl++;
+           prline (p, nl, SEP_CHAR_REJECTED);
+           p = nl;
+@@ -1231,7 +1263,7 @@ prtext (char const *beg, char const *lim)
+       /* One or more lines are output.  */
+       for (n = 0; p < lim && n < outleft; n++)
+         {
+-          char const *nl = memchr (p, eol, lim - p);
++          char *nl = memchr (p, eol, lim - p);
+           nl++;
+           if (!out_quiet)
+             prline (p, nl, SEP_CHAR_SELECTED);
+@@ -1278,13 +1310,12 @@ zap_nuls (char *p, char *lim, char eol)
+    between matching lines if OUT_INVERT is true).  Return a count of
+    lines printed.  Replace all NUL bytes with NUL_ZAPPER as we go.  */
+ static intmax_t
+-grepbuf (char const *beg, char const *lim)
++grepbuf (char *beg, char const *lim)
+ {
+   intmax_t outleft0 = outleft;
+-  char const *p;
+-  char const *endp;
++  char *endp;
+ 
+-  for (p = beg; p < lim; p = endp)
++  for (char *p = beg; p < lim; p = endp)
+     {
+       size_t match_size;
+       size_t match_offset = execute (p, lim - p, &match_size, NULL);
+@@ -1295,15 +1326,15 @@ grepbuf (char const *beg, char const *lim)
+           match_offset = lim - p;
+           match_size = 0;
+         }
+-      char const *b = p + match_offset;
++      char *b = p + match_offset;
+       endp = b + match_size;
+       /* Avoid matching the empty line at the end of the buffer. */
+       if (!out_invert && b == lim)
+         break;
+       if (!out_invert || p < b)
+         {
+-          char const *prbeg = out_invert ? p : b;
+-          char const *prend = out_invert ? b : endp;
++          char *prbeg = out_invert ? p : b;
++          char *prend = out_invert ? b : endp;
+           prtext (prbeg, prend);
+           if (!outleft || done_on_match)
+             {
+@@ -1324,7 +1355,6 @@ static intmax_t
+ grep (int fd, struct stat const *st)
+ {
+   intmax_t nlines, i;
+-  enum textbin textbin;
+   size_t residue, save;
+   char oldc;
+   char *beg;
+@@ -1333,6 +1363,7 @@ grep (int fd, struct stat const *st)
+   char nul_zapper = '\0';
+   bool done_on_match_0 = done_on_match;
+   bool out_quiet_0 = out_quiet;
++  bool has_nulls = false;
+ 
+   if (! reset (fd, st))
+     return 0;
+@@ -1344,6 +1375,7 @@ grep (int fd, struct stat const *st)
+   after_last_match = 0;
+   pending = 0;
+   skip_nuls = skip_empty_lines && !eol;
++  encoding_error_output = false;
+   seek_data_failed = false;
+ 
+   nlines = 0;
+@@ -1356,26 +1388,20 @@ grep (int fd, struct stat const *st)
+       return 0;
+     }
+ 
+-  if (binary_files == TEXT_BINARY_FILES)
+-    textbin = TEXTBIN_TEXT;
+-  else
++  for (bool firsttime = true; ; firsttime = false)
+     {
+-      textbin = file_textbin (bufbeg, buflim - bufbeg, fd, st);
+-      if (textbin_is_binary (textbin))
++      if (!has_nulls && eol && binary_files != TEXT_BINARY_FILES
++          && (buf_has_nulls (bufbeg, buflim - bufbeg)
++              || (firsttime && file_must_have_nulls (buflim - bufbeg, fd, st))))
+         {
++          has_nulls = true;
+           if (binary_files == WITHOUT_MATCH_BINARY_FILES)
+             return 0;
+           done_on_match = out_quiet = true;
+           nul_zapper = eol;
+           skip_nuls = skip_empty_lines;
+         }
+-      else if (execute != Pexecute)
+-        textbin = TEXTBIN_TEXT;
+-    }
+ 
+-  for (;;)
+-    {
+-      input_textbin = textbin;
+       lastnl = bufbeg;
+       if (lastout)
+         lastout = bufbeg;
+@@ -1426,13 +1452,8 @@ grep (int fd, struct stat const *st)
+         }
+ 
+       /* Detect whether leading context is adjacent to previous output.  */
+-      if (lastout)
+-        {
+-          if (textbin == TEXTBIN_UNKNOWN)
+-            textbin = TEXTBIN_TEXT;
+-          if (beg != lastout)
+-            lastout = 0;
+-        }
++      if (beg != lastout)
++        lastout = 0;
+ 
+       /* Handle some details and read more data to scan.  */
+       save = residue + lim - beg;
+@@ -1445,22 +1466,6 @@ grep (int fd, struct stat const *st)
+           suppressible_error (filename, errno);
+           goto finish_grep;
+         }
+-
+-      /* If the file's textbin has not been determined yet, assume
+-         it's binary if the next input buffer suggests so.  */
+-      if (textbin == TEXTBIN_UNKNOWN)
+-        {
+-          enum textbin tb = buffer_textbin (bufbeg, buflim - bufbeg);
+-          if (textbin_is_binary (tb))
+-            {
+-              if (binary_files == WITHOUT_MATCH_BINARY_FILES)
+-                return 0;
+-              textbin = tb;
+-              done_on_match = out_quiet = true;
+-              nul_zapper = eol;
+-              skip_nuls = skip_empty_lines;
+-            }
+-        }
+     }
+   if (residue)
+     {
+@@ -1474,7 +1479,7 @@ grep (int fd, struct stat const *st)
+  finish_grep:
+   done_on_match = done_on_match_0;
+   out_quiet = out_quiet_0;
+-  if (textbin_is_binary (textbin) && !out_quiet && nlines != 0)
++  if ((has_nulls || encoding_error_output) && !out_quiet && nlines != 0)
+     printf (_("Binary file %s matches\n"), filename);
+   return nlines;
+ }
+diff --git a/src/grep.h b/src/grep.h
+index 580eb11..2e4527c 100644
+--- a/src/grep.h
++++ b/src/grep.h
+@@ -29,22 +29,4 @@ extern bool match_words;	/* -w */
+ extern bool match_lines;	/* -x */
+ extern char eolbyte;		/* -z */
+ 
+-/* An enum textbin describes the file's type, inferred from data read
+-   before the first line is selected for output.  */
+-enum textbin
+-  {
+-    /* Binary, as it contains null bytes and the -z option is not in effect,
+-       or it contains encoding errors.  */
+-    TEXTBIN_BINARY = -1,
+-
+-    /* Not known yet.  Only text has been seen so far.  */
+-    TEXTBIN_UNKNOWN = 0,
+-
+-    /* Text.  */
+-    TEXTBIN_TEXT = 1
+-  };
+-
+-/* Input file type.  */
+-extern enum textbin input_textbin;
+-
+ #endif
+diff --git a/src/pcresearch.c b/src/pcresearch.c
+index dc68345..c403032 100644
+--- a/src/pcresearch.c
++++ b/src/pcresearch.c
+@@ -194,32 +194,13 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
+      error.  */
+   char const *subject = buf;
+ 
+-  /* If the input type is unknown, the caller is still testing the
+-     input, which means the current buffer cannot contain encoding
+-     errors and a multiline search is typically more efficient.
+-     Otherwise, a single-line search is typically faster, so that
+-     pcre_exec doesn't waste time validating the entire input
+-     buffer.  */
+-  bool multiline = input_textbin == TEXTBIN_UNKNOWN;
+-
+   for (; p < buf + size; p = line_start = line_end + 1)
+     {
+-      bool too_big;
+-
+-      if (multiline)
+-        {
+-          size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
+-          size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
+-          line_end = memrchr (p, eolbyte, scan_size);
+-          too_big = ! line_end;
+-        }
+-      else
+-        {
+-          line_end = memchr (p, eolbyte, buf + size - p);
+-          too_big = INT_MAX < line_end - p;
+-        }
+-
+-      if (too_big)
++      /* A single-line search is typically faster, so that
++         pcre_exec doesn't waste time validating the entire input
++         buffer.  */
++      line_end = memchr (p, eolbyte, buf + size - p);
++      if (INT_MAX < line_end - p)
+         error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
+ 
+       for (;;)
+@@ -247,27 +228,11 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
+           int options = 0;
+           if (!bol)
+             options |= PCRE_NOTBOL;
+-          if (multiline)
+-            options |= PCRE_NO_UTF8_CHECK;
+ 
+           e = jit_exec (subject, line_end - subject, search_offset,
+                         options, sub);
+           if (e != PCRE_ERROR_BADUTF8)
+-            {
+-              if (0 < e && multiline && sub[1] - sub[0] != 0)
+-                {
+-                  char const *nl = memchr (subject + sub[0], eolbyte,
+-                                           sub[1] - sub[0]);
+-                  if (nl)
+-                    {
+-                      /* This match crosses a line boundary; reject it.  */
+-                      p = subject + sub[0];
+-                      line_end = nl;
+-                      continue;
+-                    }
+-                }
+-              break;
+-            }
++            break;
+           int valid_bytes = sub[0];
+ 
+           /* Try to match the string before the encoding error.  */
+@@ -339,15 +304,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
+           beg = matchbeg;
+           end = matchend;
+         }
+-      else if (multiline)
+-        {
+-          char const *prev_nl = memrchr (line_start - 1, eolbyte,
+-                                         matchbeg - (line_start - 1));
+-          char const *next_nl = memchr (matchend, eolbyte,
+-                                        line_end + 1 - matchend);
+-          beg = prev_nl + 1;
+-          end = next_nl + 1;
+-        }
+       else
+         {
+           beg = line_start;
+diff --git a/tests/Makefile.am b/tests/Makefile.am
+index 37bb501..f1b8c43 100644
+--- a/tests/Makefile.am
++++ b/tests/Makefile.am
+@@ -70,6 +70,7 @@ TESTS =						\
+   empty						\
+   empty-line					\
+   empty-line-mb					\
++  encoding-error				\
+   epipe						\
+   equiv-classes					\
+   ere						\
+diff --git a/tests/encoding-error b/tests/encoding-error
+new file mode 100755
+index 0000000..fe52de2
+--- a/dev/null
++++ b/tests/encoding-error
+@@ -0,0 +1,41 @@
++#! /bin/sh
++# Test grep's behavior on encoding errors.
++#
++# Copyright 2015 Free Software Foundation, Inc.
++#
++# Copying and distribution of this file, with or without modification,
++# are permitted in any medium without royalty provided the copyright
++# notice and this notice are preserved.
++
++. "${srcdir=.}/init.sh"; path_prepend_ ../src
++
++require_en_utf8_locale_
++
++LC_ALL=en_US.UTF-8
++export LC_ALL
++
++printf 'Alfred Jones\n' > a || framework_failure_
++printf 'John Smith\n' >j || framework_failure_
++printf 'Pedro P\xe9rez\n' >p || framework_failure_
++cat a p j >in || framework_failure_
++
++fail=0
++
++grep '^A' in >out || fail=1
++compare a out || fail=1
++
++grep '^P' in >out || fail=1
++printf 'Binary file in matches\n' >exp || framework_failure_
++compare exp out || fail=1
++
++grep '^J' in >out || fail=1
++compare j out || fail=1
++
++grep '^X' in >out
++test $? = 1 || fail=1
++compare /dev/null out || fail=1
++
++grep -a . in >out || fail=1
++compare in out
++
++Exit $fail
+--
+cgit v0.9.0.2
+From 40ed879db22d57516a31fefd1c39416974b74ec4 Mon Sep 17 00:00:00 2001
+From: Paul Eggert <eggert at cs.ucla.edu>
+Date: Sat, 02 Jan 2016 05:16:12 +0000
+Subject: grep: fix bug with invalid unibyte sequence
+
+This was introduced by the recent binary-data-detection changes.
+Problem reported by Norihiro Tanaka in: http://bugs.gnu.org/20526#86
+* src/grep.c (HIBYTE, easy_encoding, init_easy_encoding): Remove,
+replacing with ...
+(uword_max, unibyte_mask, initialize_unibyte_mask): ... this new
+constant, static var, and function.  All uses changed.  The
+unibyte_mask var generalizes the old local var hibyte_mask, which
+worked only for encodings where every byte with 0x80 turned off is
+a single-byte character.
+(buf_has_encoding_errors): Return false immediately if
+unibyte_mask is zero, not whether the current encoding is unibyte.
+The old test was incorrect in unibyte locales in which some bytes
+were encoding errors.
+* tests/pcre-z: Require UTF-8 locale, since the grep -z . test now
+needs this.  Use printf \0 rather than tr.  Port the 'grep -z .'
+test to platforms where the C locale says '\200' is an encoding
+error.  Use cmp rather than compare, as the file is binary and
+so non-GNU diff might not work.
+* tests/unibyte-binary: New file.
+* tests/Makefile.am (TESTS): Add it.
+---
+diff --git a/src/grep.c b/src/grep.c
+index 1207a76..a5f1fa2 100644
+--- a/src/grep.c
++++ b/src/grep.c
+@@ -484,21 +484,6 @@ clean_up_stdout (void)
+     close_stdout ();
+ }
+ 
+-/* The high-order bit of a byte.  */
+-enum { HIBYTE = 0x80 };
+-
+-/* True if every byte with HIBYTE off is a single-byte character.
+-   UTF-8 has this property.  */
+-static bool easy_encoding;
+-
+-static void
+-init_easy_encoding (void)
+-{
+-  easy_encoding = true;
+-  for (int i = 0; i < HIBYTE; i++)
+-    easy_encoding &= mbclen_cache[i] == 1;
+-}
+-
+ /* A cast to TYPE of VAL.  Use this when TYPE is a pointer type, VAL
+    is properly aligned for TYPE, and 'gcc -Wcast-align' cannot infer
+    the alignment and would otherwise complain about the cast.  */
+@@ -517,21 +502,33 @@ init_easy_encoding (void)
+ /* An unsigned type suitable for fast matching.  */
+ typedef uintmax_t uword;
+ 
++/* All bytes that are not unibyte characters, ANDed together, and then
++   with the pattern repeated to fill a uword.  For an encoding where
++   all bytes are unibyte characters, this is 0.  For UTF-8, this is
++   0x808080....  For encodings where unibyte characters have no useful
++   pattern, this is all 1s.  The unsigned char C is a unibyte
++   character if C & UNIBYTE_MASK is zero.  If the uword W is the
++   concatenation of bytes, the bytes are all unibyte characters
++   if W & UNIBYTE_MASK is zero.  */
++static uword unibyte_mask;
++
++static void
++initialize_unibyte_mask (void)
++{
++  unsigned char mask = UCHAR_MAX;
++  for (int i = 1; i <= UCHAR_MAX; i++)
++    if (mbclen_cache[i] != 1)
++      mask &= i;
++  uword uword_max = -1;
++  unibyte_mask = uword_max / UCHAR_MAX * mask;
++}
++
+ /* Skip the easy bytes in a buffer that is guaranteed to have a sentinel
+    that is not easy, and return a pointer to the first non-easy byte.
+-   In easy encodings, the easy bytes all have HIBYTE off.
+-   In other encodings, no byte is easy.  */
++   The easy bytes all have UNIBYTE_MASK off.  */
+ static char const * _GL_ATTRIBUTE_PURE
+ skip_easy_bytes (char const *buf)
+ {
+-  if (!easy_encoding)
+-    return buf;
+-
+-  uword uword_max = -1;
+-
+-  /* 0x8080..., extended to be wide enough for uword.  */
+-  uword hibyte_mask = uword_max / UCHAR_MAX * HIBYTE;
+-
+   /* Search a byte at a time until the pointer is aligned, then a
+      uword at a time until a match is found, then a byte at a time to
+      identify the exact byte.  The uword search may go slightly past
+@@ -539,11 +536,11 @@ skip_easy_bytes (char const *buf)
+   char const *p;
+   uword const *s;
+   for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++)
+-    if (*p & HIBYTE)
++    if (to_uchar (*p) & unibyte_mask)
+       return p;
+-  for (s = CAST_ALIGNED (uword const *, p); ! (*s & hibyte_mask); s++)
++  for (s = CAST_ALIGNED (uword const *, p); ! (*s & unibyte_mask); s++)
+     continue;
+-  for (p = (char const *) s; ! (*p & HIBYTE); p++)
++  for (p = (char const *) s; ! (to_uchar (*p) & unibyte_mask); p++)
+     continue;
+   return p;
+ }
+@@ -554,7 +551,7 @@ skip_easy_bytes (char const *buf)
+ static bool
+ buf_has_encoding_errors (char *buf, size_t size)
+ {
+-  if (MB_CUR_MAX <= 1)
++  if (! unibyte_mask)
+     return false;
+ 
+   mbstate_t mbs = { 0 };
+@@ -2592,7 +2589,7 @@ main (int argc, char **argv)
+     usage (EXIT_TROUBLE);
+ 
+   build_mbclen_cache ();
+-  init_easy_encoding ();
++  initialize_unibyte_mask ();
+ 
+   /* In a unibyte locale, switch from fgrep to grep if
+      the pattern matches words (where grep is typically faster).
+diff --git a/tests/Makefile.am b/tests/Makefile.am
+index f349aa3..a38303c 100644
+--- a/tests/Makefile.am
++++ b/tests/Makefile.am
+@@ -133,6 +133,7 @@ TESTS =						\
+   turkish-I-without-dot				\
+   turkish-eyes					\
+   two-files					\
++  unibyte-binary				\
+   unibyte-bracket-expr				\
+   unibyte-negated-circumflex			\
+   utf8-bracket					\
+diff --git a/tests/pcre-z b/tests/pcre-z
+index 6bbde94..4ce9a93 100755
+--- a/tests/pcre-z
++++ b/tests/pcre-z
+@@ -2,10 +2,11 @@
+ # Test Perl regex with NUL-separated input
+ . "${srcdir=.}/init.sh"; path_prepend_ ../src
+ require_pcre_
++require_en_utf8_locale_
+ 
+ REGEX=a
+ 
+-printf "%s\n0" abc def ghi aaa gah | tr 0 \\0 > in
++printf '%s\n\0' abc def ghi aaa gah > in || framework_failure_
+ 
+ grep -z "$REGEX" in > exp 2>err || fail_ 'Cannot do BRE (grep -z) match.'
+ compare /dev/null err || fail_ 'stderr not empty on grep -z.'
+@@ -20,8 +21,8 @@ grep -Pz "$REGEX" in > out 2>err || fail=1
+ compare exp out || fail=1
+ compare /dev/null err || fail=1
+ 
+-printf '\200\0' >in0
+-LC_ALL=C grep -z . in0 >out || fail=1
+-compare in0 out || fail=1
++printf '\303\200\0' >in0 # "À" followed by a NUL.
++LC_ALL=en_US.UTF-8 grep -z . in0 >out || fail=1
++cmp in0 out || fail=1
+ 
+ Exit $fail
+diff --git a/tests/unibyte-binary b/tests/unibyte-binary
+new file mode 100755
+index 0000000..78735b8
+--- a/dev/null
++++ b/tests/unibyte-binary
+@@ -0,0 +1,28 @@
++#!/bin/sh
++# Test binary files in unibyte locales with encoding errors
++
++# Copyright 2016 Free Software Foundation, Inc.
++
++# This program is free software: you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation, either version 3 of the License, or
++# (at your option) any later version.
++
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++# GNU General Public License for more details.
++
++# You should have received a copy of the GNU General Public License
++# along with this program.  If not, see <http://www.gnu.org/licenses/>.
++
++. "${srcdir=.}/init.sh"; path_prepend_ ../src
++require_unibyte_locale
++
++fail=0
++
++printf 'a\n\200\nb\n' >in || framework_failure_
++printf 'a\nBinary file in matches\n' >exp || framework_failure_
++grep . in >out || fail=1
++compare exp out || fail=1
++Exit $fail
+--
+cgit v0.9.0.2
================================================================

---- gitweb:

http://git.pld-linux.org/gitweb.cgi/packages/grep.git/commitdiff/fccf3f8906a337debeee952c732415d24536730d



More information about the pld-cvs-commit mailing list