SOURCES: coreutils-fmt-wchars.patch (NEW) - coreutils-fmt-wchars.p...

Tue Feb 13 16:59:09 CET 2007

Author: qrczak                       Date: Tue Feb 13 15:59:09 2007 GMT
Module: SOURCES                       Tag: HEAD
---- Log message:
- coreutils-fmt-wchars.patch: Added support for multibyte encodings
  and wcwidth. Added -n / --single-spacing option, which is like -u
  but uses a single space between sentences.
- Release 2

---- Files affected:
SOURCES:
   coreutils-fmt-wchars.patch (NONE -> 1.1)  (NEW)

---- Diffs:

================================================================
Index: SOURCES/coreutils-fmt-wchars.patch
diff -u /dev/null SOURCES/coreutils-fmt-wchars.patch:1.1

--- /dev/null	Tue Feb 13 16:59:09 2007
+++ SOURCES/coreutils-fmt-wchars.patch	Tue Feb 13 16:59:04 2007
@@ -0,0 +1,686 @@
+--- coreutils-6.7/src/fmt.c.orig	2006-10-22 18:54:15.000000000 +0200
++++ coreutils-6.7/src/fmt.c	2007-02-13 16:51:44.000000000 +0100
+@@ -18,6 +18,7 @@
+ /* Written by Ross Paterson <rap at doc.ic.ac.uk>.  */
+ 
+ #include <config.h>
++#include <wchar.h>
+ #include <stdio.h>
+ #include <sys/types.h>
+ #include <getopt.h>
+@@ -39,7 +40,7 @@
+ /* The following parameters represent the program's idea of what is
+    "best".  Adjust to taste, subject to the caveats given.  */
+ 
+-/* Default longest permitted line length (max_width).  */
++/* Default longest permitted line width (max_width).  */
+ #define WIDTH	75
+ 
+ /* Prefer lines to be LEEWAY % shorter than the maximum width, giving
+@@ -51,7 +52,7 @@
+ #define DEF_INDENT 3
+ 
+ /* Costs and bonuses are expressed as the equivalent departure from the
+-   optimal line length, multiplied by 10.  e.g. assigning something a
++   optimal line width, multiplied by 10.  e.g. assigning something a
+    cost of 50 means that it is as bad as a line 5 characters too short
+    or too long.  The definition of SHORT_COST(n) should not be changed.
+    However, EQUIV(n) may need tuning.  */
+@@ -78,11 +79,11 @@
+ #define LINE_COST	EQUIV (70)
+ 
+ /* Cost of breaking a line after the first word of a sentence, where
+-   the length of the word is N.  */
++   the width of the word is N.  */
+ #define WIDOW_COST(n)	(EQUIV (200) / ((n) + 2))
+ 
+ /* Cost of breaking a line before the last word of a sentence, where
+-   the length of the word is N.  */
++   the width of the word is N.  */
+ #define ORPHAN_COST(n)	(EQUIV (150) / ((n) + 2))
+ 
+ /* Bonus for breaking a line at the end of a sentence.  */
+@@ -114,11 +115,30 @@
+ #define MAXWORDS	1000
+ #define MAXCHARS	5000
+ 
++/* Wide character support */
++
++static wint_t
++xgetwc (FILE *stream)
++{
++  wint_t c = getwc (stream);
++  if (c == WEOF && ferror (stream))
++    error (EXIT_FAILURE, errno, _("read error"));
++  return c;
++}
++
++static inline int
++xwcwidth (wchar_t wc)
++{
++  int w = wcwidth (wc);
++  return w < 0 ? 0 : w;
++}
++
+ /* Extra ctype(3)-style macros.  */
+ 
+-#define isopen(c)	(strchr ("([`'\"", c) != NULL)
+-#define isclose(c)	(strchr (")]'\"", c) != NULL)
+-#define isperiod(c)	(strchr (".?!", c) != NULL)
++#define isopen(c)	\
++  (wcschr (L"([`'\"\u2018\u201A\u201B\u201C\u201E\u201F", c) != NULL)
++#define isclose(c)	(wcschr (L")]'\"\u2018\u2019\u201C\u201D", c) != NULL)
++#define isperiod(c)	(wcschr (L".?!", c) != NULL)
+ 
+ /* Size of a tab stop, for expansion on input and re-introduction on
+    output.  */
+@@ -133,8 +153,9 @@
+ 
+     /* Static attributes determined during input.  */
+ 
+-    const char *text;		/* the text of the word */
+-    int length;			/* length of this word */
++    const wchar_t *text;	/* the text of the word */
++    int length;			/* length of this word, in characters */
++    int width;			/* width of this word, in columns */
+     int space;			/* the size of the following space */
+     unsigned int paren:1;	/* starts with open paren */
+     unsigned int period:1;	/* ends in [.?!])* */
+@@ -143,7 +164,7 @@
+ 
+     /* The remaining fields are computed during the optimization.  */
+ 
+-    int line_length;		/* length of the best line starting here */
++    int line_width;		/* width of the best line starting here */
+     COST best_cost;		/* cost of best paragraph starting here */
+     WORD *next_break;		/* break which achieves best_cost */
+   };
+@@ -153,16 +174,16 @@
+ static void set_prefix (char *p);
+ static void fmt (FILE *f);
+ static bool get_paragraph (FILE *f);
+-static int get_line (FILE *f, int c);
+-static int get_prefix (FILE *f);
+-static int get_space (FILE *f, int c);
+-static int copy_rest (FILE *f, int c);
+-static bool same_para (int c);
++static wint_t get_line (FILE *f, wint_t c);
++static wint_t get_prefix (FILE *f);
++static wint_t get_space (FILE *f, wint_t c);
++static wint_t copy_rest (FILE *f, wint_t c);
++static bool same_para (wint_t c);
+ static void flush_paragraph (void);
+ static void fmt_paragraph (void);
+ static void check_punctuation (WORD *w);
+ static COST base_cost (WORD *this);
+-static COST line_cost (WORD *next, int len);
++static COST line_cost (WORD *next, int wid);
+ static void put_paragraph (WORD *finish);
+ static void put_line (WORD *w, int indent);
+ static void put_word (WORD *w);
+@@ -185,8 +206,11 @@
+ /* If true, don't preserve inter-word spacing (default false).  */
+ static bool uniform;
+ 
++/* Spaces after a sentence: 2 by default and with -u, 1 with -n.  */
++static int sentence_space = 2;
++
+ /* Prefix minus leading and trailing spaces (default "").  */
+-static const char *prefix;
++static wchar_t *prefix;
+ 
+ /* User-supplied maximum line width (default WIDTH).  The only output
+    lines longer than this will each comprise a single word.  */
+@@ -194,14 +218,14 @@
+ 
+ /* Values derived from the option values.  */
+ 
+-/* The length of prefix minus leading space.  */
+-static int prefix_full_length;
++/* The width of prefix minus leading space.  */
++static int prefix_full_width;
+ 
+-/* The length of the leading space trimmed from the prefix.  */
++/* The width of the leading space trimmed from the prefix.  */
+ static int prefix_lead_space;
+ 
+-/* The length of prefix minus leading and trailing space.  */
+-static int prefix_length;
++/* The width of prefix minus leading and trailing space.  */
++static int prefix_width;
+ 
+ /* The preferred width of text lines, set to LEEWAY % less than max_width.  */
+ static int best_width;
+@@ -216,10 +240,10 @@
+ 
+ /* Space for the paragraph text -- longer paragraphs are handled neatly
+    (cf. flush_paragraph()).  */
+-static char parabuf[MAXCHARS];
++static wchar_t parabuf[MAXCHARS];
+ 
+ /* A pointer into parabuf, indicating the first unused character position.  */
+-static char *wptr;
++static wchar_t *wptr;
+ 
+ /* The words of a paragraph -- longer paragraphs are handled neatly
+    (cf. flush_paragraph()).  */
+@@ -251,16 +275,16 @@
+    prefix (next_prefix_indent).  See get_paragraph() and copy_rest().  */
+ 
+ /* The last character read from the input file.  */
+-static int next_char;
++static wint_t next_char;
+ 
+ /* The space before the trimmed prefix (or part of it) on the next line
+    after the current paragraph.  */
+ static int next_prefix_indent;
+ 
+-/* If nonzero, the length of the last line output in the current
++/* If nonzero, the width of the last line output in the current
+    paragraph, used to charge for raggedness at the split point for long
+    paragraphs chosen by fmt_paragraph().  */
+-static int last_line_length;
++static int last_line_width;
+ 
+ void
+ usage (int status)
+@@ -289,6 +313,7 @@
+       fputs (_("\
+   -t, --tagged-paragraph    indentation of first line different from second\n\
+   -u, --uniform-spacing     one space between words, two after sentences\n\
++  -n, --single-spacing      one space between words and after sentences\n\
+   -w, --width=WIDTH         maximum line width (default of 75 columns)\n\
+ "), stdout);
+       fputs (HELP_OPTION_DESCRIPTION, stdout);
+@@ -311,6 +336,7 @@
+   {"split-only", no_argument, NULL, 's'},
+   {"tagged-paragraph", no_argument, NULL, 't'},
+   {"uniform-spacing", no_argument, NULL, 'u'},
++  {"single-spacing", no_argument, NULL, 'n'},
+   {"width", required_argument, NULL, 'w'},
+   {GETOPT_HELP_OPTION_DECL},
+   {GETOPT_VERSION_OPTION_DECL},
+@@ -334,8 +360,8 @@
+ 
+   crown = tagged = split = uniform = false;
+   max_width = WIDTH;
+-  prefix = "";
+-  prefix_length = prefix_lead_space = prefix_full_length = 0;
++  prefix = L"";
++  prefix_width = prefix_lead_space = prefix_full_width = 0;
+ 
+   if (argc > 1 && argv[1][0] == '-' && ISDIGIT (argv[1][1]))
+     {
+@@ -348,7 +374,7 @@
+       argc--;
+     }
+ 
+-  while ((optchar = getopt_long (argc, argv, "0123456789cstuw:p:",
++  while ((optchar = getopt_long (argc, argv, "0123456789cstunw:p:",
+ 				 long_options, NULL))
+ 	 != -1)
+     switch (optchar)
+@@ -374,6 +400,12 @@
+ 
+       case 'u':
+ 	uniform = true;
++        sentence_space = 2;
++	break;
++
++      case 'n':
++	uniform = true;
++        sentence_space = 1;
+ 	break;
+ 
+       case 'w':
+@@ -440,26 +472,32 @@
+ }
+ 
+ /* Trim space from the front and back of the string P, yielding the prefix,
+-   and record the lengths of the prefix and the space trimmed.  */
++   and record the widths of the prefix and the space trimmed.  */
+ 
+ static void
+ set_prefix (char *p)
+ {
+-  char *s;
++  size_t len;
++  wchar_t *s;
+ 
+   prefix_lead_space = 0;
+-  while (*p == ' ')
++  while (*p == L' ')
+     {
+       prefix_lead_space++;
+       p++;
+     }
+-  prefix = p;
+-  prefix_full_length = strlen (p);
+-  s = p + prefix_full_length;
+-  while (s > p && s[-1] == ' ')
+-    s--;
+-  *s = '\0';
+-  prefix_length = s - p;
++  len = mbsrtowcs (NULL, (const char **) &p, 0, NULL);	/* count only */
++  if (len == (size_t) -1)	/* P is not valid in the current locale */
++    error (EXIT_FAILURE, errno, _("invalid prefix"));
++  prefix = xmalloc ((len + 1) * sizeof (wchar_t));	/* + 1 for L'\0' */
++  mbsrtowcs (prefix, (const char **) &p, len + 1, NULL);
++  prefix_full_width = 0;	/* do not accumulate over repeated -p */
++  for (s = prefix; *s; s++)
++    prefix_full_width += xwcwidth (*s);
++  prefix_width = prefix_full_width;
++  for (; s > prefix && s[-1] == L' '; s--)	/* trim trailing spaces */
++    prefix_width--;
++  *s = L'\0';
+ }
+ 
+ /* read file F and send formatted output to stdout.  */
+@@ -528,24 +566,24 @@
+ static bool
+ get_paragraph (FILE *f)
+ {
+-  int c;
++  wint_t c;
+ 
+-  last_line_length = 0;
++  last_line_width = 0;
+   c = next_char;
+ 
+   /* Scan (and copy) blank lines, and lines not introduced by the prefix.  */
+ 
+-  while (c == '\n' || c == EOF
++  while (c == L'\n' || c == WEOF
+ 	 || next_prefix_indent < prefix_lead_space
+-	 || in_column < next_prefix_indent + prefix_full_length)
++	 || in_column < next_prefix_indent + prefix_full_width)
+     {
+       c = copy_rest (f, c);
+-      if (c == EOF)
++      if (c == WEOF)
+ 	{
+-	  next_char = EOF;
++	  next_char = WEOF;
+ 	  return false;
+ 	}
+-      putchar ('\n');
++      putwchar (L'\n');
+       c = get_prefix (f);
+     }
+ 
+@@ -601,23 +639,23 @@
+    that failed to match the prefix.  In the latter, C is \n or EOF.
+    Return the character (\n or EOF) ending the line.  */
+ 
+-static int
+-copy_rest (FILE *f, int c)
++static wint_t
++copy_rest (FILE *f, wint_t c)
+ {
+-  const char *s;
++  const wchar_t *s;
+ 
+   out_column = 0;
+-  if (in_column > next_prefix_indent && c != '\n' && c != EOF)
++  if (in_column > next_prefix_indent && c != L'\n' && c != WEOF)
+     {
+       put_space (next_prefix_indent);
+       for (s = prefix; out_column != in_column && *s; out_column++)
+-	putchar (*s++);
++	putwchar (*s++);
+       put_space (in_column - out_column);
+     }
+-  while (c != '\n' && c != EOF)
++  while (c != L'\n' && c != WEOF)
+     {
+-      putchar (c);
+-      c = getc (f);
++      putwchar (c);
++      c = xgetwc (f);
+     }
+   return c;
+ }
+@@ -627,11 +665,11 @@
+    otherwise false.  */
+ 
+ static bool
+-same_para (int c)
++same_para (wint_t c)
+ {
+   return (next_prefix_indent == prefix_indent
+-	  && in_column >= next_prefix_indent + prefix_full_length
+-	  && c != '\n' && c != EOF);
++	  && in_column >= next_prefix_indent + prefix_full_width
++	  && c != L'\n' && c != WEOF);
+ }
+ 
+ /* Read a line from input file F, given first non-blank character C
+@@ -642,11 +680,11 @@
+ 
+    Return the first non-blank character of the next line.  */
+ 
+-static int
+-get_line (FILE *f, int c)
++static wint_t
++get_line (FILE *f, wint_t c)
+ {
+   int start;
+-  char *end_of_parabuf;
++  wchar_t *end_of_parabuf;
+   WORD *end_of_word;
+ 
+   end_of_parabuf = &parabuf[MAXCHARS];
+@@ -658,6 +696,7 @@
+       /* Scan word.  */
+ 
+       word_limit->text = wptr;
++      word_limit->width = 0;
+       do
+ 	{
+ 	  if (wptr == end_of_parabuf)
+@@ -666,10 +705,12 @@
+ 	      flush_paragraph ();
+ 	    }
+ 	  *wptr++ = c;
+-	  c = getc (f);
++          word_limit->width += xwcwidth (c);
++	  c = xgetwc (f);
+ 	}
+-      while (c != EOF && !isspace (c));
+-      in_column += word_limit->length = wptr - word_limit->text;
++      while (c != WEOF && !isspace (c));
++      word_limit->length = wptr - word_limit->text;
++      in_column += word_limit->width;
+       check_punctuation (word_limit);
+ 
+       /* Scan inter-word space.  */
+@@ -677,48 +718,48 @@
+       start = in_column;
+       c = get_space (f, c);
+       word_limit->space = in_column - start;
+-      word_limit->final = (c == EOF
++      word_limit->final = (c == WEOF
+ 			   || (word_limit->period
+-			       && (c == '\n' || word_limit->space > 1)));
+-      if (c == '\n' || c == EOF || uniform)
+-	word_limit->space = word_limit->final ? 2 : 1;
++			       && (c == L'\n' || word_limit->space > 1)));
++      if (c == L'\n' || c == WEOF || uniform)
++	word_limit->space = word_limit->final ? sentence_space : 1;
+       if (word_limit == end_of_word)
+ 	{
+ 	  set_other_indent (true);
+ 	  flush_paragraph ();
+ 	}
+       word_limit++;
+-      if (c == EOF)
+-	return EOF;
++      if (c == WEOF)
++	return WEOF;
+     }
+-  while (c != '\n');
++  while (c != L'\n');
+   return get_prefix (f);
+ }
+ 
+ /* Read a prefix from input file F.  Return either first non-matching
+    character, or first non-blank character after the prefix.  */
+ 
+-static int
++static wint_t
+ get_prefix (FILE *f)
+ {
+-  int c;
++  wint_t c;
+ 
+   in_column = 0;
+-  c = get_space (f, getc (f));
+-  if (prefix_length == 0)
++  c = get_space (f, xgetwc (f));
++  if (prefix_width == 0)
+     next_prefix_indent = prefix_lead_space < in_column ?
+       prefix_lead_space : in_column;
+   else
+     {
+-      const char *p;
++      const wchar_t *p;
+       next_prefix_indent = in_column;
+-      for (p = prefix; *p != '\0'; p++)
++      for (p = prefix; *p != L'\0'; p++)
+ 	{
+-	  unsigned char pc = *p;
++	  wchar_t pc = *p;
+ 	  if (c != pc)
+ 	    return c;
+ 	  in_column++;
+-	  c = getc (f);
++	  c = xgetwc (f);
+ 	}
+       c = get_space (f, c);
+     }
+@@ -728,21 +769,21 @@
+ /* Read blank characters from input file F, starting with C, and keeping
+    in_column up-to-date.  Return first non-blank character.  */
+ 
+-static int
+-get_space (FILE *f, int c)
++static wint_t
++get_space (FILE *f, wint_t c)
+ {
+   for (;;)
+     {
+-      if (c == ' ')
++      if (c == L' ')
+ 	in_column++;
+-      else if (c == '\t')
++      else if (c == L'\t')
+ 	{
+ 	  tabs = true;
+ 	  in_column = (in_column / TABWIDTH + 1) * TABWIDTH;
+ 	}
+       else
+ 	return c;
+-      c = getc (f);
++      c = xgetwc (f);
+     }
+ }
+ 
+@@ -751,9 +792,9 @@
+ static void
+ check_punctuation (WORD *w)
+ {
+-  char const *start = w->text;
+-  char const *finish = start + (w->length - 1);
+-  unsigned char fin = *finish;
++  wchar_t const *start = w->text;
++  wchar_t const *finish = start + (w->length - 1);
++  wchar_t fin = *finish;
+ 
+   w->paren = isopen (*start);
+   w->punct = !! ispunct (fin);
+@@ -777,7 +818,9 @@
+ 
+   if (word_limit == word)
+     {
+-      fwrite (parabuf, sizeof *parabuf, wptr - parabuf, stdout);
++      wchar_t *outptr;
++      for (outptr = parabuf; outptr < wptr; outptr++)
++        putwchar (*outptr);
+       wptr = parabuf;
+       return;
+     }
+@@ -809,7 +852,8 @@
+   /* Copy text of words down to start of parabuf -- we use memmove because
+      the source and target may overlap.  */
+ 
+-  memmove (parabuf, split_point->text, wptr - split_point->text);
++  memmove (parabuf, split_point->text,
++           (wptr - split_point->text) * sizeof (wchar_t));
+   shift = split_point->text - parabuf;
+   wptr -= shift;
+ 
+@@ -833,53 +877,53 @@
+ fmt_paragraph (void)
+ {
+   WORD *start, *w;
+-  int len;
++  int wid;
+   COST wcost, best;
+-  int saved_length;
++  int saved_width;
+ 
+   word_limit->best_cost = 0;
+-  saved_length = word_limit->length;
+-  word_limit->length = max_width;	/* sentinel */
++  saved_width = word_limit->width;
++  word_limit->width = max_width;	/* sentinel */
+ 
+   for (start = word_limit - 1; start >= word; start--)
+     {
+       best = MAXCOST;
+-      len = start == word ? first_indent : other_indent;
++      wid = start == word ? first_indent : other_indent;
+ 
+       /* At least one word, however long, in the line.  */
+ 
+       w = start;
+-      len += w->length;
++      wid += w->width;
+       do
+ 	{
+ 	  w++;
+ 
+ 	  /* Consider breaking before w.  */
+ 
+-	  wcost = line_cost (w, len) + w->best_cost;
+-	  if (start == word && last_line_length > 0)
+-	    wcost += RAGGED_COST (len - last_line_length);
++	  wcost = line_cost (w, wid) + w->best_cost;
++	  if (start == word && last_line_width > 0)
++	    wcost += RAGGED_COST (wid - last_line_width);
+ 	  if (wcost < best)
+ 	    {
+ 	      best = wcost;
+ 	      start->next_break = w;
+-	      start->line_length = len;
++	      start->line_width = wid;
+ 	    }
+ 
+-	  /* This is a kludge to keep us from computing `len' as the
+-	     sum of the sentinel length and some non-zero number.
+-	     Since the sentinel w->length may be INT_MAX, adding
++	  /* This is a kludge to keep us from computing `wid' as the
++	     sum of the sentinel width and some non-zero number.
++	     Since the sentinel w->width may be INT_MAX, adding
+ 	     to that would give a negative result.  */
+ 	  if (w == word_limit)
+ 	    break;
+ 
+-	  len += (w - 1)->space + w->length;	/* w > start >= word */
++	  wid += (w - 1)->space + w->width;	/* w > start >= word */
+ 	}
+-      while (len < max_width);
++      while (wid < max_width);
+       start->best_cost = best + base_cost (start);
+     }
+ 
+-  word_limit->length = saved_length;
++  word_limit->width = saved_width;
+ }
+ 
+ /* Return the constant component of the cost of breaking before the
<<Diff was trimmed, longer than 597 lines>>