SOURCES: coreutils-fmt-wchars.patch (NEW) - coreutils-fmt-wchars.p...
qrczak
qrczak at pld-linux.org
Tue Feb 13 16:59:09 CET 2007
Author: qrczak Date: Tue Feb 13 15:59:09 2007 GMT
Module: SOURCES Tag: HEAD
---- Log message:
- coreutils-fmt-wchars.patch: Added support for multibyte encodings
and wcwidth. Added -n / --single-spacing option, which is like -u
but uses a single space between sentences.
- Release 2
---- Files affected:
SOURCES:
coreutils-fmt-wchars.patch (NONE -> 1.1) (NEW)
---- Diffs:
================================================================
Index: SOURCES/coreutils-fmt-wchars.patch
diff -u /dev/null SOURCES/coreutils-fmt-wchars.patch:1.1
--- /dev/null Tue Feb 13 16:59:09 2007
+++ SOURCES/coreutils-fmt-wchars.patch Tue Feb 13 16:59:04 2007
@@ -0,0 +1,686 @@
+--- coreutils-6.7/src/fmt.c.orig 2006-10-22 18:54:15.000000000 +0200
++++ coreutils-6.7/src/fmt.c 2007-02-13 16:51:44.000000000 +0100
+@@ -18,6 +18,7 @@
+ /* Written by Ross Paterson <rap at doc.ic.ac.uk>. */
+
+ #include <config.h>
++#include <wchar.h>
+ #include <stdio.h>
+ #include <sys/types.h>
+ #include <getopt.h>
+@@ -39,7 +40,7 @@
+ /* The following parameters represent the program's idea of what is
+ "best". Adjust to taste, subject to the caveats given. */
+
+-/* Default longest permitted line length (max_width). */
++/* Default longest permitted line width (max_width). */
+ #define WIDTH 75
+
+ /* Prefer lines to be LEEWAY % shorter than the maximum width, giving
+@@ -51,7 +52,7 @@
+ #define DEF_INDENT 3
+
+ /* Costs and bonuses are expressed as the equivalent departure from the
+- optimal line length, multiplied by 10. e.g. assigning something a
++ optimal line width, multiplied by 10. e.g. assigning something a
+ cost of 50 means that it is as bad as a line 5 characters too short
+ or too long. The definition of SHORT_COST(n) should not be changed.
+ However, EQUIV(n) may need tuning. */
+@@ -78,11 +79,11 @@
+ #define LINE_COST EQUIV (70)
+
+ /* Cost of breaking a line after the first word of a sentence, where
+- the length of the word is N. */
++ the width of the word is N. */
+ #define WIDOW_COST(n) (EQUIV (200) / ((n) + 2))
+
+ /* Cost of breaking a line before the last word of a sentence, where
+- the length of the word is N. */
++ the width of the word is N. */
+ #define ORPHAN_COST(n) (EQUIV (150) / ((n) + 2))
+
+ /* Bonus for breaking a line at the end of a sentence. */
+@@ -114,11 +115,30 @@
+ #define MAXWORDS 1000
+ #define MAXCHARS 5000
+
++/* Wide character support */
++
++static wint_t /* read one wide character from STREAM; the result of getwc is returned as is */
++xgetwc (FILE *stream)
++{
++ wint_t c = getwc (stream);
++ if (c == WEOF && ferror (stream)) /* WEOF with the stream error flag set, as opposed to plain end of input */
++ error (EXIT_FAILURE, errno, _("read error"));
++ return c;
++}
++
++static inline int /* column width of WC as reported by wcwidth, clamped so it is never negative */
++xwcwidth (wchar_t wc)
++{
++ int w = wcwidth (wc);
++ return w < 0 ? 0 : w; /* a negative wcwidth result is counted as zero columns */
++}
++
+ /* Extra ctype(3)-style macros. */
+
+-#define isopen(c) (strchr ("([`'\"", c) != NULL)
+-#define isclose(c) (strchr (")]'\"", c) != NULL)
+-#define isperiod(c) (strchr (".?!", c) != NULL)
++#define isopen(c) \
++ (wcschr (L"([`'\"\u2018\u201A\u201B\u201C\u201E\u201F", c) != NULL)
++#define isclose(c) (wcschr (L")]'\"\u2018\u2019\u201C\u201D", c) != NULL)
++#define isperiod(c) (wcschr (L".?!", c) != NULL)
+
+ /* Size of a tab stop, for expansion on input and re-introduction on
+ output. */
+@@ -133,8 +153,9 @@
+
+ /* Static attributes determined during input. */
+
+- const char *text; /* the text of the word */
+- int length; /* length of this word */
++ const wchar_t *text; /* the text of the word */
++ int length; /* length of this word, in characters */
++ int width; /* width of this word, in columns */
+ int space; /* the size of the following space */
+ unsigned int paren:1; /* starts with open paren */
+ unsigned int period:1; /* ends in [.?!])* */
+@@ -143,7 +164,7 @@
+
+ /* The remaining fields are computed during the optimization. */
+
+- int line_length; /* length of the best line starting here */
++ int line_width; /* width of the best line starting here */
+ COST best_cost; /* cost of best paragraph starting here */
+ WORD *next_break; /* break which achieves best_cost */
+ };
+@@ -153,16 +174,16 @@
+ static void set_prefix (char *p);
+ static void fmt (FILE *f);
+ static bool get_paragraph (FILE *f);
+-static int get_line (FILE *f, int c);
+-static int get_prefix (FILE *f);
+-static int get_space (FILE *f, int c);
+-static int copy_rest (FILE *f, int c);
+-static bool same_para (int c);
++static wint_t get_line (FILE *f, wint_t c);
++static wint_t get_prefix (FILE *f);
++static wint_t get_space (FILE *f, wint_t c);
++static wint_t copy_rest (FILE *f, wint_t c);
++static bool same_para (wint_t c);
+ static void flush_paragraph (void);
+ static void fmt_paragraph (void);
+ static void check_punctuation (WORD *w);
+ static COST base_cost (WORD *this);
+-static COST line_cost (WORD *next, int len);
++static COST line_cost (WORD *next, int wid);
+ static void put_paragraph (WORD *finish);
+ static void put_line (WORD *w, int indent);
+ static void put_word (WORD *w);
+@@ -185,8 +206,11 @@
+ /* If true, don't preserve inter-word spacing (default false). */
+ static bool uniform;
+
++/* How many spaces to put after a sentence (1 or 2). */
++static int sentence_space;
++
+ /* Prefix minus leading and trailing spaces (default ""). */
+-static const char *prefix;
++static wchar_t *prefix;
+
+ /* User-supplied maximum line width (default WIDTH). The only output
+ lines longer than this will each comprise a single word. */
+@@ -194,14 +218,14 @@
+
+ /* Values derived from the option values. */
+
+-/* The length of prefix minus leading space. */
+-static int prefix_full_length;
++/* The width of prefix minus leading space. */
++static int prefix_full_width;
+
+-/* The length of the leading space trimmed from the prefix. */
++/* The width of the leading space trimmed from the prefix. */
+ static int prefix_lead_space;
+
+-/* The length of prefix minus leading and trailing space. */
+-static int prefix_length;
++/* The width of prefix minus leading and trailing space. */
++static int prefix_width;
+
+ /* The preferred width of text lines, set to LEEWAY % less than max_width. */
+ static int best_width;
+@@ -216,10 +240,10 @@
+
+ /* Space for the paragraph text -- longer paragraphs are handled neatly
+ (cf. flush_paragraph()). */
+-static char parabuf[MAXCHARS];
++static wchar_t parabuf[MAXCHARS];
+
+ /* A pointer into parabuf, indicating the first unused character position. */
+-static char *wptr;
++static wchar_t *wptr;
+
+ /* The words of a paragraph -- longer paragraphs are handled neatly
+ (cf. flush_paragraph()). */
+@@ -251,16 +275,16 @@
+ prefix (next_prefix_indent). See get_paragraph() and copy_rest(). */
+
+ /* The last character read from the input file. */
+-static int next_char;
++static wint_t next_char;
+
+ /* The space before the trimmed prefix (or part of it) on the next line
+ after the current paragraph. */
+ static int next_prefix_indent;
+
+-/* If nonzero, the length of the last line output in the current
++/* If nonzero, the width of the last line output in the current
+ paragraph, used to charge for raggedness at the split point for long
+ paragraphs chosen by fmt_paragraph(). */
+-static int last_line_length;
++static int last_line_width;
+
+ void
+ usage (int status)
+@@ -289,6 +313,7 @@
+ fputs (_("\
+ -t, --tagged-paragraph indentation of first line different from second\n\
+ -u, --uniform-spacing one space between words, two after sentences\n\
++ -n, --single-spacing one space between words and after sentences\n\
+ -w, --width=WIDTH maximum line width (default of 75 columns)\n\
+ "), stdout);
+ fputs (HELP_OPTION_DESCRIPTION, stdout);
+@@ -311,6 +336,7 @@
+ {"split-only", no_argument, NULL, 's'},
+ {"tagged-paragraph", no_argument, NULL, 't'},
+ {"uniform-spacing", no_argument, NULL, 'u'},
++ {"single-spacing", no_argument, NULL, 'n'},
+ {"width", required_argument, NULL, 'w'},
+ {GETOPT_HELP_OPTION_DECL},
+ {GETOPT_VERSION_OPTION_DECL},
+@@ -334,8 +360,8 @@
+
+ crown = tagged = split = uniform = false;
+ max_width = WIDTH;
+- prefix = "";
+- prefix_length = prefix_lead_space = prefix_full_length = 0;
++ prefix = L"";
++ prefix_width = prefix_lead_space = prefix_full_width = 0;
+
+ if (argc > 1 && argv[1][0] == '-' && ISDIGIT (argv[1][1]))
+ {
+@@ -348,7 +374,7 @@
+ argc--;
+ }
+
+- while ((optchar = getopt_long (argc, argv, "0123456789cstuw:p:",
++ while ((optchar = getopt_long (argc, argv, "0123456789cstunw:p:",
+ long_options, NULL))
+ != -1)
+ switch (optchar)
+@@ -374,6 +400,12 @@
+
+ case 'u':
+ uniform = true;
++ sentence_space = 2;
++ break;
++
++ case 'n':
++ uniform = true;
++ sentence_space = 1;
+ break;
+
+ case 'w':
+@@ -440,26 +472,32 @@
+ }
+
+ /* Trim space from the front and back of the string P, yielding the prefix,
+- and record the lengths of the prefix and the space trimmed. */
++ and record the widths of the prefix and the space trimmed. */
+
+ static void
+ set_prefix (char *p)
+ {
+- char *s;
++ size_t len;
++ wchar_t *s;
+
+ prefix_lead_space = 0;
+- while (*p == ' ')
++ while (*p == L' ')
+ {
+ prefix_lead_space++;
+ p++;
+ }
+- prefix = p;
+- prefix_full_length = strlen (p);
+- s = p + prefix_full_length;
+- while (s > p && s[-1] == ' ')
+- s--;
+- *s = '\0';
+- prefix_length = s - p;
++ len = mbsrtowcs (NULL, (const char **) &p, 0, NULL); /* count only; P is not advanced */
++ if (len == (size_t) -1) /* P is not a valid multibyte string in this locale */
++ error (EXIT_FAILURE, errno, _("invalid prefix"));
++ prefix = xmalloc ((len + 1) * sizeof (wchar_t)); /* + 1 for the terminating L'\0' */
++ mbsrtowcs (prefix, (const char **) &p, len + 1, NULL); /* len + 1 so the terminator is stored */
++ prefix_full_width = 0; /* assign, do not accumulate: -p may be given more than once */
++ for (s = prefix; *s; s++)
++ prefix_full_width += xwcwidth (*s);
++ prefix_width = prefix_full_width;
++ for (; s > prefix && s[-1] == L' '; s--) /* trim trailing spaces, one column each */
++ prefix_width--;
++ *s = L'\0';
+ }
+
+ /* read file F and send formatted output to stdout. */
+@@ -528,24 +566,24 @@
+ static bool
+ get_paragraph (FILE *f)
+ {
+- int c;
++ wint_t c;
+
+- last_line_length = 0;
++ last_line_width = 0;
+ c = next_char;
+
+ /* Scan (and copy) blank lines, and lines not introduced by the prefix. */
+
+- while (c == '\n' || c == EOF
++ while (c == L'\n' || c == WEOF
+ || next_prefix_indent < prefix_lead_space
+- || in_column < next_prefix_indent + prefix_full_length)
++ || in_column < next_prefix_indent + prefix_full_width)
+ {
+ c = copy_rest (f, c);
+- if (c == EOF)
++ if (c == WEOF)
+ {
+- next_char = EOF;
++ next_char = WEOF;
+ return false;
+ }
+- putchar ('\n');
++ putwchar (L'\n');
+ c = get_prefix (f);
+ }
+
+@@ -601,23 +639,23 @@
+ that failed to match the prefix. In the latter, C is \n or EOF.
+ Return the character (\n or EOF) ending the line. */
+
+-static int
+-copy_rest (FILE *f, int c)
++static wint_t
++copy_rest (FILE *f, wint_t c)
+ {
+- const char *s;
++ const wchar_t *s;
+
+ out_column = 0;
+- if (in_column > next_prefix_indent && c != '\n' && c != EOF)
++ if (in_column > next_prefix_indent && c != L'\n' && c != WEOF)
+ {
+ put_space (next_prefix_indent);
+ for (s = prefix; out_column != in_column && *s; out_column++)
+- putchar (*s++);
++ putwchar (*s++);
+ put_space (in_column - out_column);
+ }
+- while (c != '\n' && c != EOF)
++ while (c != L'\n' && c != WEOF)
+ {
+- putchar (c);
+- c = getc (f);
++ putwchar (c);
++ c = xgetwc (f);
+ }
+ return c;
+ }
+@@ -627,11 +665,11 @@
+ otherwise false. */
+
+ static bool
+-same_para (int c)
++same_para (wint_t c)
+ {
+ return (next_prefix_indent == prefix_indent
+- && in_column >= next_prefix_indent + prefix_full_length
+- && c != '\n' && c != EOF);
++ && in_column >= next_prefix_indent + prefix_full_width
++ && c != L'\n' && c != WEOF);
+ }
+
+ /* Read a line from input file F, given first non-blank character C
+@@ -642,11 +680,11 @@
+
+ Return the first non-blank character of the next line. */
+
+-static int
+-get_line (FILE *f, int c)
++static wint_t
++get_line (FILE *f, wint_t c)
+ {
+ int start;
+- char *end_of_parabuf;
++ wchar_t *end_of_parabuf;
+ WORD *end_of_word;
+
+ end_of_parabuf = &parabuf[MAXCHARS];
+@@ -658,6 +696,7 @@
+ /* Scan word. */
+
+ word_limit->text = wptr;
++ word_limit->width = 0;
+ do
+ {
+ if (wptr == end_of_parabuf)
+@@ -666,10 +705,12 @@
+ flush_paragraph ();
+ }
+ *wptr++ = c;
+- c = getc (f);
++ word_limit->width += xwcwidth (c);
++ c = xgetwc (f);
+ }
+- while (c != EOF && !isspace (c));
+- in_column += word_limit->length = wptr - word_limit->text;
++ while (c != WEOF && !isspace (c));
++ word_limit->length = wptr - word_limit->text;
++ in_column += word_limit->width;
+ check_punctuation (word_limit);
+
+ /* Scan inter-word space. */
+@@ -677,48 +718,48 @@
+ start = in_column;
+ c = get_space (f, c);
+ word_limit->space = in_column - start;
+- word_limit->final = (c == EOF
++ word_limit->final = (c == WEOF
+ || (word_limit->period
+- && (c == '\n' || word_limit->space > 1)));
+- if (c == '\n' || c == EOF || uniform)
+- word_limit->space = word_limit->final ? 2 : 1;
++ && (c == L'\n' || word_limit->space > 1)));
++ if (c == L'\n' || c == WEOF || uniform)
++ word_limit->space = word_limit->final ? sentence_space : 1;
+ if (word_limit == end_of_word)
+ {
+ set_other_indent (true);
+ flush_paragraph ();
+ }
+ word_limit++;
+- if (c == EOF)
+- return EOF;
++ if (c == WEOF)
++ return WEOF;
+ }
+- while (c != '\n');
++ while (c != L'\n');
+ return get_prefix (f);
+ }
+
+ /* Read a prefix from input file F. Return either first non-matching
+ character, or first non-blank character after the prefix. */
+
+-static int
++static wint_t
+ get_prefix (FILE *f)
+ {
+- int c;
++ wint_t c;
+
+ in_column = 0;
+- c = get_space (f, getc (f));
+- if (prefix_length == 0)
++ c = get_space (f, xgetwc (f));
++ if (prefix_width == 0)
+ next_prefix_indent = prefix_lead_space < in_column ?
+ prefix_lead_space : in_column;
+ else
+ {
+- const char *p;
++ const wchar_t *p;
+ next_prefix_indent = in_column;
+- for (p = prefix; *p != '\0'; p++)
++ for (p = prefix; *p != L'\0'; p++)
+ {
+- unsigned char pc = *p;
++ wchar_t pc = *p;
+ if (c != pc)
+ return c;
+ in_column++;
+- c = getc (f);
++ c = xgetwc (f);
+ }
+ c = get_space (f, c);
+ }
+@@ -728,21 +769,21 @@
+ /* Read blank characters from input file F, starting with C, and keeping
+ in_column up-to-date. Return first non-blank character. */
+
+-static int
+-get_space (FILE *f, int c)
++static wint_t
++get_space (FILE *f, wint_t c)
+ {
+ for (;;)
+ {
+- if (c == ' ')
++ if (c == L' ')
+ in_column++;
+- else if (c == '\t')
++ else if (c == L'\t')
+ {
+ tabs = true;
+ in_column = (in_column / TABWIDTH + 1) * TABWIDTH;
+ }
+ else
+ return c;
+- c = getc (f);
++ c = xgetwc (f);
+ }
+ }
+
+@@ -751,9 +792,9 @@
+ static void
+ check_punctuation (WORD *w)
+ {
+- char const *start = w->text;
+- char const *finish = start + (w->length - 1);
+- unsigned char fin = *finish;
++ wchar_t const *start = w->text;
++ wchar_t const *finish = start + (w->length - 1);
++ wchar_t fin = *finish;
+
+ w->paren = isopen (*start);
+ w->punct = !! ispunct (fin);
+@@ -777,7 +818,9 @@
+
+ if (word_limit == word)
+ {
+- fwrite (parabuf, sizeof *parabuf, wptr - parabuf, stdout);
++ wchar_t *outptr;
++ for (outptr = parabuf; outptr < wptr; outptr++)
++ putwchar (*outptr);
+ wptr = parabuf;
+ return;
+ }
+@@ -809,7 +852,8 @@
+ /* Copy text of words down to start of parabuf -- we use memmove because
+ the source and target may overlap. */
+
+- memmove (parabuf, split_point->text, wptr - split_point->text);
++ memmove (parabuf, split_point->text,
++ (wptr - split_point->text) * sizeof (wchar_t));
+ shift = split_point->text - parabuf;
+ wptr -= shift;
+
+@@ -833,53 +877,53 @@
+ fmt_paragraph (void)
+ {
+ WORD *start, *w;
+- int len;
++ int wid;
+ COST wcost, best;
+- int saved_length;
++ int saved_width;
+
+ word_limit->best_cost = 0;
+- saved_length = word_limit->length;
+- word_limit->length = max_width; /* sentinel */
++ saved_width = word_limit->width;
++ word_limit->width = max_width; /* sentinel */
+
+ for (start = word_limit - 1; start >= word; start--)
+ {
+ best = MAXCOST;
+- len = start == word ? first_indent : other_indent;
++ wid = start == word ? first_indent : other_indent;
+
+ /* At least one word, however long, in the line. */
+
+ w = start;
+- len += w->length;
++ wid += w->width;
+ do
+ {
+ w++;
+
+ /* Consider breaking before w. */
+
+- wcost = line_cost (w, len) + w->best_cost;
+- if (start == word && last_line_length > 0)
+- wcost += RAGGED_COST (len - last_line_length);
++ wcost = line_cost (w, wid) + w->best_cost;
++ if (start == word && last_line_width > 0)
++ wcost += RAGGED_COST (wid - last_line_width);
+ if (wcost < best)
+ {
+ best = wcost;
+ start->next_break = w;
+- start->line_length = len;
++ start->line_width = wid;
+ }
+
+- /* This is a kludge to keep us from computing `len' as the
+- sum of the sentinel length and some non-zero number.
+- Since the sentinel w->length may be INT_MAX, adding
++ /* This is a kludge to keep us from computing `wid' as the
++ sum of the sentinel width and some non-zero number.
++ Since the sentinel w->width may be INT_MAX, adding
+ to that would give a negative result. */
+ if (w == word_limit)
+ break;
+
+- len += (w - 1)->space + w->length; /* w > start >= word */
++ wid += (w - 1)->space + w->width; /* w > start >= word */
+ }
+- while (len < max_width);
++ while (wid < max_width);
+ start->best_cost = best + base_cost (start);
+ }
+
+- word_limit->length = saved_length;
++ word_limit->width = saved_width;
+ }
+
+ /* Return the constant component of the cost of breaking before the
<<Diff was trimmed, longer than 597 lines>>
More information about the pld-cvs-commit
mailing list