# HG changeset patch # User Bram Moolenaar # Date 1310757419 -7200 # Node ID c21429d7768cd4b834a086dd47881d01bca74e8c # Parent 03443e1c0050c911f946100935c34f7bdf0cf053 updated for version 7.3.253 Problem: "echo 'abc' > ''" returns 0 or 1, depending on 'ignorecase'. Checks in mb_strnicmp() for illegal and truncated bytes are wrong. Should not assume that byte length is equal before case folding. Solution: Add utf_safe_read_char_adv() and utf_strnicmp(). Add a test for this. (Ivan Krasilnikov) diff --git a/src/mbyte.c b/src/mbyte.c --- a/src/mbyte.c +++ b/src/mbyte.c @@ -132,6 +132,7 @@ static int utf_ptr2cells_len __ARGS((cha static int dbcs_char2cells __ARGS((int c)); static int dbcs_ptr2cells_len __ARGS((char_u *p, int size)); static int dbcs_ptr2char __ARGS((char_u *p)); +static int utf_safe_read_char_adv __ARGS((char_u **s, size_t *n)); /* * Lookup table to quickly get the length in bytes of a UTF-8 character from @@ -1701,6 +1702,66 @@ utf_ptr2char(p) } /* + * Convert a UTF-8 byte sequence to a wide character. + * String is assumed to be terminated by NUL or after "n" bytes, whichever + * comes first. + * The function is safe in the sense that it never accesses memory beyond the + * first "n" bytes of "s". + * + * On success, returns decoded codepoint, advances "s" to the beginning of + * next character and decreases "n" accordingly. + * + * If end of string was reached, returns 0 and, if "n" > 0, advances "s" past + * NUL byte. + * + * If byte sequence is illegal or incomplete, returns -1 and does not advance + * "s". + */ + static int +utf_safe_read_char_adv(s, n) + char_u **s; + size_t *n; +{ + int c, k; + + if (*n == 0) /* end of buffer */ + return 0; + + k = utf8len_tab_zero[**s]; + + if (k == 1) + { + /* ASCII character or NUL */ + (*n)--; + return *(*s)++; + } + + if ((size_t)k <= *n) + { + /* We have a multibyte sequence and it isn't truncated by buffer + * limits so utf_ptr2char() is safe to use. Or the first byte is + * illegal (k=0), and it's also safe to use utf_ptr2char(). */ + c = utf_ptr2char(*s); + + /* On failure, utf_ptr2char() returns the first byte, so here we + * check equality with the first byte. The only non-ASCII character + * which equals the first byte of its own UTF-8 representation is + * U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too. + * It's safe even if n=1, else we would have k=2 > n. */ + if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) + { + /* byte sequence was successfully decoded */ + *s += k; + *n -= k; + return c; + } + } + + /* byte sequence is incomplete or illegal */ + return -1; +} + +/* * Get character at **pp and advance *pp to the next character. * Note: composing characters are skipped! */ @@ -2667,7 +2728,8 @@ static convertStruct foldCase[] = {0x10400,0x10427,1,40} }; -static int utf_convert(int a, convertStruct table[], int tableSize); +static int utf_convert __ARGS((int a, convertStruct table[], int tableSize)); +static int utf_strnicmp __ARGS((char_u *s1, char_u *s2, size_t n1, size_t n2)); /* * Generic conversion function for case operations. @@ -3079,6 +3141,80 @@ utf_isupper(a) return (utf_tolower(a) != a); } + static int +utf_strnicmp(s1, s2, n1, n2) + char_u *s1, *s2; + size_t n1, n2; +{ + int c1, c2, cdiff; + char_u buffer[6]; + + for (;;) + { + c1 = utf_safe_read_char_adv(&s1, &n1); + c2 = utf_safe_read_char_adv(&s2, &n2); + + if (c1 <= 0 || c2 <= 0) + break; + + if (c1 == c2) + continue; + + cdiff = utf_fold(c1) - utf_fold(c2); + if (cdiff != 0) + return cdiff; + } + + /* some string ended or has an incomplete/illegal character sequence */ + + if (c1 == 0 || c2 == 0) + { + /* some string ended. shorter string is smaller */ + if (c1 == 0 && c2 == 0) + return 0; + return c1 == 0 ? -1 : 1; + } + + /* Continue with bytewise comparison to produce some result that + * would make comparison operations involving this function transitive. + * + * If only one string had an error, comparison should be made with + * folded version of the other string. In this case it is enough + * to fold just one character to determine the result of comparison. */ + + if (c1 != -1 && c2 == -1) + { + n1 = utf_char2bytes(utf_fold(c1), buffer); + s1 = buffer; + } + else if (c2 != -1 && c1 == -1) + { + n2 = utf_char2bytes(utf_fold(c2), buffer); + s2 = buffer; + } + + while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) + { + cdiff = (int)(*s1) - (int)(*s2); + if (cdiff != 0) + return cdiff; + + s1++; + s2++; + n1--; + n2--; + } + + if (n1 > 0 && *s1 == NUL) + n1 = 0; + if (n2 > 0 && *s2 == NUL) + n2 = 0; + + if (n1 == 0 && n2 == 0) + return 0; + return n1 == 0 ? -1 : 1; +} + /* * Version of strnicmp() that handles multi-byte characters. * Needed for Big5, Sjift-JIS and UTF-8 encoding. Other DBCS encodings can @@ -3092,49 +3228,21 @@ mb_strnicmp(s1, s2, nn) char_u *s1, *s2; size_t nn; { - int i, j, l; + int i, l; int cdiff; - int incomplete = FALSE; int n = (int)nn; - for (i = 0; i < n; i += l) + if (enc_utf8) { - if (s1[i] == NUL && s2[i] == NUL) /* both strings end */ - return 0; - if (enc_utf8) + return utf_strnicmp(s1, s2, nn, nn); + } + else + { + for (i = 0; i < n; i += l) { - l = utf_byte2len(s1[i]); - if (l > n - i) - { - l = n - i; /* incomplete character */ - incomplete = TRUE; - } - /* Check directly first, it's faster. */ - for (j = 0; j < l; ++j) - { - if (s1[i + j] != s2[i + j]) - break; - if (s1[i + j] == 0) - /* Both stings have the same bytes but are incomplete or - * have illegal bytes, accept them as equal. */ - l = j; - } - if (j < l) - { - /* If one of the two characters is incomplete return -1. */ - if (incomplete || i + utf_byte2len(s2[i]) > n) - return -1; - /* Don't case-fold illegal bytes or truncated characters. */ - if (utf_ptr2len(s1 + i) < l || utf_ptr2len(s2 + i) < l) - return -1; - cdiff = utf_fold(utf_ptr2char(s1 + i)) - - utf_fold(utf_ptr2char(s2 + i)); - if (cdiff != 0) - return cdiff; - } - } - else - { + if (s1[i] == NUL && s2[i] == NUL) /* both strings end */ + return 0; + l = (*mb_ptr2len)(s1 + i); if (l <= 1) { diff --git a/src/testdir/Make_amiga.mak b/src/testdir/Make_amiga.mak --- a/src/testdir/Make_amiga.mak +++ b/src/testdir/Make_amiga.mak @@ -29,7 +29,7 @@ SCRIPTS = test1.out test3.out test4.out test66.out test67.out test68.out test69.out test70.out \ test71.out test72.out test73.out test74.out test75.out \ test76.out test77.out test78.out test79.out test80.out \ - test81.out + test81.out test82.out .SUFFIXES: .in .out @@ -130,3 +130,4 @@ test78.out: test78.in test79.out: test79.in test80.out: test80.in test81.out: test81.in +test82.out: test82.in diff --git a/src/testdir/Make_dos.mak b/src/testdir/Make_dos.mak --- a/src/testdir/Make_dos.mak +++ b/src/testdir/Make_dos.mak @@ -29,7 +29,7 @@ SCRIPTS = test3.out test4.out test5.out test42.out test52.out test65.out test66.out test67.out \ test68.out test69.out test71.out test72.out test73.out \ test74.out test75.out test76.out test77.out test78.out \ - test79.out test80.out test81.out + test79.out test80.out test81.out test82.out SCRIPTS32 = test50.out test70.out diff --git a/src/testdir/Make_ming.mak b/src/testdir/Make_ming.mak --- a/src/testdir/Make_ming.mak +++ b/src/testdir/Make_ming.mak @@ -49,7 +49,7 @@ SCRIPTS = test3.out test4.out test5.out test42.out test52.out test65.out test66.out test67.out \ test68.out test69.out test71.out test72.out test73.out \ test74.out test75.out test76.out test77.out test78.out \ - test79.out test80.out test81.out + test79.out test80.out test81.out test82.out SCRIPTS32 = test50.out test70.out diff --git a/src/testdir/Make_os2.mak b/src/testdir/Make_os2.mak --- a/src/testdir/Make_os2.mak +++ b/src/testdir/Make_os2.mak @@ -29,7 +29,7 @@ SCRIPTS = test1.out test3.out test4.out test66.out test67.out test68.out test69.out test70.out \ test71.out test72.out test73.out test74.out test75.out \ test76.out test77.out test78.out test79.out test80.out \ - test81.out + test81.out test82.out .SUFFIXES: .in .out diff --git a/src/testdir/Make_vms.mms b/src/testdir/Make_vms.mms --- a/src/testdir/Make_vms.mms +++ b/src/testdir/Make_vms.mms @@ -4,7 +4,7 @@ # Authors: Zoltan Arpadffy, # Sandor Kopanyi, # -# Last change: 2011 Jun 26 +# Last change: 2011 Jul 15 # # This has been tested on VMS 6.2 to 8.3 on DEC Alpha, VAX and IA64. # Edit the lines in the Configuration section below to select. @@ -75,7 +75,8 @@ SCRIPT = test1.out test2.out test3.out test61.out test62.out test63.out test64.out test65.out \ test66.out test67.out test68.out test69.out \ test71.out test72.out test74.out test75.out test76.out \ - test77.out test78.out test79.out test80.out test81.out + test77.out test78.out test79.out test80.out test81.out \ + test82.out # Known problems: # Test 30: a problem around mac format - unknown reason diff --git a/src/testdir/Makefile b/src/testdir/Makefile --- a/src/testdir/Makefile +++ b/src/testdir/Makefile @@ -26,7 +26,7 @@ SCRIPTS = test1.out test2.out test3.out test64.out test65.out test66.out test67.out test68.out \ test69.out test70.out test71.out test72.out test73.out \ test74.out test75.out test76.out test77.out test78.out \ - test79.out test80.out test81.out + test79.out test80.out test81.out test82.out SCRIPTS_GUI = test16.out diff --git a/src/version.c b/src/version.c --- a/src/version.c +++ b/src/version.c @@ -710,6 +710,8 @@ static char *(features[]) = static int included_patches[] = { /* Add new patch number below this line */ /**/ + 253, +/**/ 252, /**/ 251,