comparison src/mbyte.c @ 2961:c21429d7768c v7.3.253

updated for version 7.3.253. Problem: "echo 'abc' > ''" returns 0 or 1, depending on 'ignorecase'. Checks in mb_strnicmp() for illegal and truncated bytes are wrong. Should not assume that byte length is equal before case folding. Solution: Add utf_safe_read_char_adv() and utf_strnicmp(). Add a test for this. (Ivan Krasilnikov)
author Bram Moolenaar <bram@vim.org>
date Fri, 15 Jul 2011 21:16:59 +0200
parents 0bef86c5c985
children 25be7c9dda54
comparison
equal deleted inserted replaced
2960:03443e1c0050 2961:c21429d7768c
130 static int dbcs_ptr2len_len __ARGS((char_u *p, int size)); 130 static int dbcs_ptr2len_len __ARGS((char_u *p, int size));
131 static int utf_ptr2cells_len __ARGS((char_u *p, int size)); 131 static int utf_ptr2cells_len __ARGS((char_u *p, int size));
132 static int dbcs_char2cells __ARGS((int c)); 132 static int dbcs_char2cells __ARGS((int c));
133 static int dbcs_ptr2cells_len __ARGS((char_u *p, int size)); 133 static int dbcs_ptr2cells_len __ARGS((char_u *p, int size));
134 static int dbcs_ptr2char __ARGS((char_u *p)); 134 static int dbcs_ptr2char __ARGS((char_u *p));
135 static int utf_safe_read_char_adv __ARGS((char_u **s, size_t *n));
135 136
136 /* 137 /*
137 * Lookup table to quickly get the length in bytes of a UTF-8 character from 138 * Lookup table to quickly get the length in bytes of a UTF-8 character from
138 * the first byte of a UTF-8 string. 139 * the first byte of a UTF-8 string.
139 * Bytes which are illegal when used as the first byte have a 1. 140 * Bytes which are illegal when used as the first byte have a 1.
1696 } 1697 }
1697 } 1698 }
1698 } 1699 }
1699 /* Illegal value, just return the first byte */ 1700 /* Illegal value, just return the first byte */
1700 return p[0]; 1701 return p[0];
1702 }
1703
1704 /*
1705 * Convert a UTF-8 byte sequence to a wide character.
1706 * String is assumed to be terminated by NUL or after "n" bytes, whichever
1707 * comes first.
1708 * The function is safe in the sense that it never accesses memory beyond the
1709 * first "n" bytes of "s".
1710 *
1711 * On success, returns decoded codepoint, advances "s" to the beginning of
1712 * next character and decreases "n" accordingly.
1713 *
1714 * If end of string was reached, returns 0 and, if "n" > 0, advances "s" past
1715 * NUL byte.
1716 *
1717 * If byte sequence is illegal or incomplete, returns -1 and does not advance
1718 * "s".
1719 */
1720 static int
1721 utf_safe_read_char_adv(s, n)
1722 char_u **s;
1723 size_t *n;
1724 {
1725 int c, k;
1726
1727 if (*n == 0) /* end of buffer */
1728 return 0;
1729
1730 k = utf8len_tab_zero[**s];
1731
1732 if (k == 1)
1733 {
1734 /* ASCII character or NUL */
1735 (*n)--;
1736 return *(*s)++;
1737 }
1738
1739 if ((size_t)k <= *n)
1740 {
1741 /* We have a multibyte sequence and it isn't truncated by buffer
1742 * limits so utf_ptr2char() is safe to use. Or the first byte is
1743 * illegal (k=0), and it's also safe to use utf_ptr2char(). */
1744 c = utf_ptr2char(*s);
1745
1746 /* On failure, utf_ptr2char() returns the first byte, so here we
1747 * check equality with the first byte. The only non-ASCII character
1748 * which equals the first byte of its own UTF-8 representation is
1749 * U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
1750 * It's safe even if n=1, else we would have k=2 > n. */
1751 if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83))
1752 {
1753 /* byte sequence was successfully decoded */
1754 *s += k;
1755 *n -= k;
1756 return c;
1757 }
1758 }
1759
1760 /* byte sequence is incomplete or illegal */
1761 return -1;
1701 } 1762 }
1702 1763
1703 /* 1764 /*
1704 * Get character at **pp and advance *pp to the next character. 1765 * Get character at **pp and advance *pp to the next character.
1705 * Note: composing characters are skipped! 1766 * Note: composing characters are skipped!
2665 {0xa78b,0xa78b,-1,1}, 2726 {0xa78b,0xa78b,-1,1},
2666 {0xff21,0xff3a,1,32}, 2727 {0xff21,0xff3a,1,32},
2667 {0x10400,0x10427,1,40} 2728 {0x10400,0x10427,1,40}
2668 }; 2729 };
2669 2730
2670 static int utf_convert(int a, convertStruct table[], int tableSize); 2731 static int utf_convert __ARGS((int a, convertStruct table[], int tableSize));
2732 static int utf_strnicmp __ARGS((char_u *s1, char_u *s2, size_t n1, size_t n2));
2671 2733
2672 /* 2734 /*
2673 * Generic conversion function for case operations. 2735 * Generic conversion function for case operations.
2674 * Return the converted equivalent of "a", which is a UCS-4 character. Use 2736 * Return the converted equivalent of "a", which is a UCS-4 character. Use
2675 * the given conversion "table". Uses binary search on "table". 2737 * the given conversion "table". Uses binary search on "table".
3077 int a; 3139 int a;
3078 { 3140 {
3079 return (utf_tolower(a) != a); 3141 return (utf_tolower(a) != a);
3080 } 3142 }
3081 3143
3144 static int
3145 utf_strnicmp(s1, s2, n1, n2)
3146 char_u *s1, *s2;
3147 size_t n1, n2;
3148 {
3149 int c1, c2, cdiff;
3150 char_u buffer[6];
3151
3152 for (;;)
3153 {
3154 c1 = utf_safe_read_char_adv(&s1, &n1);
3155 c2 = utf_safe_read_char_adv(&s2, &n2);
3156
3157 if (c1 <= 0 || c2 <= 0)
3158 break;
3159
3160 if (c1 == c2)
3161 continue;
3162
3163 cdiff = utf_fold(c1) - utf_fold(c2);
3164 if (cdiff != 0)
3165 return cdiff;
3166 }
3167
3168 /* some string ended or has an incomplete/illegal character sequence */
3169
3170 if (c1 == 0 || c2 == 0)
3171 {
3172 /* some string ended. shorter string is smaller */
3173 if (c1 == 0 && c2 == 0)
3174 return 0;
3175 return c1 == 0 ? -1 : 1;
3176 }
3177
3178 /* Continue with bytewise comparison to produce some result that
3179 * would make comparison operations involving this function transitive.
3180 *
3181 * If only one string had an error, comparison should be made with
3182 * folded version of the other string. In this case it is enough
3183 * to fold just one character to determine the result of comparison. */
3184
3185 if (c1 != -1 && c2 == -1)
3186 {
3187 n1 = utf_char2bytes(utf_fold(c1), buffer);
3188 s1 = buffer;
3189 }
3190 else if (c2 != -1 && c1 == -1)
3191 {
3192 n2 = utf_char2bytes(utf_fold(c2), buffer);
3193 s2 = buffer;
3194 }
3195
3196 while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL)
3197 {
3198 cdiff = (int)(*s1) - (int)(*s2);
3199 if (cdiff != 0)
3200 return cdiff;
3201
3202 s1++;
3203 s2++;
3204 n1--;
3205 n2--;
3206 }
3207
3208 if (n1 > 0 && *s1 == NUL)
3209 n1 = 0;
3210 if (n2 > 0 && *s2 == NUL)
3211 n2 = 0;
3212
3213 if (n1 == 0 && n2 == 0)
3214 return 0;
3215 return n1 == 0 ? -1 : 1;
3216 }
3217
3082 /* 3218 /*
3083 * Version of strnicmp() that handles multi-byte characters. 3219 * Version of strnicmp() that handles multi-byte characters.
3084 * Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can 3220 * Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can
3085 * probably use strnicmp(), because there are no ASCII characters in the 3221 * probably use strnicmp(), because there are no ASCII characters in the
3086 * second byte. 3222 * second byte.
3090 int 3226 int
3091 mb_strnicmp(s1, s2, nn) 3227 mb_strnicmp(s1, s2, nn)
3092 char_u *s1, *s2; 3228 char_u *s1, *s2;
3093 size_t nn; 3229 size_t nn;
3094 { 3230 {
3095 int i, j, l; 3231 int i, l;
3096 int cdiff; 3232 int cdiff;
3097 int incomplete = FALSE;
3098 int n = (int)nn; 3233 int n = (int)nn;
3099 3234
3100 for (i = 0; i < n; i += l) 3235 if (enc_utf8)
3101 { 3236 {
3102 if (s1[i] == NUL && s2[i] == NUL) /* both strings end */ 3237 return utf_strnicmp(s1, s2, nn, nn);
3103 return 0; 3238 }
3104 if (enc_utf8) 3239 else
3240 {
3241 for (i = 0; i < n; i += l)
3105 { 3242 {
3106 l = utf_byte2len(s1[i]); 3243 if (s1[i] == NUL && s2[i] == NUL) /* both strings end */
3107 if (l > n - i) 3244 return 0;
3108 { 3245
3109 l = n - i; /* incomplete character */
3110 incomplete = TRUE;
3111 }
3112 /* Check directly first, it's faster. */
3113 for (j = 0; j < l; ++j)
3114 {
3115 if (s1[i + j] != s2[i + j])
3116 break;
3117 if (s1[i + j] == 0)
3118 /* Both strings have the same bytes but are incomplete or
3119 * have illegal bytes, accept them as equal. */
3120 l = j;
3121 }
3122 if (j < l)
3123 {
3124 /* If one of the two characters is incomplete return -1. */
3125 if (incomplete || i + utf_byte2len(s2[i]) > n)
3126 return -1;
3127 /* Don't case-fold illegal bytes or truncated characters. */
3128 if (utf_ptr2len(s1 + i) < l || utf_ptr2len(s2 + i) < l)
3129 return -1;
3130 cdiff = utf_fold(utf_ptr2char(s1 + i))
3131 - utf_fold(utf_ptr2char(s2 + i));
3132 if (cdiff != 0)
3133 return cdiff;
3134 }
3135 }
3136 else
3137 {
3138 l = (*mb_ptr2len)(s1 + i); 3246 l = (*mb_ptr2len)(s1 + i);
3139 if (l <= 1) 3247 if (l <= 1)
3140 { 3248 {
3141 /* Single byte: first check normally, then with ignore case. */ 3249 /* Single byte: first check normally, then with ignore case. */
3142 if (s1[i] != s2[i]) 3250 if (s1[i] != s2[i])