Mercurial > vim
comparison src/mbyte.c @ 2961:c21429d7768c v7.3.253
updated for version 7.3.253
Problem: "echo 'abc' > ''" returns 0 or 1, depending on 'ignorecase'.
Checks in mb_strnicmp() for illegal and truncated bytes are
wrong. Should not assume that byte length is equal before case
folding.
Solution: Add utf_safe_read_char_adv() and utf_strnicmp(). Add a test for
this. (Ivan Krasilnikov)
author | Bram Moolenaar <bram@vim.org> |
---|---|
date | Fri, 15 Jul 2011 21:16:59 +0200 |
parents | 0bef86c5c985 |
children | 25be7c9dda54 |
comparison
equal
deleted
inserted
replaced
2960:03443e1c0050 | 2961:c21429d7768c |
---|---|
130 static int dbcs_ptr2len_len __ARGS((char_u *p, int size)); | 130 static int dbcs_ptr2len_len __ARGS((char_u *p, int size)); |
131 static int utf_ptr2cells_len __ARGS((char_u *p, int size)); | 131 static int utf_ptr2cells_len __ARGS((char_u *p, int size)); |
132 static int dbcs_char2cells __ARGS((int c)); | 132 static int dbcs_char2cells __ARGS((int c)); |
133 static int dbcs_ptr2cells_len __ARGS((char_u *p, int size)); | 133 static int dbcs_ptr2cells_len __ARGS((char_u *p, int size)); |
134 static int dbcs_ptr2char __ARGS((char_u *p)); | 134 static int dbcs_ptr2char __ARGS((char_u *p)); |
135 static int utf_safe_read_char_adv __ARGS((char_u **s, size_t *n)); | |
135 | 136 |
136 /* | 137 /* |
137 * Lookup table to quickly get the length in bytes of a UTF-8 character from | 138 * Lookup table to quickly get the length in bytes of a UTF-8 character from |
138 * the first byte of a UTF-8 string. | 139 * the first byte of a UTF-8 string. |
139 * Bytes which are illegal when used as the first byte have a 1. | 140 * Bytes which are illegal when used as the first byte have a 1. |
1696 } | 1697 } |
1697 } | 1698 } |
1698 } | 1699 } |
1699 /* Illegal value, just return the first byte */ | 1700 /* Illegal value, just return the first byte */ |
1700 return p[0]; | 1701 return p[0]; |
1702 } | |
1703 | |
1704 /* | |
1705 * Convert a UTF-8 byte sequence to a wide character. | |
1706 * String is assumed to be terminated by NUL or after "n" bytes, whichever | |
1707 * comes first. | |
1708 * The function is safe in the sense that it never accesses memory beyond the | |
1709 * first "n" bytes of "s". | |
1710 * | |
1711 * On success, returns decoded codepoint, advances "s" to the beginning of | |
1712 * next character and decreases "n" accordingly. | |
1713 * | |
1714 * If end of string was reached, returns 0 and, if "n" > 0, advances "s" past | |
1715 * NUL byte. | |
1716 * | |
1717 * If byte sequence is illegal or incomplete, returns -1 and does not advance | |
1718 * "s". | |
1719 */ | |
1720 static int | |
1721 utf_safe_read_char_adv(s, n) | |
1722 char_u **s; | |
1723 size_t *n; | |
1724 { | |
1725 int c, k; | |
1726 | |
1727 if (*n == 0) /* end of buffer */ | |
1728 return 0; | |
1729 | |
1730 k = utf8len_tab_zero[**s]; | |
1731 | |
1732 if (k == 1) | |
1733 { | |
1734 /* ASCII character or NUL */ | |
1735 (*n)--; | |
1736 return *(*s)++; | |
1737 } | |
1738 | |
1739 if ((size_t)k <= *n) | |
1740 { | |
1741 /* We have a multibyte sequence and it isn't truncated by buffer | |
1742 * limits so utf_ptr2char() is safe to use. Or the first byte is | |
1743 * illegal (k=0), and it's also safe to use utf_ptr2char(). */ | |
1744 c = utf_ptr2char(*s); | |
1745 | |
1746 /* On failure, utf_ptr2char() returns the first byte, so here we | |
1747 * check equality with the first byte. The only non-ASCII character | |
1748 * which equals the first byte of its own UTF-8 representation is | |
1749 * U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too. | |
1750 * It's safe even if n=1, else we would have k=2 > n. */ | |
1751 if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) | |
1752 { | |
1753 /* byte sequence was successfully decoded */ | |
1754 *s += k; | |
1755 *n -= k; | |
1756 return c; | |
1757 } | |
1758 } | |
1759 | |
1760 /* byte sequence is incomplete or illegal */ | |
1761 return -1; | |
1701 } | 1762 } |
1702 | 1763 |
1703 /* | 1764 /* |
1704 * Get character at **pp and advance *pp to the next character. | 1765 * Get character at **pp and advance *pp to the next character. |
1705 * Note: composing characters are skipped! | 1766 * Note: composing characters are skipped! |
2665 {0xa78b,0xa78b,-1,1}, | 2726 {0xa78b,0xa78b,-1,1}, |
2666 {0xff21,0xff3a,1,32}, | 2727 {0xff21,0xff3a,1,32}, |
2667 {0x10400,0x10427,1,40} | 2728 {0x10400,0x10427,1,40} |
2668 }; | 2729 }; |
2669 | 2730 |
2670 static int utf_convert(int a, convertStruct table[], int tableSize); | 2731 static int utf_convert __ARGS((int a, convertStruct table[], int tableSize)); |
2732 static int utf_strnicmp __ARGS((char_u *s1, char_u *s2, size_t n1, size_t n2)); | |
2671 | 2733 |
2672 /* | 2734 /* |
2673 * Generic conversion function for case operations. | 2735 * Generic conversion function for case operations. |
2674 * Return the converted equivalent of "a", which is a UCS-4 character. Use | 2736 * Return the converted equivalent of "a", which is a UCS-4 character. Use |
2675 * the given conversion "table". Uses binary search on "table". | 2737 * the given conversion "table". Uses binary search on "table". |
3077 int a; | 3139 int a; |
3078 { | 3140 { |
3079 return (utf_tolower(a) != a); | 3141 return (utf_tolower(a) != a); |
3080 } | 3142 } |
3081 | 3143 |
3144 static int | |
3145 utf_strnicmp(s1, s2, n1, n2) | |
3146 char_u *s1, *s2; | |
3147 size_t n1, n2; | |
3148 { | |
3149 int c1, c2, cdiff; | |
3150 char_u buffer[6]; | |
3151 | |
3152 for (;;) | |
3153 { | |
3154 c1 = utf_safe_read_char_adv(&s1, &n1); | |
3155 c2 = utf_safe_read_char_adv(&s2, &n2); | |
3156 | |
3157 if (c1 <= 0 || c2 <= 0) | |
3158 break; | |
3159 | |
3160 if (c1 == c2) | |
3161 continue; | |
3162 | |
3163 cdiff = utf_fold(c1) - utf_fold(c2); | |
3164 if (cdiff != 0) | |
3165 return cdiff; | |
3166 } | |
3167 | |
3168 /* some string ended or has an incomplete/illegal character sequence */ | |
3169 | |
3170 if (c1 == 0 || c2 == 0) | |
3171 { | |
3172 /* some string ended. shorter string is smaller */ | |
3173 if (c1 == 0 && c2 == 0) | |
3174 return 0; | |
3175 return c1 == 0 ? -1 : 1; | |
3176 } | |
3177 | |
3178 /* Continue with bytewise comparison to produce some result that | |
3179 * would make comparison operations involving this function transitive. | |
3180 * | |
3181 * If only one string had an error, comparison should be made with | |
3182 * folded version of the other string. In this case it is enough | |
3183 * to fold just one character to determine the result of comparison. */ | |
3184 | |
3185 if (c1 != -1 && c2 == -1) | |
3186 { | |
3187 n1 = utf_char2bytes(utf_fold(c1), buffer); | |
3188 s1 = buffer; | |
3189 } | |
3190 else if (c2 != -1 && c1 == -1) | |
3191 { | |
3192 n2 = utf_char2bytes(utf_fold(c2), buffer); | |
3193 s2 = buffer; | |
3194 } | |
3195 | |
3196 while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) | |
3197 { | |
3198 cdiff = (int)(*s1) - (int)(*s2); | |
3199 if (cdiff != 0) | |
3200 return cdiff; | |
3201 | |
3202 s1++; | |
3203 s2++; | |
3204 n1--; | |
3205 n2--; | |
3206 } | |
3207 | |
3208 if (n1 > 0 && *s1 == NUL) | |
3209 n1 = 0; | |
3210 if (n2 > 0 && *s2 == NUL) | |
3211 n2 = 0; | |
3212 | |
3213 if (n1 == 0 && n2 == 0) | |
3214 return 0; | |
3215 return n1 == 0 ? -1 : 1; | |
3216 } | |
3217 | |
3082 /* | 3218 /* |
3083 * Version of strnicmp() that handles multi-byte characters. | 3219 * Version of strnicmp() that handles multi-byte characters. |
3084 * Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can | 3220 * Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can |
3085 * probably use strnicmp(), because there are no ASCII characters in the | 3221 * probably use strnicmp(), because there are no ASCII characters in the |
3086 * second byte. | 3222 * second byte. |
3090 int | 3226 int |
3091 mb_strnicmp(s1, s2, nn) | 3227 mb_strnicmp(s1, s2, nn) |
3092 char_u *s1, *s2; | 3228 char_u *s1, *s2; |
3093 size_t nn; | 3229 size_t nn; |
3094 { | 3230 { |
3095 int i, j, l; | 3231 int i, l; |
3096 int cdiff; | 3232 int cdiff; |
3097 int incomplete = FALSE; | |
3098 int n = (int)nn; | 3233 int n = (int)nn; |
3099 | 3234 |
3100 for (i = 0; i < n; i += l) | 3235 if (enc_utf8) |
3101 { | 3236 { |
3102 if (s1[i] == NUL && s2[i] == NUL) /* both strings end */ | 3237 return utf_strnicmp(s1, s2, nn, nn); |
3103 return 0; | 3238 } |
3104 if (enc_utf8) | 3239 else |
3240 { | |
3241 for (i = 0; i < n; i += l) | |
3105 { | 3242 { |
3106 l = utf_byte2len(s1[i]); | 3243 if (s1[i] == NUL && s2[i] == NUL) /* both strings end */ |
3107 if (l > n - i) | 3244 return 0; |
3108 { | 3245 |
3109 l = n - i; /* incomplete character */ | |
3110 incomplete = TRUE; | |
3111 } | |
3112 /* Check directly first, it's faster. */ | |
3113 for (j = 0; j < l; ++j) | |
3114 { | |
3115 if (s1[i + j] != s2[i + j]) | |
3116 break; | |
3117 if (s1[i + j] == 0) | |
3118 /* Both strings have the same bytes but are incomplete or | |
3119 * have illegal bytes, accept them as equal. */ | |
3120 l = j; | |
3121 } | |
3122 if (j < l) | |
3123 { | |
3124 /* If one of the two characters is incomplete return -1. */ | |
3125 if (incomplete || i + utf_byte2len(s2[i]) > n) | |
3126 return -1; | |
3127 /* Don't case-fold illegal bytes or truncated characters. */ | |
3128 if (utf_ptr2len(s1 + i) < l || utf_ptr2len(s2 + i) < l) | |
3129 return -1; | |
3130 cdiff = utf_fold(utf_ptr2char(s1 + i)) | |
3131 - utf_fold(utf_ptr2char(s2 + i)); | |
3132 if (cdiff != 0) | |
3133 return cdiff; | |
3134 } | |
3135 } | |
3136 else | |
3137 { | |
3138 l = (*mb_ptr2len)(s1 + i); | 3246 l = (*mb_ptr2len)(s1 + i); |
3139 if (l <= 1) | 3247 if (l <= 1) |
3140 { | 3248 { |
3141 /* Single byte: first check normally, then with ignore case. */ | 3249 /* Single byte: first check normally, then with ignore case. */ |
3142 if (s1[i] != s2[i]) | 3250 if (s1[i] != s2[i]) |