vim: src/mbyte.c comparison

comparison src/mbyte.c @ 2961:c21429d7768c v7.3.253

Updated for version 7.3.253. Problem: "echo 'abc' > ''" returns 0 or 1, depending on 'ignorecase'. The checks in mb_strnicmp() for illegal and truncated bytes are wrong, and the code should not assume that the byte lengths of two characters are equal before case folding. Solution: Add utf_safe_read_char_adv() and utf_strnicmp(). Add a test for this. (Ivan Krasilnikov)

author	Bram Moolenaar <bram@vim.org>
date	Fri, 15 Jul 2011 21:16:59 +0200
parents	0bef86c5c985
children	25be7c9dda54

comparison

equal deleted inserted replaced

-:03443e1c0050
+:c21429d7768c
 static int dbcs_ptr2len_len __ARGS((char_u *p, int size));
 static int utf_ptr2cells_len __ARGS((char_u *p, int size));
 static int dbcs_char2cells __ARGS((int c));
 static int dbcs_ptr2cells_len __ARGS((char_u *p, int size));
 static int dbcs_ptr2char __ARGS((char_u *p));
+static int utf_safe_read_char_adv __ARGS((char_u **s, size_t *n));
 /*
 * Lookup table to quickly get the length in bytes of a UTF-8 character from
 * the first byte of a UTF-8 string.
 * Bytes which are illegal when used as the first byte have a 1.
 	    }
 	}
 }
 /* Illegal value, just return the first byte */
 return p[0];
+}
+/*
+* Convert a UTF-8 byte sequence to a wide character.
+* String is assumed to be terminated by NUL or after "n" bytes, whichever
+* comes first.
+* The function is safe in the sense that it never accesses memory beyond the
+* first "n" bytes of "s".
+*
+* "s" is the address of the read pointer and "n" the address of the count of
+* bytes still available; both are updated in place.
+*
+* On success, returns decoded codepoint, advances "s" to the beginning of
+* next character and decreases "n" accordingly.
+*
+* If end of string was reached, returns 0 and, if "n" > 0, advances "s" past
+* NUL byte (decreasing "n" by one).  When "n" is already zero nothing is
+* read and neither "s" nor "n" is changed.
+*
+* If byte sequence is illegal or incomplete, returns -1 and does not advance
+* "s" or change "n".
+*/
+static int
+utf_safe_read_char_adv(s, n)
+char_u      **s;
+size_t      *n;
+{
+int		c, k;	/* "c": decoded character, "k": utf8len_tab_zero[] entry */
+if (*n == 0) /* end of buffer */
+	return 0;
+k = utf8len_tab_zero[**s];	/* looked up from the first byte only */
+if (k == 1)
+{
+	/* ASCII character or NUL */
+	(*n)--;
+	return *(*s)++;	/* return the byte, then step the pointer past it */
+}
+if ((size_t)k <= *n)
+{
+	/* We have a multibyte sequence and it isn't truncated by buffer
+	 * limits so utf_ptr2char() is safe to use. Or the first byte is
+	 * illegal (k=0), and it's also safe to use utf_ptr2char(). */
+	c = utf_ptr2char(*s);
+	/* On failure, utf_ptr2char() returns the first byte, so here we
+	 * check equality with the first byte. The only non-ASCII character
+	 * which equals the first byte of its own UTF-8 representation is
+	 * U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
+	 * It's safe even if n=1, else we would have k=2 > n. */
+	if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83))
+	{
+	    /* byte sequence was successfully decoded */
+	    *s += k;	/* skip the whole sequence */
+	    *n -= k;	/* cannot underflow: k <= *n was checked above */
+	    return c;
+	}
+}
+/* byte sequence is incomplete or illegal */
+return -1;
 }
 /*
 * Get character at **pp and advance *pp to the next character.
 * Note: composing characters are skipped!
 	{0xa78b,0xa78b,-1,1},
 	{0xff21,0xff3a,1,32},
 	{0x10400,0x10427,1,40}
 };
-static int utf_convert(int a, convertStruct table[], int tableSize);
+static int utf_convert __ARGS((int a, convertStruct table[], int tableSize));
+static int utf_strnicmp __ARGS((char_u *s1, char_u *s2, size_t n1, size_t n2));
 /*
 * Generic conversion function for case operations.
 * Return the converted equivalent of "a", which is a UCS-4 character.  Use
 * the given conversion "table".  Uses binary search on "table".
 int		a;
 {
 return (utf_tolower(a) != a);
 }
+static int	/*
+* Compare the UTF-8 strings "s1" and "s2", ignoring case as defined by
+* utf_fold().  At most "n1" bytes of "s1" and "n2" bytes of "s2" are
+* used; a NUL byte ends a string earlier.
+*
+* Returns zero when the strings compare equal, a negative value when "s1"
+* compares lower than "s2" and a positive value when it compares higher.
+* A string that ends first compares lower.
+*
+* Characters are read with utf_safe_read_char_adv().  Once it reports an
+* illegal or incomplete sequence in either string (and the other string
+* did not end at that point), the remainder is compared byte by byte.
+*/
+utf_strnicmp(s1, s2, n1, n2)
+char_u      *s1, *s2;
+size_t      n1, n2;
+{
+int		c1, c2, cdiff;
+char_u	buffer[6];	/* receives the output of utf_char2bytes(); presumably 6 is its maximum length - confirm */
+for (;;)
+{
+	c1 = utf_safe_read_char_adv(&s1, &n1);
+	c2 = utf_safe_read_char_adv(&s2, &n2);
+	if (c1 <= 0 || c2 <= 0)	/* 0: end of a string, -1: bad sequence */
+	    break;
+	if (c1 == c2)
+	    continue;	/* identical codepoints need no folding */
+	cdiff = utf_fold(c1) - utf_fold(c2);
+	if (cdiff != 0)
+	    return cdiff;
+}
+/* some string ended or has an incomplete/illegal character sequence */
+if (c1 == 0 || c2 == 0)
+{
+	/* some string ended. shorter string is smaller */
+	if (c1 == 0 && c2 == 0)
+	    return 0;
+	return c1 == 0 ? -1 : 1;	/* also taken when the other string had a bad sequence */
+}
+/* Continue with bytewise comparison to produce some result that
+* would make comparison operations involving this function transitive.
+*
+* If only one string had an error, comparison should be made with
+* folded version of the other string. In this case it is enough
+* to fold just one character to determine the result of comparison.
+*
+* The string with the error still points at the offending bytes (the
+* reader does not advance on failure), while the other string has already
+* moved past its character; that character is folded and re-encoded into
+* "buffer" and compared from there. */
+if (c1 != -1 && c2 == -1)
+{
+	n1 = utf_char2bytes(utf_fold(c1), buffer);	/* "n1" now bounds the compare to "buffer" */
+	s1 = buffer;
+}
+else if (c2 != -1 && c1 == -1)
+{
+	n2 = utf_char2bytes(utf_fold(c2), buffer);	/* "n2" now bounds the compare to "buffer" */
+	s2 = buffer;
+}
+while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL)
+{
+	cdiff = (int)(*s1) - (int)(*s2);
+	if (cdiff != 0)
+	    return cdiff;
+	s1++;
+	s2++;
+	n1--;
+	n2--;
+}
+if (n1 > 0 && *s1 == NUL)
+	n1 = 0;	/* a NUL counts as the end of the string */
+if (n2 > 0 && *s2 == NUL)
+	n2 = 0;	/* a NUL counts as the end of the string */
+if (n1 == 0 && n2 == 0)
+	return 0;
+return n1 == 0 ? -1 : 1;	/* the string that ran out first is smaller */
+}
 /*
 * Version of strnicmp() that handles multi-byte characters.
 * Needed for Big5, Shift-JIS and UTF-8 encoding.  Other DBCS encodings can
 * probably use strnicmp(), because there are no ASCII characters in the
 * second byte.
 int
 mb_strnicmp(s1, s2, nn)
 char_u	*s1, *s2;
 size_t	nn;
 {
-int		i, j, l;
+int		i, l;
 int		cdiff;
-int		incomplete = FALSE;
 int		n = (int)nn;
-for (i = 0; i < n; i += l)
+if (enc_utf8)
 {
-	if (s1[i] == NUL && s2[i] == NUL)   /* both strings end */
+	return utf_strnicmp(s1, s2, nn, nn);
-	    return 0;
+}
-	if (enc_utf8)
+else
+{
+	for (i = 0; i < n; i += l)
 	{
-	    l = utf_byte2len(s1[i]);
+	    if (s1[i] == NUL && s2[i] == NUL)	/* both strings end */
-	    if (l > n - i)
+		return 0;
-	    {
-		l = n - i;		    /* incomplete character */
-		incomplete = TRUE;
-	    }
-	    /* Check directly first, it's faster. */
-	    for (j = 0; j < l; ++j)
-	    {
-		if (s1[i + j] != s2[i + j])
-		    break;
-		if (s1[i + j] == 0)
-		    /* Both stings have the same bytes but are incomplete or
-		     * have illegal bytes, accept them as equal. */
-		    l = j;
-	    }
-	    if (j < l)
-	    {
-		/* If one of the two characters is incomplete return -1. */
-		if (incomplete || i + utf_byte2len(s2[i]) > n)
-		    return -1;
-		/* Don't case-fold illegal bytes or truncated characters. */
-		if (utf_ptr2len(s1 + i) < l || utf_ptr2len(s2 + i) < l)
-		    return -1;
-		cdiff = utf_fold(utf_ptr2char(s1 + i))
-					     - utf_fold(utf_ptr2char(s2 + i));
-		if (cdiff != 0)
-		    return cdiff;
-	    }
-	}
-	else
-	{
 	    l = (*mb_ptr2len)(s1 + i);
 	    if (l <= 1)
 	    {
 		/* Single byte: first check normally, then with ignore case. */
 		if (s1[i] != s2[i])

Mercurial > vim

comparison src/mbyte.c @ 2961:c21429d7768c v7.3.253