Mercurial > vim

--- a/runtime/doc/version9.txt
+++ b/runtime/doc/version9.txt
@@ -1,4 +1,4 @@
-*version9.txt*  For Vim version 9.1.  Last change: 2024 Jul 28
+*version9.txt*  For Vim version 9.1.  Last change: 2024 Jul 30


 		  VIM REFERENCE MANUAL    by Bram Moolenaar
@@ -41590,6 +41590,8 @@ Changed~
   behaviour/inconsistency (see |d-special| and |cw|).
 - allow to specify additional attributes in the completion menu (allows to
   mark deprecated attributes from LSP server) |complete-items|
+- the regex engines now correctly match multi-byte characters when ignoring
+  case (and apply proper case folding)

 							*added-9.2*
 Added ~
--- a/src/mbyte.c
+++ b/src/mbyte.c
@@ -3801,6 +3801,15 @@ utf_strnicmp(
  * two characters otherwise.
  */
     int
+mb_strnicmp2(char_u *s1, char_u *s2, size_t n1, size_t n2)
+{
+    // differing byte lengths are only handled for UTF-8; else compare n1 bytes
+    if (enc_utf8 && n1 != n2)
+	return utf_strnicmp(s1, s2, n1, n2);
+    return mb_strnicmp(s1, s2, n1);
+}
+
+    int
 mb_strnicmp(char_u *s1, char_u *s2, size_t nn)
 {
     int		i, l;
--- a/src/proto/mbyte.pro
+++ b/src/proto/mbyte.pro
@@ -48,6 +48,7 @@ int utf_islower(int a);
 int utf_tolower(int a);
 int utf_isupper(int a);
 int mb_strnicmp(char_u *s1, char_u *s2, size_t nn);
+int mb_strnicmp2(char_u *s1, char_u *s2, size_t n1, size_t n2);
 void show_utf8(void);
 int latin_head_off(char_u *base, char_u *p);
 int dbcs_screen_head_off(char_u *base, char_u *p);
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -1729,7 +1729,9 @@ mb_decompose(int c, int *c1, int *c2, in
 /*
  * Compare two strings, ignore case if rex.reg_ic set.
  * Return 0 if strings match, non-zero otherwise.
- * Correct the length "*n" when composing characters are ignored.
+ * Correct the length "*n" when composing characters are ignored,
+ * or for UTF-8 when two code points are considered equal because of
+ * case-folding but have different byte lengths (e.g. 's' and 'ſ').
  */
     static int
 cstrncmp(char_u *s1, char_u *s2, int *n)
@@ -1738,6 +1740,29 @@ cstrncmp(char_u *s1, char_u *s2, int *n)

     if (!rex.reg_ic)
 	result = STRNCMP(s1, s2, *n);
+    else if (enc_utf8)
+    {
+	char_u *p = s1;
+	size_t n2 = 0;
+	int n1 = *n;
+	// count the number of characters in the first "*n" bytes of s1
+	while (n1 > 0 && *p != NUL)
+	{
+	    n1 -= mb_ptr2len(p);    // length of the char at "p", not of s1[0]
+	    MB_PTR_ADV(p);
+	    n2++;
+	}
+	// count the number of bytes to advance the same number of chars for s2
+	p = s2;
+	while (n2-- > 0 && *p != NUL)
+	    MB_PTR_ADV(p);

+	n2 = p - s2;	// byte length of the text in s2 to compare against

+	result = MB_STRNICMP2(s1, s2, *n, n2);
+	if (result == 0 && (int)n2 < *n)
+	    *n = n2;	// matched text in s2 is shorter: report its length
+    }
     else
 	result = MB_STRNICMP(s1, s2, *n);

@@ -1787,7 +1812,7 @@ cstrncmp(char_u *s1, char_u *s2, int *n)
 cstrchr(char_u *s, int c)
 {
     char_u	*p;
-    int		cc;
+    int		cc, lc;

     if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
 	return vim_strchr(s, c);
@@ -1796,26 +1821,35 @@ cstrchr(char_u *s, int c)
     // faster (esp. when using MS Visual C++!).
     // For UTF-8 need to use folded case.
     if (enc_utf8 && c > 0x80)
+    {
 	cc = utf_fold(c);
+	lc = cc;
+    }
     else
-	 if (MB_ISUPPER(c))
-	cc = MB_TOLOWER(c);
-    else if (MB_ISLOWER(c))
-	cc = MB_TOUPPER(c);
-    else
-	return vim_strchr(s, c);
+	if (MB_ISUPPER(c))
+	{
+	    cc = MB_TOLOWER(c);
+	    lc = cc;
+	}
+	else if (MB_ISLOWER(c))
+	{
+	    cc = MB_TOUPPER(c);
+	    lc = c;
+	}
+	else
+	    return vim_strchr(s, c);

     if (has_mbyte)
     {
 	for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
 	{
-	    if (enc_utf8 && c > 0x80)
+	    int uc = utf_ptr2char(p);
+	    if (enc_utf8 && (c > 0x80 || uc > 0x80))
 	    {
-		int uc = utf_ptr2char(p);
-
 		// Do not match an illegal byte.  E.g. 0xff matches 0xc3 0xbf,
 		// not 0xff.
-		if ((uc < 0x80 || uc != *p) && utf_fold(uc) == cc)
+		// compare with lower case of the character
+		if ((uc < 0x80 || uc != *p) && utf_fold(uc) == lc)
 		    return p;
 	    }
 	    else if (*p == c || *p == cc)
--- a/src/regexp_bt.c
+++ b/src/regexp_bt.c
@@ -3823,6 +3823,14 @@ regmatch(
 			}
 		    }
 		}
+		else if (enc_utf8)
+		{
+		    if (cstrncmp(opnd, rex.input, &len) != 0)
+		    {
+			status = RA_NOMATCH;
+			break;
+		    }
+		}
 		else
 		    for (i = 0; i < len; ++i)
 			if (opnd[i] != rex.input[i])
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -5666,7 +5666,12 @@ find_match_text(colnr_T *startcol, int r
     for (;;)
     {
 	match = TRUE;
-	len2 = MB_CHAR2LEN(regstart); // skip regstart
+	// skip regstart
+	len2 = MB_CHAR2LEN(regstart);
+	if (enc_utf8 && len2 > 1 && MB_CHAR2LEN(PTR2CHAR(rex.line + col)) != len2)
+	    // because of case-folding of the previously matched text, we may need
+	    // to skip fewer bytes than mb_char2len(regstart)
+	    len2 = mb_char2len(utf_fold(regstart));
 	for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
 	{
 	    c1 = PTR2CHAR(match_text + len1);
@@ -7502,7 +7507,7 @@ nfa_regexec_both(

 	// If match_text is set it contains the full text that must match.
 	// Nothing else to try. Doesn't handle combining chars well.
-	if (prog->match_text != NULL && !rex.reg_icombine)
+	if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine)
 	{
 	    retval = find_match_text(&col, prog->regstart, prog->match_text);
 	    if (REG_MULTI)
--- a/src/testdir/test_regexp_utf8.vim
+++ b/src/testdir/test_regexp_utf8.vim
@@ -587,4 +587,36 @@ func Test_combining_chars_in_collection(
   bw!
 endfunc

+func Test_search_multibyte_match_ascii()
+  new
+  " When ignoring case a single 'ſ' (U+017F) must also match ASCII 's'
+  call setline(1,  'das abc heraus abc ſich abc ſind')
+  for i in range(0, 2)
+    exe "set re="..i
+    let ic_match = matchbufline('%', '\c\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match = matchbufline('%', '\C\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    call assert_equal(['s', 's', 'ſ','ſ'], ic_match, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſ','ſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re)
+  endfor
+  " Same for 'ſſ' and 'ss': literal sequence, \+ multi and [] collection
+  call setline(1,  'das abc herauss abc ſſich abc ſind')
+  for i in range(0, 2)
+    exe "set re="..i
+    let ic_match = matchbufline('%', '\c\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match = matchbufline('%', '\C\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let ic_match2 = matchbufline('%', '\c\%u17f\+', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match2 = matchbufline('%', '\C\%u17f\+', 1, '$')->mapnew({idx, val -> val.text})
+    let ic_match3 = matchbufline('%', '\c[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match3 = matchbufline('%', '\C[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text})
+    call assert_equal(['ss', 'ſſ'], ic_match, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match2, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſſ','ſ'], noic_match2, "No-Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match3, "Ignorecase Collection Regex-engine: " .. &re)
+    call assert_equal(['ſſ','ſ'], noic_match3, "No-Ignorecase Collection Regex-engine: " .. &re)
+  endfor
+  set re&
+  bw!
+endfunc
+
 " vim: shiftwidth=2 sts=2 expandtab
--- a/src/version.c
+++ b/src/version.c
@@ -705,6 +705,8 @@ static char *(features[]) =
 static int included_patches[] =
 {   /* Add new patch number below this line */
 /**/
+    645,
+/**/
     644,
 /**/
     643,
--- a/src/vim.h
+++ b/src/vim.h
@@ -1769,6 +1769,7 @@ void *vim_memset(void *, int, size_t);

 # define MB_STRICMP(d, s)	mb_strnicmp((char_u *)(d), (char_u *)(s), (int)MAXCOL)
 # define MB_STRNICMP(d, s, n)	mb_strnicmp((char_u *)(d), (char_u *)(s), (int)(n))
+# define MB_STRNICMP2(d, s, n1, n2)	mb_strnicmp2((char_u *)(d), (char_u *)(s), (size_t)(n1), (size_t)(n2))

 #define STRCAT(d, s)	    strcat((char *)(d), (char *)(s))
 #define STRNCAT(d, s, n)    strncat((char *)(d), (char *)(s), (size_t)(n))