diff src/search.c @ 22689:f8bf2c122452 v8.2.1893

patch 8.2.1893: fuzzy matching does not support multiple words Commit: https://github.com/vim/vim/commit/8ded5b647aa4b3338da721b343e0bce0f86655f6 Author: Bram Moolenaar <Bram@vim.org> Date: Fri Oct 23 16:50:30 2020 +0200 patch 8.2.1893: fuzzy matching does not support multiple words Problem: Fuzzy matching does not support multiple words. Solution: Add support for matching white space separated words. (Yegappan Lakshmanan, closes #7163)
author Bram Moolenaar <Bram@vim.org>
date Fri, 23 Oct 2020 17:00:04 +0200
parents 0dd527d9c62d
children e82579016863
line wrap: on
line diff
--- a/src/search.c
+++ b/src/search.c
@@ -4203,16 +4203,16 @@ the_end:
  * Ported from the lib_fts library authored by Forrest Smith.
  * https://github.com/forrestthewoods/lib_fts/tree/master/code
  *
- * Blog describing the algorithm:
+ * The following blog describes the fuzzy matching algorithm:
  * https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/
  *
  * Each matching string is assigned a score. The following factors are checked:
- *   Matched letter
- *   Unmatched letter
- *   Consecutively matched letters
- *   Proximity to start
- *   Letter following a separator (space, underscore)
- *   Uppercase letter following lowercase (aka CamelCase)
+ *   - Matched letter
+ *   - Unmatched letter
+ *   - Consecutively matched letters
+ *   - Proximity to start
+ *   - Letter following a separator (space, underscore)
+ *   - Uppercase letter following lowercase (aka CamelCase)
  *
  * Matched letters are good. Unmatched letters are bad. Matching near the start
  * is good. Matching the first letter in the middle of a phrase is good.
@@ -4222,16 +4222,17 @@ the_end:
  * File paths are different from file names. File extensions may be ignorable.
  * Single words care about consecutive matches but not separators or camel
  * case.
- *   Score starts at 0
+ *   Score starts at 100
  *   Matched letter: +0 points
  *   Unmatched letter: -1 point
- *   Consecutive match bonus: +5 points
- *   Separator bonus: +10 points
- *   Camel case bonus: +10 points
- *   Unmatched leading letter: -3 points (max: -9)
+ *   Consecutive match bonus: +15 points
+ *   First letter bonus: +15 points
+ *   Separator bonus: +30 points
+ *   Camel case bonus: +30 points
+ *   Unmatched leading letter: -5 points (max: -15)
  *
  * There is some nuance to this. Scores don’t have an intrinsic meaning. The
- * score range isn’t 0 to 100. It’s roughly [-50, 50]. Longer words have a
+ * score range isn’t 0 to 100. It’s roughly [50, 150]. Longer words have a
  * lower minimum score due to unmatched letter penalty. Longer search patterns
  * have a higher maximum score due to match bonuses.
  *
@@ -4247,6 +4248,7 @@ the_end:
  */
 typedef struct
 {
+    int		idx;		// used for stable sort
     listitem_T	*item;
     int		score;
     list_T	*lmatchpos;
@@ -4267,6 +4269,8 @@ typedef struct
 #define MAX_LEADING_LETTER_PENALTY -15
 // penalty for every letter that doesn't match
 #define UNMATCHED_LETTER_PENALTY -1
+// penalty for gap in matching positions (-2 * k)
+#define GAP_PENALTY	-2
 // Score for a string that doesn't fuzzy match the pattern
 #define SCORE_NONE	-9999
 
@@ -4319,6 +4323,8 @@ fuzzy_match_compute_score(
 	    // Sequential
 	    if (currIdx == (prevIdx + 1))
 		score += SEQUENTIAL_BONUS;
+	    else
+		score += GAP_PENALTY * (currIdx - prevIdx);
 	}
 
 	// Check for bonuses based on neighbor character value
@@ -4334,7 +4340,7 @@ fuzzy_match_compute_score(
 		while (sidx < currIdx)
 		{
 		    neighbor = (*mb_ptr2char)(p);
-		    (void)mb_ptr2char_adv(&p);
+		    MB_PTR_ADV(p);
 		    sidx++;
 		}
 		curr = (*mb_ptr2char)(p);
@@ -4362,6 +4368,10 @@ fuzzy_match_compute_score(
     return score;
 }
 
+/*
+ * Perform a recursive search for fuzzy matching 'fuzpat' in 'str'.
+ * Return the number of matching characters.
+ */
     static int
 fuzzy_match_recursive(
 	char_u		*fuzpat,
@@ -4386,11 +4396,11 @@ fuzzy_match_recursive(
     // Count recursions
     ++*recursionCount;
     if (*recursionCount >= FUZZY_MATCH_RECURSION_LIMIT)
-	return FALSE;
+	return 0;
 
     // Detect end of strings
     if (*fuzpat == '\0' || *str == '\0')
-	return FALSE;
+	return 0;
 
     // Loop through fuzpat and str looking for a match
     first_match = TRUE;
@@ -4411,7 +4421,7 @@ fuzzy_match_recursive(
 
 	    // Supplied matches buffer was too short
 	    if (nextMatch >= maxMatches)
-		return FALSE;
+		return 0;
 
 	    // "Copy-on-Write" srcMatches into matches
 	    if (first_match && srcMatches)
@@ -4444,12 +4454,12 @@ fuzzy_match_recursive(
 	    // Advance
 	    matches[nextMatch++] = strIdx;
 	    if (has_mbyte)
-		(void)mb_ptr2char_adv(&fuzpat);
+		MB_PTR_ADV(fuzpat);
 	    else
 		++fuzpat;
 	}
 	if (has_mbyte)
-	    (void)mb_ptr2char_adv(&str);
+	    MB_PTR_ADV(str);
 	else
 	    ++str;
 	strIdx++;
@@ -4469,12 +4479,12 @@ fuzzy_match_recursive(
 	// Recursive score is better than "this"
 	memcpy(matches, bestRecursiveMatches, maxMatches * sizeof(matches[0]));
 	*outScore = bestRecursiveScore;
-	return TRUE;
+	return nextMatch;
     }
     else if (matched)
-	return TRUE;		// "this" score is better than recursive
-
-    return FALSE;		// no match
+	return nextMatch;	// "this" score is better than recursive
+
+    return 0;		// no match
 }
 
 /*
@@ -4485,45 +4495,110 @@ fuzzy_match_recursive(
  * Scores values have no intrinsic meaning.  Possible score range is not
  * normalized and varies with pattern.
  * Recursion is limited internally (default=10) to prevent degenerate cases
- * (fuzpat="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").
+ * (pat_arg="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").
  * Uses char_u for match indices. Therefore patterns are limited to MAXMATCHES
  * characters.
  *
- * Returns TRUE if 'fuzpat' matches 'str'. Also returns the match score in
+ * Returns TRUE if 'pat_arg' matches 'str'. Also returns the match score in
  * 'outScore' and the matching character positions in 'matches'.
  */
     static int
 fuzzy_match(
 	char_u		*str,
-	char_u		*fuzpat,
+	char_u		*pat_arg,
+	int		matchseq,
 	int		*outScore,
 	matchidx_T	*matches,
 	int		maxMatches)
 {
     int		recursionCount = 0;
     int		len = MB_CHARLEN(str);
+    char_u	*save_pat;
+    char_u	*pat;
+    char_u	*p;
+    int		complete = FALSE;
+    int		score = 0;
+    int		numMatches = 0;
+    int		matchCount;
 
     *outScore = 0;
 
-    return fuzzy_match_recursive(fuzpat, str, 0, outScore, str, len, NULL,
-	    matches, maxMatches, 0, &recursionCount);
+    save_pat = vim_strsave(pat_arg);
+    if (save_pat == NULL)
+	return FALSE;
+    pat = save_pat;
+    p = pat;
+
+    // Try matching each word in 'pat_arg' in 'str'
+    while (TRUE)
+    {
+	if (matchseq)
+	    complete = TRUE;
+	else
+	{
+	    // Extract one word from the pattern (separated by space)
+	    p = skipwhite(p);
+	    if (*p == NUL)
+		break;
+	    pat = p;
+	    while (*p != NUL && !VIM_ISWHITE(PTR2CHAR(p)))
+	    {
+		if (has_mbyte)
+		    MB_PTR_ADV(p);
+		else
+		    ++p;
+	    }
+	    if (*p == NUL)		// processed all the words
+		complete = TRUE;
+	    *p = NUL;
+	}
+
+	score = 0;
+	recursionCount = 0;
+	matchCount = fuzzy_match_recursive(pat, str, 0, &score, str, len, NULL,
+				matches + numMatches, maxMatches - numMatches,
+				0, &recursionCount);
+	if (matchCount == 0)
+	{
+	    numMatches = 0;
+	    break;
+	}
+
+	// Accumulate the match score and the number of matches
+	*outScore += score;
+	numMatches += matchCount;
+
+	if (complete)
+	    break;
+
+	// try matching the next word
+	++p;
+    }
+
+    vim_free(save_pat);
+    return numMatches != 0;
 }
 
 /*
  * Sort the fuzzy matches in the descending order of the match score.
+ * For items with same score, retain the order using the index (stable sort)
  */
     static int
-fuzzy_item_compare(const void *s1, const void *s2)
+fuzzy_match_item_compare(const void *s1, const void *s2)
 {
     int		v1 = ((fuzzyItem_T *)s1)->score;
     int		v2 = ((fuzzyItem_T *)s2)->score;
-
-    return v1 == v2 ? 0 : v1 > v2 ? -1 : 1;
+    int		idx1 = ((fuzzyItem_T *)s1)->idx;
+    int		idx2 = ((fuzzyItem_T *)s2)->idx;
+
+    return v1 == v2 ? (idx1 - idx2) : v1 > v2 ? -1 : 1;
 }
 
 /*
  * Fuzzy search the string 'str' in a list of 'items' and return the matching
  * strings in 'fmatchlist'.
+ * If 'matchseq' is TRUE, then for multi-word search strings, match all the
+ * words in sequence.
  * If 'items' is a list of strings, then search for 'str' in the list.
  * If 'items' is a list of dicts, then either use 'key' to lookup the string
  * for each item or use 'item_cb' Funcref function to get the string.
@@ -4531,9 +4606,10 @@ fuzzy_item_compare(const void *s1, const
  * matches for each item.
  */
     static void
-match_fuzzy(
+fuzzy_match_in_list(
 	list_T		*items,
 	char_u		*str,
+	int		matchseq,
 	char_u		*key,
 	callback_T	*item_cb,
 	int		retmatchpos,
@@ -4561,6 +4637,7 @@ match_fuzzy(
 	char_u		*itemstr;
 	typval_T	rettv;
 
+	ptrs[i].idx = i;
 	ptrs[i].item = li;
 	ptrs[i].score = SCORE_NONE;
 	itemstr = NULL;
@@ -4593,25 +4670,34 @@ match_fuzzy(
 	}
 
 	if (itemstr != NULL
-		&& fuzzy_match(itemstr, str, &score, matches,
+		&& fuzzy_match(itemstr, str, matchseq, &score, matches,
 		    sizeof(matches) / sizeof(matches[0])))
 	{
 	    // Copy the list of matching positions in itemstr to a list, if
 	    // 'retmatchpos' is set.
 	    if (retmatchpos)
 	    {
-		int	j;
-		int	strsz;
+		int	j = 0;
+		char_u	*p;
 
 		ptrs[i].lmatchpos = list_alloc();
 		if (ptrs[i].lmatchpos == NULL)
 		    goto done;
-		strsz = MB_CHARLEN(str);
-		for (j = 0; j < strsz; j++)
+
+		p = str;
+		while (*p != NUL)
 		{
-		    if (list_append_number(ptrs[i].lmatchpos,
-				matches[j]) == FAIL)
-			goto done;
+		    if (!VIM_ISWHITE(PTR2CHAR(p)))
+		    {
+			if (list_append_number(ptrs[i].lmatchpos,
+				    matches[j]) == FAIL)
+			    goto done;
+			j++;
+		    }
+		    if (has_mbyte)
+			MB_PTR_ADV(p);
+		    else
+			++p;
 		}
 	    }
 	    ptrs[i].score = score;
@@ -4627,7 +4713,7 @@ match_fuzzy(
 
 	// Sort the list by the descending order of the match score
 	qsort((void *)ptrs, (size_t)len, sizeof(fuzzyItem_T),
-		fuzzy_item_compare);
+		fuzzy_match_item_compare);
 
 	// For matchfuzzy(), return a list of matched strings.
 	//	    ['str1', 'str2', 'str3']
@@ -4687,6 +4773,7 @@ do_fuzzymatch(typval_T *argvars, typval_
     callback_T	cb;
     char_u	*key = NULL;
     int		ret;
+    int		matchseq = FALSE;
 
     CLEAR_POINTER(&cb);
 
@@ -4737,6 +4824,8 @@ do_fuzzymatch(typval_T *argvars, typval_
 		return;
 	    }
 	}
+	if ((di = dict_find(d, (char_u *)"matchseq", -1)) != NULL)
+	    matchseq = TRUE;
     }
 
     // get the fuzzy matches
@@ -4762,8 +4851,8 @@ do_fuzzymatch(typval_T *argvars, typval_
 	    goto done;
     }
 
-    match_fuzzy(argvars[0].vval.v_list, tv_get_string(&argvars[1]), key,
-	    &cb, retmatchpos, rettv->vval.v_list);
+    fuzzy_match_in_list(argvars[0].vval.v_list, tv_get_string(&argvars[1]),
+	    matchseq, key, &cb, retmatchpos, rettv->vval.v_list);
 
 done:
     free_callback(&cb);