changeset 22232:f22acf6472da v8.2.1665

patch 8.2.1665: cannot do fuzzy string matching Commit: https://github.com/vim/vim/commit/635414dd2f3ae7d4d972d79b806348a6516cb91a Author: Bram Moolenaar <Bram@vim.org> Date: Fri Sep 11 22:25:15 2020 +0200 patch 8.2.1665: cannot do fuzzy string matching Problem: Cannot do fuzzy string matching. Solution: Add matchfuzzy(). (Yegappan Lakshmanan, closes https://github.com/vim/vim/issues/6932)
author Bram Moolenaar <Bram@vim.org>
date Fri, 11 Sep 2020 22:30:04 +0200
parents e1311fb42cd4
children 0aeaeed4640d
files runtime/doc/eval.txt runtime/doc/usr_41.txt src/evalfunc.c src/proto/search.pro src/search.c src/testdir/test_functions.vim src/version.c
diffstat 7 files changed, 393 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/runtime/doc/eval.txt
+++ b/runtime/doc/eval.txt
@@ -2641,6 +2641,7 @@ matcharg({nr})			List	arguments of |:mat
 matchdelete({id} [, {win}])	Number	delete match identified by {id}
 matchend({expr}, {pat} [, {start} [, {count}]])
 				Number	position where {pat} ends in {expr}
+matchfuzzy({list}, {str})	List	fuzzy match {str} in {list}
 matchlist({expr}, {pat} [, {start} [, {count}]])
 				List	match and submatches of {pat} in {expr}
 matchstr({expr}, {pat} [, {start} [, {count}]])
@@ -7307,6 +7308,29 @@ matchend({expr}, {pat} [, {start} [, {co
 		Can also be used as a |method|: >
 			GetText()->matchend('word')
 
+
+matchfuzzy({list}, {str})			*matchfuzzy()*
+		Returns a list with all the strings in {list} that fuzzy
+		match {str}. The strings in the returned list are sorted
+		based on the matching score. {str} is treated as a literal
+		string and regular expression matching is NOT supported.
+		The maximum supported {str} length is 256.
+
+		If there are no matching strings or there is an error, then an
+		empty list is returned. If length of {str} is greater than
+		256, then returns an empty list.
+
+		Example: >
+		   :echo matchfuzzy(["clay", "crow"], "cay")
+<		results in ["clay"]. >
+		   :echo getbufinfo()->map({_, v -> v.name})->matchfuzzy("ndl")
+<		results in a list of buffer names fuzzy matching "ndl". >
+		   :echo v:oldfiles->matchfuzzy("test")
+<		results in a list of file names fuzzy matching "test". >
+		   :let l = readfile("buffer.c")->matchfuzzy("str")
+<		results in a list of lines in "buffer.c" fuzzy matching "str".
+
+
 matchlist({expr}, {pat} [, {start} [, {count}]])		*matchlist()*
 		Same as |match()|, but return a |List|.  The first item in the
 		list is the matched string, same as what matchstr() would
--- a/runtime/doc/usr_41.txt
+++ b/runtime/doc/usr_41.txt
@@ -603,6 +603,7 @@ String manipulation:					*string-functio
 	charclass()		class of a character
 	match()			position where a pattern matches in a string
 	matchend()		position where a pattern match ends in a string
+	matchfuzzy()		fuzzy matches a string in a list of strings
 	matchstr()		match of a pattern in a string
 	matchstrpos()		match and positions of a pattern in a string
 	matchlist()		like matchstr() and also return submatches
--- a/src/evalfunc.c
+++ b/src/evalfunc.c
@@ -750,6 +750,7 @@ static funcentry_T global_functions[] =
     {"matcharg",	1, 1, FEARG_1,	  ret_list_string, f_matcharg},
     {"matchdelete",	1, 2, FEARG_1,	  ret_number,	f_matchdelete},
     {"matchend",	2, 4, FEARG_1,	  ret_number,	f_matchend},
+    {"matchfuzzy",	2, 2, FEARG_1,	  ret_list_string,	f_matchfuzzy},
     {"matchlist",	2, 4, FEARG_1,	  ret_list_string, f_matchlist},
     {"matchstr",	2, 4, FEARG_1,	  ret_string,	f_matchstr},
     {"matchstrpos",	2, 4, FEARG_1,	  ret_list_any,	f_matchstrpos},
--- a/src/proto/search.pro
+++ b/src/proto/search.pro
@@ -36,4 +36,5 @@ void find_pattern_in_path(char_u *ptr, i
 spat_T *get_spat(int idx);
 int get_spat_last_idx(void);
 void f_searchcount(typval_T *argvars, typval_T *rettv);
+void f_matchfuzzy(typval_T *argvars, typval_T *rettv);
 /* vim: set ft=c : */
--- a/src/search.c
+++ b/src/search.c
@@ -4165,4 +4165,344 @@ f_searchcount(typval_T *argvars, typval_
 the_end:
     restore_last_search_pattern();
 }
+
+/*
+ * Fuzzy string matching
+ *
+ * Ported from the lib_fts library authored by Forrest Smith.
+ * https://github.com/forrestthewoods/lib_fts/tree/master/code
+ *
+ * Blog describing the algorithm:
+ * https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/
+ *
+ * Each matching string is assigned a score. The following factors are checked:
+ *   Matched letter
+ *   Unmatched letter
+ *   Consecutively matched letters
+ *   Proximity to start
+ *   Letter following a separator (space, underscore)
+ *   Uppercase letter following lowercase (aka CamelCase)
+ *
+ * Matched letters are good. Unmatched letters are bad. Matching near the start
+ * is good. Matching the first letter in the middle of a phrase is good.
+ * Matching the uppercase letters in camel case entries is good.
+ *
+ * The score assigned for each factor is explained below.
+ * File paths are different from file names. File extensions may be ignorable.
+ * Single words care about consecutive matches but not separators or camel
+ * case.
+ *   Score starts at 0
+ *   Matched letter: +0 points
+ *   Unmatched letter: -1 point
+ *   Consecutive match bonus: +5 points
+ *   Separator bonus: +10 points
+ *   Camel case bonus: +10 points
+ *   Unmatched leading letter: -3 points (max: -9)
+ *
+ * There is some nuance to this. Scores don’t have an intrinsic meaning. The
+ * score range isn’t 0 to 100. It’s roughly [-50, 50]. Longer words have a
+ * lower minimum score due to unmatched letter penalty. Longer search patterns
+ * have a higher maximum score due to match bonuses.
+ *
+ * Separator and camel case bonus is worth a LOT. Consecutive matches are worth
+ * quite a bit.
+ *
+ * There is a penalty if you DON’T match the first three letters. Which
+ * effectively rewards matching near the start. However there’s no difference
+ * in matching between the middle and end.
+ *
+ * There is not an explicit bonus for an exact match. Unmatched letters receive
+ * a penalty. So shorter strings and closer matches are worth more.
+ */
+typedef struct
+{
+    listitem_T *item;
+    int		score;
+} fuzzyItem_T;
+
+    static int
+fuzzy_match_recursive(
+	char_u	*fuzpat,
+	char_u	*str,
+	int	*outScore,
+	char_u	*strBegin,
+	char_u	*srcMatches,
+	char_u	*matches,
+	int	maxMatches,
+	int	nextMatch,
+	int	*recursionCount,
+	int	recursionLimit)
+{
+    // Recursion params
+    int		recursiveMatch = FALSE;
+    char_u	bestRecursiveMatches[256];
+    int		bestRecursiveScore = 0;
+    int		first_match;
+    int		matched;
+
+    // Count recursions
+    ++*recursionCount;
+    if (*recursionCount >= recursionLimit)
+	return FALSE;
+
+    // Detect end of strings
+    if (*fuzpat == '\0' || *str == '\0')
+	return FALSE;
+
+    // Loop through fuzpat and str looking for a match
+    first_match = TRUE;
+    while (*fuzpat != '\0' && *str != '\0')
+    {
+	// Found match
+	if (vim_tolower(*fuzpat) == vim_tolower(*str))
+	{
+	    char_u	recursiveMatches[256];
+	    int		recursiveScore = 0;
+
+	    // Supplied matches buffer was too short
+	    if (nextMatch >= maxMatches)
+		return FALSE;
+
+	    // "Copy-on-Write" srcMatches into matches
+	    if (first_match && srcMatches)
+	    {
+		memcpy(matches, srcMatches, nextMatch);
+		first_match = FALSE;
+	    }
+
+	    // Recursive call that "skips" this match
+	    if (fuzzy_match_recursive(fuzpat, str + 1, &recursiveScore,
+			strBegin, matches, recursiveMatches,
+			sizeof(recursiveMatches), nextMatch, recursionCount,
+			recursionLimit))
+	    {
+		// Pick best recursive score
+		if (!recursiveMatch || recursiveScore > bestRecursiveScore)
+		{
+		    memcpy(bestRecursiveMatches, recursiveMatches, 256);
+		    bestRecursiveScore = recursiveScore;
+		}
+		recursiveMatch = TRUE;
+	    }
+
+	    // Advance
+	    matches[nextMatch++] = (char_u)(str - strBegin);
+	    ++fuzpat;
+	}
+	++str;
+    }
+
+    // Determine if full fuzpat was matched
+    matched = *fuzpat == '\0' ? TRUE : FALSE;
+
+    // Calculate score
+    if (matched)
+    {
+	// bonus for adjacent matches
+	int	sequential_bonus = 15;
+	// bonus if match occurs after a separator
+	int	separator_bonus = 30;
+	// bonus if match is uppercase and prev is lower
+	int	camel_bonus = 30;
+	// bonus if the first letter is matched
+	int	first_letter_bonus = 15;
+	// penalty applied for every letter in str before the first match
+	int	leading_letter_penalty = -5;
+	// maximum penalty for leading letters
+	int	max_leading_letter_penalty = -15;
+	// penalty for every letter that doesn't matter
+	int	unmatched_letter_penalty = -1;
+	int	penalty;
+	int	unmatched;
+	int	i;
+
+	// Iterate str to end
+	while (*str != '\0')
+	    ++str;
+
+	// Initialize score
+	*outScore = 100;
+
+	// Apply leading letter penalty
+	penalty = leading_letter_penalty * matches[0];
+	if (penalty < max_leading_letter_penalty)
+	    penalty = max_leading_letter_penalty;
+	*outScore += penalty;
+
+	// Apply unmatched penalty
+	unmatched = (int)(str - strBegin) - nextMatch;
+	*outScore += unmatched_letter_penalty * unmatched;
+
+	// Apply ordering bonuses
+	for (i = 0; i < nextMatch; ++i)
+	{
+	    char_u	currIdx = matches[i];
+
+	    if (i > 0)
+	    {
+		char_u	prevIdx = matches[i - 1];
+
+		// Sequential
+		if (currIdx == (prevIdx + 1))
+		    *outScore += sequential_bonus;
+	    }
+
+	    // Check for bonuses based on neighbor character value
+	    if (currIdx > 0)
+	    {
+		// Camel case
+		char_u	neighbor = strBegin[currIdx - 1];
+		char_u	curr = strBegin[currIdx];
+		int	neighborSeparator;
+
+		if (islower(neighbor) && isupper(curr))
+		    *outScore += camel_bonus;
+
+		// Separator
+		neighborSeparator = neighbor == '_' || neighbor == ' ';
+		if (neighborSeparator)
+		    *outScore += separator_bonus;
+	    }
+	    else
+	    {
+		// First letter
+		*outScore += first_letter_bonus;
+	    }
+	}
+    }
+
+    // Return best result
+    if (recursiveMatch && (!matched || bestRecursiveScore > *outScore))
+    {
+	// Recursive score is better than "this"
+	memcpy(matches, bestRecursiveMatches, maxMatches);
+	*outScore = bestRecursiveScore;
+	return TRUE;
+    }
+    else if (matched)
+	return TRUE;		// "this" score is better than recursive
+
+    return FALSE;		// no match
+}
+
+/*
+ * fuzzy_match()
+ *
+ * Performs exhaustive search via recursion to find all possible matches and
+ * match with highest score.
+ * Scores values have no intrinsic meaning.  Possible score range is not
+ * normalized and varies with pattern.
+ * Recursion is limited internally (default=10) to prevent degenerate cases
+ * (fuzpat="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").
+ * Uses char_u for match indices. Therefore patterns are limited to 256
+ * characters.
+ *
+ * Returns TRUE if fuzpat is found AND calculates a score.
+ */
+    static int
+fuzzy_match(char_u *str, char_u *fuzpat, int *outScore)
+{
+    char_u	matches[256];
+    int		recursionCount = 0;
+    int		recursionLimit = 10;
+
+    *outScore = 0;
+
+    return fuzzy_match_recursive(fuzpat, str, outScore, str, NULL, matches,
+	    sizeof(matches), 0, &recursionCount, recursionLimit);
+}
+
+/*
+ * Sort the fuzzy matches in the descending order of the match score.
+ */
+    static int
+fuzzy_item_compare(const void *s1, const void *s2)
+{
+    int		v1 = ((fuzzyItem_T *)s1)->score;
+    int		v2 = ((fuzzyItem_T *)s2)->score;
+
+    return v1 == v2 ? 0 : v1 > v2 ? -1 : 1;
+}
+
+/*
+ * Fuzzy search the string 'str' in 'strlist' and return the matching strings
+ * in 'fmatchlist'.
+ */
+    static void
+match_fuzzy(list_T *strlist, char_u *str, list_T *fmatchlist)
+{
+    long	len;
+    fuzzyItem_T	*ptrs;
+    listitem_T	*li;
+    long	i = 0;
+    int		found_match = FALSE;
+
+    len = list_len(strlist);
+    if (len == 0)
+	return;
+
+    ptrs = ALLOC_MULT(fuzzyItem_T, len);
+    if (ptrs == NULL)
+	return;
+
+    // For all the string items in strlist, get the fuzzy matching score
+    FOR_ALL_LIST_ITEMS(strlist, li)
+    {
+	int	score;
+
+	ptrs[i].item = li;
+	ptrs[i].score = -9999;
+	// ignore non-string items in the list
+	if (li->li_tv.v_type == VAR_STRING && li->li_tv.vval.v_string != NULL)
+	    if (fuzzy_match(li->li_tv.vval.v_string, str, &score))
+	    {
+		ptrs[i].score = score;
+		found_match = TRUE;
+	    }
+	++i;
+    }
+
+    if (found_match)
+    {
+	// Sort the list by the descending order of the match score
+	qsort((void *)ptrs, (size_t)len, sizeof(fuzzyItem_T),
+		fuzzy_item_compare);
+
+	// Copy the matching strings with 'score != -9999' to the return list
+	for (i = 0; i < len; i++)
+	{
+	    if (ptrs[i].score == -9999)
+		break;
+	    list_append_string(fmatchlist, ptrs[i].item->li_tv.vval.v_string,
+		    -1);
+	}
+    }
+
+    vim_free(ptrs);
+}
+
+/*
+ * "matchfuzzy()" function
+ */
+    void
+f_matchfuzzy(typval_T *argvars, typval_T *rettv)
+{
+    if (argvars[0].v_type != VAR_LIST)
+    {
+	emsg(_(e_listreq));
+	return;
+    }
+    if (argvars[0].vval.v_list == NULL)
+	return;
+    if (argvars[1].v_type != VAR_STRING
+	    || argvars[1].vval.v_string == NULL)
+    {
+	semsg(_(e_invarg2), tv_get_string(&argvars[1]));
+	return;
+    }
+    if (rettv_list_alloc(rettv) == OK)
+	match_fuzzy(argvars[0].vval.v_list, tv_get_string(&argvars[1]),
+		rettv->vval.v_list);
+}
+
 #endif
--- a/src/testdir/test_functions.vim
+++ b/src/testdir/test_functions.vim
@@ -2554,4 +2554,28 @@ func Test_browsedir()
   call assert_fails('call browsedir("open", [])', 'E730:')
 endfunc
 
+" Test for matchfuzzy()
+func Test_matchfuzzy()
+  call assert_fails('call matchfuzzy(10, "abc")', 'E714:')
+  call assert_fails('call matchfuzzy(["abc"], [])', 'E730:')
+  call assert_equal([], matchfuzzy([], 'abc'))
+  call assert_equal([], matchfuzzy(['abc'], ''))
+  call assert_equal(['abc'], matchfuzzy(['abc', 10], 'ac'))
+  call assert_equal([], matchfuzzy([10, 20], 'ac'))
+  call assert_equal(['abc'], matchfuzzy(['abc'], 'abc'))
+  call assert_equal(['crayon', 'camera'], matchfuzzy(['camera', 'crayon'], 'cra'))
+  call assert_equal(['aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa', 'aba'], matchfuzzy(['aba', 'aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa'], 'aa'))
+  call assert_equal(['one'], matchfuzzy(['one', 'two'], 'one'))
+  call assert_equal(['oneTwo', 'onetwo'], matchfuzzy(['onetwo', 'oneTwo'], 'oneTwo'))
+  call assert_equal(['one_two', 'onetwo'], matchfuzzy(['onetwo', 'one_two'], 'oneTwo'))
+  call assert_equal(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], matchfuzzy(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], 'aa'))
+  call assert_equal([], matchfuzzy([repeat('a', 300)], repeat('a', 257)))
+
+  %bw!
+  eval ['somebuf', 'anotherone', 'needle', 'yetanotherone']->map({_, v -> bufadd(v) + bufload(v)})
+  let l = getbufinfo()->map({_, v -> v.name})->matchfuzzy('ndl')
+  call assert_equal(1, len(l))
+  call assert_match('needle', l[0])
+endfunc
+
 " vim: shiftwidth=2 sts=2 expandtab
--- a/src/version.c
+++ b/src/version.c
@@ -751,6 +751,8 @@ static char *(features[]) =
 static int included_patches[] =
 {   /* Add new patch number below this line */
 /**/
+    1665,
+/**/
     1664,
 /**/
     1663,