comparison src/search.c @ 22232:f22acf6472da v8.2.1665

patch 8.2.1665: cannot do fuzzy string matching Commit: https://github.com/vim/vim/commit/635414dd2f3ae7d4d972d79b806348a6516cb91a Author: Bram Moolenaar <Bram@vim.org> Date: Fri Sep 11 22:25:15 2020 +0200 patch 8.2.1665: cannot do fuzzy string matching Problem: Cannot do fuzzy string matching. Solution: Add matchfuzzy(). (Yegappan Lakshmanan, closes https://github.com/vim/vim/issues/6932)
author Bram Moolenaar <Bram@vim.org>
date Fri, 11 Sep 2020 22:30:04 +0200
parents a2d8535d035a
children 0491b9cafd44
comparison
equal deleted inserted replaced
22231:e1311fb42cd4 22232:f22acf6472da
4163 dict_add_number(rettv->vval.v_dict, "maxcount", stat.last_maxcount); 4163 dict_add_number(rettv->vval.v_dict, "maxcount", stat.last_maxcount);
4164 4164
4165 the_end: 4165 the_end:
4166 restore_last_search_pattern(); 4166 restore_last_search_pattern();
4167 } 4167 }
4168 #endif 4168
4169 /*
4170 * Fuzzy string matching
4171 *
4172 * Ported from the lib_fts library authored by Forrest Smith.
4173 * https://github.com/forrestthewoods/lib_fts/tree/master/code
4174 *
4175 * Blog describing the algorithm:
4176 * https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/
4177 *
4178 * Each matching string is assigned a score. The following factors are checked:
4179 * Matched letter
4180 * Unmatched letter
4181 * Consecutively matched letters
4182 * Proximity to start
4183 * Letter following a separator (space, underscore)
4184 * Uppercase letter following lowercase (aka CamelCase)
4185 *
4186 * Matched letters are good. Unmatched letters are bad. Matching near the start
4187 * is good. Matching the first letter in the middle of a phrase is good.
4188 * Matching the uppercase letters in camel case entries is good.
4189 *
4190 * The score assigned for each factor is explained below.
4191 * File paths are different from file names. File extensions may be ignorable.
4192 * Single words care about consecutive matches but not separators or camel
4193 * case.
4194 * Score starts at 0
4195 * Matched letter: +0 points
4196 * Unmatched letter: -1 point
4197 * Consecutive match bonus: +5 points
4198 * Separator bonus: +10 points
4199 * Camel case bonus: +10 points
4200 * Unmatched leading letter: -3 points (max: -9)
4201 *
4202 * There is some nuance to this. Scores don’t have an intrinsic meaning. The
4203 * score range isn’t 0 to 100. It’s roughly [-50, 50]. Longer words have a
4204 * lower minimum score due to unmatched letter penalty. Longer search patterns
4205 * have a higher maximum score due to match bonuses.
4206 *
4207 * Separator and camel case bonus is worth a LOT. Consecutive matches are worth
4208 * quite a bit.
4209 *
4210 * There is a penalty if you DON’T match the first three letters. Which
4211 * effectively rewards matching near the start. However there’s no difference
4212 * in matching between the middle and end.
4213 *
4214 * There is not an explicit bonus for an exact match. Unmatched letters receive
4215 * a penalty. So shorter strings and closer matches are worth more.
4216 */
4217 typedef struct
4218 {
4219 listitem_T *item;
4220 int score;
4221 } fuzzyItem_T;
4222
4223 static int
4224 fuzzy_match_recursive(
4225 char_u *fuzpat,
4226 char_u *str,
4227 int *outScore,
4228 char_u *strBegin,
4229 char_u *srcMatches,
4230 char_u *matches,
4231 int maxMatches,
4232 int nextMatch,
4233 int *recursionCount,
4234 int recursionLimit)
4235 {
4236 // Recursion params
4237 int recursiveMatch = FALSE;
4238 char_u bestRecursiveMatches[256];
4239 int bestRecursiveScore = 0;
4240 int first_match;
4241 int matched;
4242
4243 // Count recursions
4244 ++*recursionCount;
4245 if (*recursionCount >= recursionLimit)
4246 return FALSE;
4247
4248 // Detect end of strings
4249 if (*fuzpat == '\0' || *str == '\0')
4250 return FALSE;
4251
4252 // Loop through fuzpat and str looking for a match
4253 first_match = TRUE;
4254 while (*fuzpat != '\0' && *str != '\0')
4255 {
4256 // Found match
4257 if (vim_tolower(*fuzpat) == vim_tolower(*str))
4258 {
4259 char_u recursiveMatches[256];
4260 int recursiveScore = 0;
4261
4262 // Supplied matches buffer was too short
4263 if (nextMatch >= maxMatches)
4264 return FALSE;
4265
4266 // "Copy-on-Write" srcMatches into matches
4267 if (first_match && srcMatches)
4268 {
4269 memcpy(matches, srcMatches, nextMatch);
4270 first_match = FALSE;
4271 }
4272
4273 // Recursive call that "skips" this match
4274 if (fuzzy_match_recursive(fuzpat, str + 1, &recursiveScore,
4275 strBegin, matches, recursiveMatches,
4276 sizeof(recursiveMatches), nextMatch, recursionCount,
4277 recursionLimit))
4278 {
4279 // Pick best recursive score
4280 if (!recursiveMatch || recursiveScore > bestRecursiveScore)
4281 {
4282 memcpy(bestRecursiveMatches, recursiveMatches, 256);
4283 bestRecursiveScore = recursiveScore;
4284 }
4285 recursiveMatch = TRUE;
4286 }
4287
4288 // Advance
4289 matches[nextMatch++] = (char_u)(str - strBegin);
4290 ++fuzpat;
4291 }
4292 ++str;
4293 }
4294
4295 // Determine if full fuzpat was matched
4296 matched = *fuzpat == '\0' ? TRUE : FALSE;
4297
4298 // Calculate score
4299 if (matched)
4300 {
4301 // bonus for adjacent matches
4302 int sequential_bonus = 15;
4303 // bonus if match occurs after a separator
4304 int separator_bonus = 30;
4305 // bonus if match is uppercase and prev is lower
4306 int camel_bonus = 30;
4307 // bonus if the first letter is matched
4308 int first_letter_bonus = 15;
4309 // penalty applied for every letter in str before the first match
4310 int leading_letter_penalty = -5;
4311 // maximum penalty for leading letters
4312 int max_leading_letter_penalty = -15;
4313 // penalty for every letter that doesn't matter
4314 int unmatched_letter_penalty = -1;
4315 int penalty;
4316 int unmatched;
4317 int i;
4318
4319 // Iterate str to end
4320 while (*str != '\0')
4321 ++str;
4322
4323 // Initialize score
4324 *outScore = 100;
4325
4326 // Apply leading letter penalty
4327 penalty = leading_letter_penalty * matches[0];
4328 if (penalty < max_leading_letter_penalty)
4329 penalty = max_leading_letter_penalty;
4330 *outScore += penalty;
4331
4332 // Apply unmatched penalty
4333 unmatched = (int)(str - strBegin) - nextMatch;
4334 *outScore += unmatched_letter_penalty * unmatched;
4335
4336 // Apply ordering bonuses
4337 for (i = 0; i < nextMatch; ++i)
4338 {
4339 char_u currIdx = matches[i];
4340
4341 if (i > 0)
4342 {
4343 char_u prevIdx = matches[i - 1];
4344
4345 // Sequential
4346 if (currIdx == (prevIdx + 1))
4347 *outScore += sequential_bonus;
4348 }
4349
4350 // Check for bonuses based on neighbor character value
4351 if (currIdx > 0)
4352 {
4353 // Camel case
4354 char_u neighbor = strBegin[currIdx - 1];
4355 char_u curr = strBegin[currIdx];
4356 int neighborSeparator;
4357
4358 if (islower(neighbor) && isupper(curr))
4359 *outScore += camel_bonus;
4360
4361 // Separator
4362 neighborSeparator = neighbor == '_' || neighbor == ' ';
4363 if (neighborSeparator)
4364 *outScore += separator_bonus;
4365 }
4366 else
4367 {
4368 // First letter
4369 *outScore += first_letter_bonus;
4370 }
4371 }
4372 }
4373
4374 // Return best result
4375 if (recursiveMatch && (!matched || bestRecursiveScore > *outScore))
4376 {
4377 // Recursive score is better than "this"
4378 memcpy(matches, bestRecursiveMatches, maxMatches);
4379 *outScore = bestRecursiveScore;
4380 return TRUE;
4381 }
4382 else if (matched)
4383 return TRUE; // "this" score is better than recursive
4384
4385 return FALSE; // no match
4386 }
4387
4388 /*
4389 * fuzzy_match()
4390 *
4391 * Performs exhaustive search via recursion to find all possible matches and
4392 * match with highest score.
4393 * Scores values have no intrinsic meaning. Possible score range is not
4394 * normalized and varies with pattern.
4395 * Recursion is limited internally (default=10) to prevent degenerate cases
4396 * (fuzpat="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").
4397 * Uses char_u for match indices. Therefore patterns are limited to 256
4398 * characters.
4399 *
4400 * Returns TRUE if fuzpat is found AND calculates a score.
4401 */
4402 static int
4403 fuzzy_match(char_u *str, char_u *fuzpat, int *outScore)
4404 {
4405 char_u matches[256];
4406 int recursionCount = 0;
4407 int recursionLimit = 10;
4408
4409 *outScore = 0;
4410
4411 return fuzzy_match_recursive(fuzpat, str, outScore, str, NULL, matches,
4412 sizeof(matches), 0, &recursionCount, recursionLimit);
4413 }
4414
4415 /*
4416 * Sort the fuzzy matches in the descending order of the match score.
4417 */
4418 static int
4419 fuzzy_item_compare(const void *s1, const void *s2)
4420 {
4421 int v1 = ((fuzzyItem_T *)s1)->score;
4422 int v2 = ((fuzzyItem_T *)s2)->score;
4423
4424 return v1 == v2 ? 0 : v1 > v2 ? -1 : 1;
4425 }
4426
4427 /*
4428 * Fuzzy search the string 'str' in 'strlist' and return the matching strings
4429 * in 'fmatchlist'.
4430 */
4431 static void
4432 match_fuzzy(list_T *strlist, char_u *str, list_T *fmatchlist)
4433 {
4434 long len;
4435 fuzzyItem_T *ptrs;
4436 listitem_T *li;
4437 long i = 0;
4438 int found_match = FALSE;
4439
4440 len = list_len(strlist);
4441 if (len == 0)
4442 return;
4443
4444 ptrs = ALLOC_MULT(fuzzyItem_T, len);
4445 if (ptrs == NULL)
4446 return;
4447
4448 // For all the string items in strlist, get the fuzzy matching score
4449 FOR_ALL_LIST_ITEMS(strlist, li)
4450 {
4451 int score;
4452
4453 ptrs[i].item = li;
4454 ptrs[i].score = -9999;
4455 // ignore non-string items in the list
4456 if (li->li_tv.v_type == VAR_STRING && li->li_tv.vval.v_string != NULL)
4457 if (fuzzy_match(li->li_tv.vval.v_string, str, &score))
4458 {
4459 ptrs[i].score = score;
4460 found_match = TRUE;
4461 }
4462 ++i;
4463 }
4464
4465 if (found_match)
4466 {
4467 // Sort the list by the descending order of the match score
4468 qsort((void *)ptrs, (size_t)len, sizeof(fuzzyItem_T),
4469 fuzzy_item_compare);
4470
4471 // Copy the matching strings with 'score != -9999' to the return list
4472 for (i = 0; i < len; i++)
4473 {
4474 if (ptrs[i].score == -9999)
4475 break;
4476 list_append_string(fmatchlist, ptrs[i].item->li_tv.vval.v_string,
4477 -1);
4478 }
4479 }
4480
4481 vim_free(ptrs);
4482 }
4483
4484 /*
4485 * "matchfuzzy()" function
4486 */
4487 void
4488 f_matchfuzzy(typval_T *argvars, typval_T *rettv)
4489 {
4490 if (argvars[0].v_type != VAR_LIST)
4491 {
4492 emsg(_(e_listreq));
4493 return;
4494 }
4495 if (argvars[0].vval.v_list == NULL)
4496 return;
4497 if (argvars[1].v_type != VAR_STRING
4498 || argvars[1].vval.v_string == NULL)
4499 {
4500 semsg(_(e_invarg2), tv_get_string(&argvars[1]));
4501 return;
4502 }
4503 if (rettv_list_alloc(rettv) == OK)
4504 match_fuzzy(argvars[0].vval.v_list, tv_get_string(&argvars[1]),
4505 rettv->vval.v_list);
4506 }
4507
4508 #endif