Mercurial > vim
comparison src/search.c @ 22232:f22acf6472da v8.2.1665
patch 8.2.1665: cannot do fuzzy string matching
Commit: https://github.com/vim/vim/commit/635414dd2f3ae7d4d972d79b806348a6516cb91a
Author: Bram Moolenaar <Bram@vim.org>
Date: Fri Sep 11 22:25:15 2020 +0200
patch 8.2.1665: cannot do fuzzy string matching
Problem: Cannot do fuzzy string matching.
Solution: Add matchfuzzy(). (Yegappan Lakshmanan, closes https://github.com/vim/vim/issues/6932)
author | Bram Moolenaar <Bram@vim.org> |
---|---|
date | Fri, 11 Sep 2020 22:30:04 +0200 |
parents | a2d8535d035a |
children | 0491b9cafd44 |
comparison
equal
deleted
inserted
replaced
22231:e1311fb42cd4 | 22232:f22acf6472da |
---|---|
4163 dict_add_number(rettv->vval.v_dict, "maxcount", stat.last_maxcount); | 4163 dict_add_number(rettv->vval.v_dict, "maxcount", stat.last_maxcount); |
4164 | 4164 |
4165 the_end: | 4165 the_end: |
4166 restore_last_search_pattern(); | 4166 restore_last_search_pattern(); |
4167 } | 4167 } |
4168 #endif | 4168 |
4169 /* | |
4170 * Fuzzy string matching | |
4171 * | |
4172 * Ported from the lib_fts library authored by Forrest Smith. | |
4173 * https://github.com/forrestthewoods/lib_fts/tree/master/code | |
4174 * | |
4175 * Blog describing the algorithm: | |
4176 * https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/ | |
4177 * | |
4178 * Each matching string is assigned a score. The following factors are checked: | |
4179 * Matched letter | |
4180 * Unmatched letter | |
4181 * Consecutively matched letters | |
4182 * Proximity to start | |
4183 * Letter following a separator (space, underscore) | |
4184 * Uppercase letter following lowercase (aka CamelCase) | |
4185 * | |
4186 * Matched letters are good. Unmatched letters are bad. Matching near the start | |
4187 * is good. Matching the first letter in the middle of a phrase is good. | |
4188 * Matching the uppercase letters in camel case entries is good. | |
4189 * | |
4190 * The score assigned for each factor is explained below. | |
4191 * File paths are different from file names. File extensions may be ignorable. | |
4192 * Single words care about consecutive matches but not separators or camel | |
4193 * case. | |
4194 * Score starts at 0 | |
4195 * Matched letter: +0 points | |
4196 * Unmatched letter: -1 point | |
4197 * Consecutive match bonus: +5 points | |
4198 * Separator bonus: +10 points | |
4199 * Camel case bonus: +10 points | |
4200 * Unmatched leading letter: -3 points (max: -9) | |
4201 * | |
4202 * There is some nuance to this. Scores don’t have an intrinsic meaning. The | |
4203 * score range isn’t 0 to 100. It’s roughly [-50, 50]. Longer words have a | |
4204 * lower minimum score due to unmatched letter penalty. Longer search patterns | |
4205 * have a higher maximum score due to match bonuses. | |
4206 * | |
4207 * Separator and camel case bonus is worth a LOT. Consecutive matches are worth | |
4208 * quite a bit. | |
4209 * | |
4210 * There is a penalty if you DON’T match the first three letters. Which | |
4211 * effectively rewards matching near the start. However there’s no difference | |
4212 * in matching between the middle and end. | |
4213 * | |
4214 * There is not an explicit bonus for an exact match. Unmatched letters receive | |
4215 * a penalty. So shorter strings and closer matches are worth more. | |
4216 */ | |
4217 typedef struct | |
4218 { | |
4219 listitem_T *item; | |
4220 int score; | |
4221 } fuzzyItem_T; | |
4222 | |
4223 static int | |
4224 fuzzy_match_recursive( | |
4225 char_u *fuzpat, | |
4226 char_u *str, | |
4227 int *outScore, | |
4228 char_u *strBegin, | |
4229 char_u *srcMatches, | |
4230 char_u *matches, | |
4231 int maxMatches, | |
4232 int nextMatch, | |
4233 int *recursionCount, | |
4234 int recursionLimit) | |
4235 { | |
4236 // Recursion params | |
4237 int recursiveMatch = FALSE; | |
4238 char_u bestRecursiveMatches[256]; | |
4239 int bestRecursiveScore = 0; | |
4240 int first_match; | |
4241 int matched; | |
4242 | |
4243 // Count recursions | |
4244 ++*recursionCount; | |
4245 if (*recursionCount >= recursionLimit) | |
4246 return FALSE; | |
4247 | |
4248 // Detect end of strings | |
4249 if (*fuzpat == '\0' || *str == '\0') | |
4250 return FALSE; | |
4251 | |
4252 // Loop through fuzpat and str looking for a match | |
4253 first_match = TRUE; | |
4254 while (*fuzpat != '\0' && *str != '\0') | |
4255 { | |
4256 // Found match | |
4257 if (vim_tolower(*fuzpat) == vim_tolower(*str)) | |
4258 { | |
4259 char_u recursiveMatches[256]; | |
4260 int recursiveScore = 0; | |
4261 | |
4262 // Supplied matches buffer was too short | |
4263 if (nextMatch >= maxMatches) | |
4264 return FALSE; | |
4265 | |
4266 // "Copy-on-Write" srcMatches into matches | |
4267 if (first_match && srcMatches) | |
4268 { | |
4269 memcpy(matches, srcMatches, nextMatch); | |
4270 first_match = FALSE; | |
4271 } | |
4272 | |
4273 // Recursive call that "skips" this match | |
4274 if (fuzzy_match_recursive(fuzpat, str + 1, &recursiveScore, | |
4275 strBegin, matches, recursiveMatches, | |
4276 sizeof(recursiveMatches), nextMatch, recursionCount, | |
4277 recursionLimit)) | |
4278 { | |
4279 // Pick best recursive score | |
4280 if (!recursiveMatch || recursiveScore > bestRecursiveScore) | |
4281 { | |
4282 memcpy(bestRecursiveMatches, recursiveMatches, 256); | |
4283 bestRecursiveScore = recursiveScore; | |
4284 } | |
4285 recursiveMatch = TRUE; | |
4286 } | |
4287 | |
4288 // Advance | |
4289 matches[nextMatch++] = (char_u)(str - strBegin); | |
4290 ++fuzpat; | |
4291 } | |
4292 ++str; | |
4293 } | |
4294 | |
4295 // Determine if full fuzpat was matched | |
4296 matched = *fuzpat == '\0' ? TRUE : FALSE; | |
4297 | |
4298 // Calculate score | |
4299 if (matched) | |
4300 { | |
4301 // bonus for adjacent matches | |
4302 int sequential_bonus = 15; | |
4303 // bonus if match occurs after a separator | |
4304 int separator_bonus = 30; | |
4305 // bonus if match is uppercase and prev is lower | |
4306 int camel_bonus = 30; | |
4307 // bonus if the first letter is matched | |
4308 int first_letter_bonus = 15; | |
4309 // penalty applied for every letter in str before the first match | |
4310 int leading_letter_penalty = -5; | |
4311 // maximum penalty for leading letters | |
4312 int max_leading_letter_penalty = -15; | |
4313 // penalty for every letter that doesn't matter | |
4314 int unmatched_letter_penalty = -1; | |
4315 int penalty; | |
4316 int unmatched; | |
4317 int i; | |
4318 | |
4319 // Iterate str to end | |
4320 while (*str != '\0') | |
4321 ++str; | |
4322 | |
4323 // Initialize score | |
4324 *outScore = 100; | |
4325 | |
4326 // Apply leading letter penalty | |
4327 penalty = leading_letter_penalty * matches[0]; | |
4328 if (penalty < max_leading_letter_penalty) | |
4329 penalty = max_leading_letter_penalty; | |
4330 *outScore += penalty; | |
4331 | |
4332 // Apply unmatched penalty | |
4333 unmatched = (int)(str - strBegin) - nextMatch; | |
4334 *outScore += unmatched_letter_penalty * unmatched; | |
4335 | |
4336 // Apply ordering bonuses | |
4337 for (i = 0; i < nextMatch; ++i) | |
4338 { | |
4339 char_u currIdx = matches[i]; | |
4340 | |
4341 if (i > 0) | |
4342 { | |
4343 char_u prevIdx = matches[i - 1]; | |
4344 | |
4345 // Sequential | |
4346 if (currIdx == (prevIdx + 1)) | |
4347 *outScore += sequential_bonus; | |
4348 } | |
4349 | |
4350 // Check for bonuses based on neighbor character value | |
4351 if (currIdx > 0) | |
4352 { | |
4353 // Camel case | |
4354 char_u neighbor = strBegin[currIdx - 1]; | |
4355 char_u curr = strBegin[currIdx]; | |
4356 int neighborSeparator; | |
4357 | |
4358 if (islower(neighbor) && isupper(curr)) | |
4359 *outScore += camel_bonus; | |
4360 | |
4361 // Separator | |
4362 neighborSeparator = neighbor == '_' || neighbor == ' '; | |
4363 if (neighborSeparator) | |
4364 *outScore += separator_bonus; | |
4365 } | |
4366 else | |
4367 { | |
4368 // First letter | |
4369 *outScore += first_letter_bonus; | |
4370 } | |
4371 } | |
4372 } | |
4373 | |
4374 // Return best result | |
4375 if (recursiveMatch && (!matched || bestRecursiveScore > *outScore)) | |
4376 { | |
4377 // Recursive score is better than "this" | |
4378 memcpy(matches, bestRecursiveMatches, maxMatches); | |
4379 *outScore = bestRecursiveScore; | |
4380 return TRUE; | |
4381 } | |
4382 else if (matched) | |
4383 return TRUE; // "this" score is better than recursive | |
4384 | |
4385 return FALSE; // no match | |
4386 } | |
4387 | |
4388 /* | |
4389 * fuzzy_match() | |
4390 * | |
4391 * Performs exhaustive search via recursion to find all possible matches and | |
4392 * match with highest score. | |
4393 * Scores values have no intrinsic meaning. Possible score range is not | |
4394 * normalized and varies with pattern. | |
4395 * Recursion is limited internally (default=10) to prevent degenerate cases | |
4396 * (fuzpat="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"). | |
4397 * Uses char_u for match indices. Therefore patterns are limited to 256 | |
4398 * characters. | |
4399 * | |
4400 * Returns TRUE if fuzpat is found AND calculates a score. | |
4401 */ | |
4402 static int | |
4403 fuzzy_match(char_u *str, char_u *fuzpat, int *outScore) | |
4404 { | |
4405 char_u matches[256]; | |
4406 int recursionCount = 0; | |
4407 int recursionLimit = 10; | |
4408 | |
4409 *outScore = 0; | |
4410 | |
4411 return fuzzy_match_recursive(fuzpat, str, outScore, str, NULL, matches, | |
4412 sizeof(matches), 0, &recursionCount, recursionLimit); | |
4413 } | |
4414 | |
4415 /* | |
4416 * Sort the fuzzy matches in the descending order of the match score. | |
4417 */ | |
4418 static int | |
4419 fuzzy_item_compare(const void *s1, const void *s2) | |
4420 { | |
4421 int v1 = ((fuzzyItem_T *)s1)->score; | |
4422 int v2 = ((fuzzyItem_T *)s2)->score; | |
4423 | |
4424 return v1 == v2 ? 0 : v1 > v2 ? -1 : 1; | |
4425 } | |
4426 | |
4427 /* | |
4428 * Fuzzy search the string 'str' in 'strlist' and return the matching strings | |
4429 * in 'fmatchlist'. | |
4430 */ | |
4431 static void | |
4432 match_fuzzy(list_T *strlist, char_u *str, list_T *fmatchlist) | |
4433 { | |
4434 long len; | |
4435 fuzzyItem_T *ptrs; | |
4436 listitem_T *li; | |
4437 long i = 0; | |
4438 int found_match = FALSE; | |
4439 | |
4440 len = list_len(strlist); | |
4441 if (len == 0) | |
4442 return; | |
4443 | |
4444 ptrs = ALLOC_MULT(fuzzyItem_T, len); | |
4445 if (ptrs == NULL) | |
4446 return; | |
4447 | |
4448 // For all the string items in strlist, get the fuzzy matching score | |
4449 FOR_ALL_LIST_ITEMS(strlist, li) | |
4450 { | |
4451 int score; | |
4452 | |
4453 ptrs[i].item = li; | |
4454 ptrs[i].score = -9999; | |
4455 // ignore non-string items in the list | |
4456 if (li->li_tv.v_type == VAR_STRING && li->li_tv.vval.v_string != NULL) | |
4457 if (fuzzy_match(li->li_tv.vval.v_string, str, &score)) | |
4458 { | |
4459 ptrs[i].score = score; | |
4460 found_match = TRUE; | |
4461 } | |
4462 ++i; | |
4463 } | |
4464 | |
4465 if (found_match) | |
4466 { | |
4467 // Sort the list by the descending order of the match score | |
4468 qsort((void *)ptrs, (size_t)len, sizeof(fuzzyItem_T), | |
4469 fuzzy_item_compare); | |
4470 | |
4471 // Copy the matching strings with 'score != -9999' to the return list | |
4472 for (i = 0; i < len; i++) | |
4473 { | |
4474 if (ptrs[i].score == -9999) | |
4475 break; | |
4476 list_append_string(fmatchlist, ptrs[i].item->li_tv.vval.v_string, | |
4477 -1); | |
4478 } | |
4479 } | |
4480 | |
4481 vim_free(ptrs); | |
4482 } | |
4483 | |
4484 /* | |
4485 * "matchfuzzy()" function | |
4486 */ | |
4487 void | |
4488 f_matchfuzzy(typval_T *argvars, typval_T *rettv) | |
4489 { | |
4490 if (argvars[0].v_type != VAR_LIST) | |
4491 { | |
4492 emsg(_(e_listreq)); | |
4493 return; | |
4494 } | |
4495 if (argvars[0].vval.v_list == NULL) | |
4496 return; | |
4497 if (argvars[1].v_type != VAR_STRING | |
4498 || argvars[1].vval.v_string == NULL) | |
4499 { | |
4500 semsg(_(e_invarg2), tv_get_string(&argvars[1])); | |
4501 return; | |
4502 } | |
4503 if (rettv_list_alloc(rettv) == OK) | |
4504 match_fuzzy(argvars[0].vval.v_list, tv_get_string(&argvars[1]), | |
4505 rettv->vval.v_list); | |
4506 } | |
4507 | |
4508 #endif |