Mercurial > vim
comparison src/spellsuggest.c @ 18172:6e53d83e021d v8.1.2081
patch 8.1.2081: the spell.c file is too big
Commit: https://github.com/vim/vim/commit/46a426c9acfdd3d6c0fa134a17681634b9325bee
Author: Bram Moolenaar <Bram@vim.org>
Date: Fri Sep 27 12:41:56 2019 +0200
patch 8.1.2081: the spell.c file is too big
Problem: The spell.c file is too big.
Solution: Move the code for spell suggestions to a separate file. (Yegappan
Lakshmanan, closes #4988)
author | Bram Moolenaar <Bram@vim.org> |
---|---|
date | Fri, 27 Sep 2019 12:45:08 +0200 |
parents | |
children | c8a53c0daeed |
comparison
equal
deleted
inserted
replaced
18171:b6145168fb4b | 18172:6e53d83e021d |
---|---|
1 /* vi:set ts=8 sts=4 sw=4 noet: | |
2 * | |
3 * VIM - Vi IMproved by Bram Moolenaar | |
4 * | |
5 * Do ":help uganda" in Vim to read copying and usage conditions. | |
6 * Do ":help credits" in Vim to see a list of people who contributed. | |
7 * See README.txt for an overview of the Vim source code. | |
8 */ | |
9 | |
10 /* | |
11 * spellsuggest.c: functions for spelling suggestions | |
12 */ | |
13 | |
14 #include "vim.h" | |
15 | |
16 #if defined(FEAT_SPELL) || defined(PROTO) | |
17 | |
18 /* | |
19 * Use this to adjust the score after finding suggestions, based on the | |
20 * suggested word sounding like the bad word. This is much faster than doing | |
21 * it for every possible suggestion. | |
22 * Disadvantage: When "the" is typed as "hte" it sounds quite different ("@" | |
23 * vs "ht") and goes down in the list. | |
24 * Used when 'spellsuggest' is set to "best". | |
25 */ | |
26 #define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4) | |
27 | |
28 /* | |
29 * Do the opposite: based on a maximum end score and a known sound score, | |
30 * compute the maximum word score that can be used. | |
31 */ | |
32 #define MAXSCORE(word_score, sound_score) ((4 * word_score - sound_score) / 3) | |
33 | |
34 // only used for su_badflags | |
35 #define WF_MIXCAP 0x20 // mix of upper and lower case: macaRONI | |
36 | |
37 /* | |
38 * Information used when looking for suggestions. | |
39 */ | |
40 typedef struct suginfo_S | |
41 { | |
42 garray_T su_ga; // suggestions, contains "suggest_T" | |
43 int su_maxcount; // max. number of suggestions displayed | |
44 int su_maxscore; // maximum score for adding to su_ga | |
45 int su_sfmaxscore; // idem, for when doing soundfold words | |
46 garray_T su_sga; // like su_ga, sound-folded scoring | |
47 char_u *su_badptr; // start of bad word in line | |
48 int su_badlen; // length of detected bad word in line | |
49 int su_badflags; // caps flags for bad word | |
50 char_u su_badword[MAXWLEN]; // bad word truncated at su_badlen | |
51 char_u su_fbadword[MAXWLEN]; // su_badword case-folded | |
52 char_u su_sal_badword[MAXWLEN]; // su_badword soundfolded | |
53 hashtab_T su_banned; // table with banned words | |
54 slang_T *su_sallang; // default language for sound folding | |
55 } suginfo_T; | |
56 | |
57 // One word suggestion. Used in "si_ga". | |
58 typedef struct suggest_S | |
59 { | |
60 char_u *st_word; // suggested word, allocated string | |
61 int st_wordlen; // STRLEN(st_word) | |
62 int st_orglen; // length of replaced text | |
63 int st_score; // lower is better | |
64 int st_altscore; // used when st_score compares equal | |
65 int st_salscore; // st_score is for soundalike | |
66 int st_had_bonus; // bonus already included in score | |
67 slang_T *st_slang; // language used for sound folding | |
68 } suggest_T; | |
69 | |
70 #define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i]) | |
71 | |
72 // TRUE if a word appears in the list of banned words. | |
73 #define WAS_BANNED(su, word) (!HASHITEM_EMPTY(hash_find(&su->su_banned, word))) | |
74 | |
75 // Number of suggestions kept when cleaning up. We need to keep more than | |
76 // what is displayed, because when rescore_suggestions() is called the score | |
77 // may change and wrong suggestions may be removed later. | |
78 #define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 130 ? 150 : (su)->su_maxcount + 20) | |
79 | |
80 // Threshold for sorting and cleaning up suggestions. Don't want to keep lots | |
81 // of suggestions that are not going to be displayed. | |
82 #define SUG_MAX_COUNT(su) (SUG_CLEAN_COUNT(su) + 50) | |
83 | |
84 // score for various changes | |
85 #define SCORE_SPLIT 149 // split bad word | |
86 #define SCORE_SPLIT_NO 249 // split bad word with NOSPLITSUGS | |
87 #define SCORE_ICASE 52 // slightly different case | |
88 #define SCORE_REGION 200 // word is for different region | |
89 #define SCORE_RARE 180 // rare word | |
90 #define SCORE_SWAP 75 // swap two characters | |
91 #define SCORE_SWAP3 110 // swap two characters in three | |
92 #define SCORE_REP 65 // REP replacement | |
93 #define SCORE_SUBST 93 // substitute a character | |
94 #define SCORE_SIMILAR 33 // substitute a similar character | |
95 #define SCORE_SUBCOMP 33 // substitute a composing character | |
96 #define SCORE_DEL 94 // delete a character | |
97 #define SCORE_DELDUP 66 // delete a duplicated character | |
98 #define SCORE_DELCOMP 28 // delete a composing character | |
99 #define SCORE_INS 96 // insert a character | |
100 #define SCORE_INSDUP 67 // insert a duplicate character | |
101 #define SCORE_INSCOMP 30 // insert a composing character | |
102 #define SCORE_NONWORD 103 // change non-word to word char | |
103 | |
104 #define SCORE_FILE 30 // suggestion from a file | |
105 #define SCORE_MAXINIT 350 // Initial maximum score: higher == slower. | |
106 // 350 allows for about three changes. | |
107 | |
108 #define SCORE_COMMON1 30 // subtracted for words seen before | |
109 #define SCORE_COMMON2 40 // subtracted for words often seen | |
110 #define SCORE_COMMON3 50 // subtracted for words very often seen | |
111 #define SCORE_THRES2 10 // word count threshold for COMMON2 | |
112 #define SCORE_THRES3 100 // word count threshold for COMMON3 | |
113 | |
114 // When trying changed soundfold words it becomes slow when trying more than | |
115 // two changes. With less then two changes it's slightly faster but we miss a | |
116 // few good suggestions. In rare cases we need to try three of four changes. | |
117 #define SCORE_SFMAX1 200 // maximum score for first try | |
118 #define SCORE_SFMAX2 300 // maximum score for second try | |
119 #define SCORE_SFMAX3 400 // maximum score for third try | |
120 | |
121 #define SCORE_BIG SCORE_INS * 3 // big difference | |
122 #define SCORE_MAXMAX 999999 // accept any score | |
123 #define SCORE_LIMITMAX 350 // for spell_edit_score_limit() | |
124 | |
125 // for spell_edit_score_limit() we need to know the minimum value of | |
126 // SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS | |
127 #define SCORE_EDIT_MIN SCORE_SIMILAR | |
128 | |
129 /* | |
130 * For finding suggestions: At each node in the tree these states are tried: | |
131 */ | |
132 typedef enum | |
133 { | |
134 STATE_START = 0, // At start of node check for NUL bytes (goodword | |
135 // ends); if badword ends there is a match, otherwise | |
136 // try splitting word. | |
137 STATE_NOPREFIX, // try without prefix | |
138 STATE_SPLITUNDO, // Undo splitting. | |
139 STATE_ENDNUL, // Past NUL bytes at start of the node. | |
140 STATE_PLAIN, // Use each byte of the node. | |
141 STATE_DEL, // Delete a byte from the bad word. | |
142 STATE_INS_PREP, // Prepare for inserting bytes. | |
143 STATE_INS, // Insert a byte in the bad word. | |
144 STATE_SWAP, // Swap two bytes. | |
145 STATE_UNSWAP, // Undo swap two characters. | |
146 STATE_SWAP3, // Swap two characters over three. | |
147 STATE_UNSWAP3, // Undo Swap two characters over three. | |
148 STATE_UNROT3L, // Undo rotate three characters left | |
149 STATE_UNROT3R, // Undo rotate three characters right | |
150 STATE_REP_INI, // Prepare for using REP items. | |
151 STATE_REP, // Use matching REP items from the .aff file. | |
152 STATE_REP_UNDO, // Undo a REP item replacement. | |
153 STATE_FINAL // End of this node. | |
154 } state_T; | |
155 | |
156 /* | |
157 * Struct to keep the state at each level in suggest_try_change(). | |
158 */ | |
159 typedef struct trystate_S | |
160 { | |
161 state_T ts_state; // state at this level, STATE_ | |
162 int ts_score; // score | |
163 idx_T ts_arridx; // index in tree array, start of node | |
164 short ts_curi; // index in list of child nodes | |
165 char_u ts_fidx; // index in fword[], case-folded bad word | |
166 char_u ts_fidxtry; // ts_fidx at which bytes may be changed | |
167 char_u ts_twordlen; // valid length of tword[] | |
168 char_u ts_prefixdepth; // stack depth for end of prefix or | |
169 // PFD_PREFIXTREE or PFD_NOPREFIX | |
170 char_u ts_flags; // TSF_ flags | |
171 char_u ts_tcharlen; // number of bytes in tword character | |
172 char_u ts_tcharidx; // current byte index in tword character | |
173 char_u ts_isdiff; // DIFF_ values | |
174 char_u ts_fcharstart; // index in fword where badword char started | |
175 char_u ts_prewordlen; // length of word in "preword[]" | |
176 char_u ts_splitoff; // index in "tword" after last split | |
177 char_u ts_splitfidx; // "ts_fidx" at word split | |
178 char_u ts_complen; // nr of compound words used | |
179 char_u ts_compsplit; // index for "compflags" where word was spit | |
180 char_u ts_save_badflags; // su_badflags saved here | |
181 char_u ts_delidx; // index in fword for char that was deleted, | |
182 // valid when "ts_flags" has TSF_DIDDEL | |
183 } trystate_T; | |
184 | |
185 // values for ts_isdiff | |
186 #define DIFF_NONE 0 // no different byte (yet) | |
187 #define DIFF_YES 1 // different byte found | |
188 #define DIFF_INSERT 2 // inserting character | |
189 | |
190 // values for ts_flags | |
191 #define TSF_PREFIXOK 1 // already checked that prefix is OK | |
192 #define TSF_DIDSPLIT 2 // tried split at this point | |
193 #define TSF_DIDDEL 4 // did a delete, "ts_delidx" has index | |
194 | |
195 // special values ts_prefixdepth | |
196 #define PFD_NOPREFIX 0xff // not using prefixes | |
197 #define PFD_PREFIXTREE 0xfe // walking through the prefix tree | |
198 #define PFD_NOTSPECIAL 0xfd // highest value that's not special | |
199 | |
200 static void spell_find_suggest(char_u *badptr, int badlen, suginfo_T *su, int maxcount, int banbadword, int need_cap, int interactive); | |
201 #ifdef FEAT_EVAL | |
202 static void spell_suggest_expr(suginfo_T *su, char_u *expr); | |
203 #endif | |
204 static void spell_suggest_file(suginfo_T *su, char_u *fname); | |
205 static void spell_suggest_intern(suginfo_T *su, int interactive); | |
206 static void spell_find_cleanup(suginfo_T *su); | |
207 static void suggest_try_special(suginfo_T *su); | |
208 static void suggest_try_change(suginfo_T *su); | |
209 static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char_u *fword, int soundfold); | |
210 static void go_deeper(trystate_T *stack, int depth, int score_add); | |
211 static void find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword); | |
212 static void score_comp_sal(suginfo_T *su); | |
213 static void score_combine(suginfo_T *su); | |
214 static int stp_sal_score(suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound); | |
215 static void suggest_try_soundalike_prep(void); | |
216 static void suggest_try_soundalike(suginfo_T *su); | |
217 static void suggest_try_soundalike_finish(void); | |
218 static void add_sound_suggest(suginfo_T *su, char_u *goodword, int score, langp_T *lp); | |
219 static int soundfold_find(slang_T *slang, char_u *word); | |
220 static int similar_chars(slang_T *slang, int c1, int c2); | |
221 static void add_suggestion(suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus, slang_T *slang, int maxsf); | |
222 static void check_suggestions(suginfo_T *su, garray_T *gap); | |
223 static void add_banned(suginfo_T *su, char_u *word); | |
224 static void rescore_suggestions(suginfo_T *su); | |
225 static void rescore_one(suginfo_T *su, suggest_T *stp); | |
226 static int cleanup_suggestions(garray_T *gap, int maxscore, int keep); | |
227 static int soundalike_score(char_u *goodsound, char_u *badsound); | |
228 static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword); | |
229 static int spell_edit_score_limit(slang_T *slang, char_u *badword, char_u *goodword, int limit); | |
230 static int spell_edit_score_limit_w(slang_T *slang, char_u *badword, char_u *goodword, int limit); | |
231 | |
232 /* | |
233 * Return TRUE when the sequence of flags in "compflags" plus "flag" can | |
234 * possibly form a valid compounded word. This also checks the COMPOUNDRULE | |
235 * lines if they don't contain wildcards. | |
236 */ | |
237 static int | |
238 can_be_compound( | |
239 trystate_T *sp, | |
240 slang_T *slang, | |
241 char_u *compflags, | |
242 int flag) | |
243 { | |
244 // If the flag doesn't appear in sl_compstartflags or sl_compallflags | |
245 // then it can't possibly compound. | |
246 if (!byte_in_str(sp->ts_complen == sp->ts_compsplit | |
247 ? slang->sl_compstartflags : slang->sl_compallflags, flag)) | |
248 return FALSE; | |
249 | |
250 // If there are no wildcards, we can check if the flags collected so far | |
251 // possibly can form a match with COMPOUNDRULE patterns. This only | |
252 // makes sense when we have two or more words. | |
253 if (slang->sl_comprules != NULL && sp->ts_complen > sp->ts_compsplit) | |
254 { | |
255 int v; | |
256 | |
257 compflags[sp->ts_complen] = flag; | |
258 compflags[sp->ts_complen + 1] = NUL; | |
259 v = match_compoundrule(slang, compflags + sp->ts_compsplit); | |
260 compflags[sp->ts_complen] = NUL; | |
261 return v; | |
262 } | |
263 | |
264 return TRUE; | |
265 } | |
266 | |
267 /* | |
268 * Adjust the score of common words. | |
269 */ | |
270 static int | |
271 score_wordcount_adj( | |
272 slang_T *slang, | |
273 int score, | |
274 char_u *word, | |
275 int split) // word was split, less bonus | |
276 { | |
277 hashitem_T *hi; | |
278 wordcount_T *wc; | |
279 int bonus; | |
280 int newscore; | |
281 | |
282 hi = hash_find(&slang->sl_wordcount, word); | |
283 if (!HASHITEM_EMPTY(hi)) | |
284 { | |
285 wc = HI2WC(hi); | |
286 if (wc->wc_count < SCORE_THRES2) | |
287 bonus = SCORE_COMMON1; | |
288 else if (wc->wc_count < SCORE_THRES3) | |
289 bonus = SCORE_COMMON2; | |
290 else | |
291 bonus = SCORE_COMMON3; | |
292 if (split) | |
293 newscore = score - bonus / 2; | |
294 else | |
295 newscore = score - bonus; | |
296 if (newscore < 0) | |
297 return 0; | |
298 return newscore; | |
299 } | |
300 return score; | |
301 } | |
302 | |
303 /* | |
304 * Like captype() but for a KEEPCAP word add ONECAP if the word starts with a | |
305 * capital. So that make_case_word() can turn WOrd into Word. | |
306 * Add ALLCAP for "WOrD". | |
307 */ | |
308 static int | |
309 badword_captype(char_u *word, char_u *end) | |
310 { | |
311 int flags = captype(word, end); | |
312 int c; | |
313 int l, u; | |
314 int first; | |
315 char_u *p; | |
316 | |
317 if (flags & WF_KEEPCAP) | |
318 { | |
319 // Count the number of UPPER and lower case letters. | |
320 l = u = 0; | |
321 first = FALSE; | |
322 for (p = word; p < end; MB_PTR_ADV(p)) | |
323 { | |
324 c = PTR2CHAR(p); | |
325 if (SPELL_ISUPPER(c)) | |
326 { | |
327 ++u; | |
328 if (p == word) | |
329 first = TRUE; | |
330 } | |
331 else | |
332 ++l; | |
333 } | |
334 | |
335 // If there are more UPPER than lower case letters suggest an | |
336 // ALLCAP word. Otherwise, if the first letter is UPPER then | |
337 // suggest ONECAP. Exception: "ALl" most likely should be "All", | |
338 // require three upper case letters. | |
339 if (u > l && u > 2) | |
340 flags |= WF_ALLCAP; | |
341 else if (first) | |
342 flags |= WF_ONECAP; | |
343 | |
344 if (u >= 2 && l >= 2) // maCARONI maCAroni | |
345 flags |= WF_MIXCAP; | |
346 } | |
347 return flags; | |
348 } | |
349 | |
350 /* | |
351 * Opposite of offset2bytes(). | |
352 * "pp" points to the bytes and is advanced over it. | |
353 * Returns the offset. | |
354 */ | |
355 static int | |
356 bytes2offset(char_u **pp) | |
357 { | |
358 char_u *p = *pp; | |
359 int nr; | |
360 int c; | |
361 | |
362 c = *p++; | |
363 if ((c & 0x80) == 0x00) // 1 byte | |
364 { | |
365 nr = c - 1; | |
366 } | |
367 else if ((c & 0xc0) == 0x80) // 2 bytes | |
368 { | |
369 nr = (c & 0x3f) - 1; | |
370 nr = nr * 255 + (*p++ - 1); | |
371 } | |
372 else if ((c & 0xe0) == 0xc0) // 3 bytes | |
373 { | |
374 nr = (c & 0x1f) - 1; | |
375 nr = nr * 255 + (*p++ - 1); | |
376 nr = nr * 255 + (*p++ - 1); | |
377 } | |
378 else // 4 bytes | |
379 { | |
380 nr = (c & 0x0f) - 1; | |
381 nr = nr * 255 + (*p++ - 1); | |
382 nr = nr * 255 + (*p++ - 1); | |
383 nr = nr * 255 + (*p++ - 1); | |
384 } | |
385 | |
386 *pp = p; | |
387 return nr; | |
388 } | |
389 | |
390 // values for sps_flags | |
391 #define SPS_BEST 1 | |
392 #define SPS_FAST 2 | |
393 #define SPS_DOUBLE 4 | |
394 | |
395 static int sps_flags = SPS_BEST; // flags from 'spellsuggest' | |
396 static int sps_limit = 9999; // max nr of suggestions given | |
397 | |
398 /* | |
399 * Check the 'spellsuggest' option. Return FAIL if it's wrong. | |
400 * Sets "sps_flags" and "sps_limit". | |
401 */ | |
402 int | |
403 spell_check_sps(void) | |
404 { | |
405 char_u *p; | |
406 char_u *s; | |
407 char_u buf[MAXPATHL]; | |
408 int f; | |
409 | |
410 sps_flags = 0; | |
411 sps_limit = 9999; | |
412 | |
413 for (p = p_sps; *p != NUL; ) | |
414 { | |
415 copy_option_part(&p, buf, MAXPATHL, ","); | |
416 | |
417 f = 0; | |
418 if (VIM_ISDIGIT(*buf)) | |
419 { | |
420 s = buf; | |
421 sps_limit = getdigits(&s); | |
422 if (*s != NUL && !VIM_ISDIGIT(*s)) | |
423 f = -1; | |
424 } | |
425 else if (STRCMP(buf, "best") == 0) | |
426 f = SPS_BEST; | |
427 else if (STRCMP(buf, "fast") == 0) | |
428 f = SPS_FAST; | |
429 else if (STRCMP(buf, "double") == 0) | |
430 f = SPS_DOUBLE; | |
431 else if (STRNCMP(buf, "expr:", 5) != 0 | |
432 && STRNCMP(buf, "file:", 5) != 0) | |
433 f = -1; | |
434 | |
435 if (f == -1 || (sps_flags != 0 && f != 0)) | |
436 { | |
437 sps_flags = SPS_BEST; | |
438 sps_limit = 9999; | |
439 return FAIL; | |
440 } | |
441 if (f != 0) | |
442 sps_flags = f; | |
443 } | |
444 | |
445 if (sps_flags == 0) | |
446 sps_flags = SPS_BEST; | |
447 | |
448 return OK; | |
449 } | |
450 | |
451 /* | |
452 * "z=": Find badly spelled word under or after the cursor. | |
453 * Give suggestions for the properly spelled word. | |
454 * In Visual mode use the highlighted word as the bad word. | |
455 * When "count" is non-zero use that suggestion. | |
456 */ | |
457 void | |
458 spell_suggest(int count) | |
459 { | |
460 char_u *line; | |
461 pos_T prev_cursor = curwin->w_cursor; | |
462 char_u wcopy[MAXWLEN + 2]; | |
463 char_u *p; | |
464 int i; | |
465 int c; | |
466 suginfo_T sug; | |
467 suggest_T *stp; | |
468 int mouse_used; | |
469 int need_cap; | |
470 int limit; | |
471 int selected = count; | |
472 int badlen = 0; | |
473 int msg_scroll_save = msg_scroll; | |
474 | |
475 if (no_spell_checking(curwin)) | |
476 return; | |
477 | |
478 if (VIsual_active) | |
479 { | |
480 // Use the Visually selected text as the bad word. But reject | |
481 // a multi-line selection. | |
482 if (curwin->w_cursor.lnum != VIsual.lnum) | |
483 { | |
484 vim_beep(BO_SPELL); | |
485 return; | |
486 } | |
487 badlen = (int)curwin->w_cursor.col - (int)VIsual.col; | |
488 if (badlen < 0) | |
489 badlen = -badlen; | |
490 else | |
491 curwin->w_cursor.col = VIsual.col; | |
492 ++badlen; | |
493 end_visual_mode(); | |
494 } | |
495 // Find the start of the badly spelled word. | |
496 else if (spell_move_to(curwin, FORWARD, TRUE, TRUE, NULL) == 0 | |
497 || curwin->w_cursor.col > prev_cursor.col) | |
498 { | |
499 // No bad word or it starts after the cursor: use the word under the | |
500 // cursor. | |
501 curwin->w_cursor = prev_cursor; | |
502 line = ml_get_curline(); | |
503 p = line + curwin->w_cursor.col; | |
504 // Backup to before start of word. | |
505 while (p > line && spell_iswordp_nmw(p, curwin)) | |
506 MB_PTR_BACK(line, p); | |
507 // Forward to start of word. | |
508 while (*p != NUL && !spell_iswordp_nmw(p, curwin)) | |
509 MB_PTR_ADV(p); | |
510 | |
511 if (!spell_iswordp_nmw(p, curwin)) // No word found. | |
512 { | |
513 beep_flush(); | |
514 return; | |
515 } | |
516 curwin->w_cursor.col = (colnr_T)(p - line); | |
517 } | |
518 | |
519 // Get the word and its length. | |
520 | |
521 // Figure out if the word should be capitalised. | |
522 need_cap = check_need_cap(curwin->w_cursor.lnum, curwin->w_cursor.col); | |
523 | |
524 // Make a copy of current line since autocommands may free the line. | |
525 line = vim_strsave(ml_get_curline()); | |
526 if (line == NULL) | |
527 goto skip; | |
528 | |
529 // Get the list of suggestions. Limit to 'lines' - 2 or the number in | |
530 // 'spellsuggest', whatever is smaller. | |
531 if (sps_limit > (int)Rows - 2) | |
532 limit = (int)Rows - 2; | |
533 else | |
534 limit = sps_limit; | |
535 spell_find_suggest(line + curwin->w_cursor.col, badlen, &sug, limit, | |
536 TRUE, need_cap, TRUE); | |
537 | |
538 if (sug.su_ga.ga_len == 0) | |
539 msg(_("Sorry, no suggestions")); | |
540 else if (count > 0) | |
541 { | |
542 if (count > sug.su_ga.ga_len) | |
543 smsg(_("Sorry, only %ld suggestions"), | |
544 (long)sug.su_ga.ga_len); | |
545 } | |
546 else | |
547 { | |
548 VIM_CLEAR(repl_from); | |
549 VIM_CLEAR(repl_to); | |
550 | |
551 #ifdef FEAT_RIGHTLEFT | |
552 // When 'rightleft' is set the list is drawn right-left. | |
553 cmdmsg_rl = curwin->w_p_rl; | |
554 if (cmdmsg_rl) | |
555 msg_col = Columns - 1; | |
556 #endif | |
557 | |
558 // List the suggestions. | |
559 msg_start(); | |
560 msg_row = Rows - 1; // for when 'cmdheight' > 1 | |
561 lines_left = Rows; // avoid more prompt | |
562 vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"), | |
563 sug.su_badlen, sug.su_badptr); | |
564 #ifdef FEAT_RIGHTLEFT | |
565 if (cmdmsg_rl && STRNCMP(IObuff, "Change", 6) == 0) | |
566 { | |
567 // And now the rabbit from the high hat: Avoid showing the | |
568 // untranslated message rightleft. | |
569 vim_snprintf((char *)IObuff, IOSIZE, ":ot \"%.*s\" egnahC", | |
570 sug.su_badlen, sug.su_badptr); | |
571 } | |
572 #endif | |
573 msg_puts((char *)IObuff); | |
574 msg_clr_eos(); | |
575 msg_putchar('\n'); | |
576 | |
577 msg_scroll = TRUE; | |
578 for (i = 0; i < sug.su_ga.ga_len; ++i) | |
579 { | |
580 stp = &SUG(sug.su_ga, i); | |
581 | |
582 // The suggested word may replace only part of the bad word, add | |
583 // the not replaced part. | |
584 vim_strncpy(wcopy, stp->st_word, MAXWLEN); | |
585 if (sug.su_badlen > stp->st_orglen) | |
586 vim_strncpy(wcopy + stp->st_wordlen, | |
587 sug.su_badptr + stp->st_orglen, | |
588 sug.su_badlen - stp->st_orglen); | |
589 vim_snprintf((char *)IObuff, IOSIZE, "%2d", i + 1); | |
590 #ifdef FEAT_RIGHTLEFT | |
591 if (cmdmsg_rl) | |
592 rl_mirror(IObuff); | |
593 #endif | |
594 msg_puts((char *)IObuff); | |
595 | |
596 vim_snprintf((char *)IObuff, IOSIZE, " \"%s\"", wcopy); | |
597 msg_puts((char *)IObuff); | |
598 | |
599 // The word may replace more than "su_badlen". | |
600 if (sug.su_badlen < stp->st_orglen) | |
601 { | |
602 vim_snprintf((char *)IObuff, IOSIZE, _(" < \"%.*s\""), | |
603 stp->st_orglen, sug.su_badptr); | |
604 msg_puts((char *)IObuff); | |
605 } | |
606 | |
607 if (p_verbose > 0) | |
608 { | |
609 // Add the score. | |
610 if (sps_flags & (SPS_DOUBLE | SPS_BEST)) | |
611 vim_snprintf((char *)IObuff, IOSIZE, " (%s%d - %d)", | |
612 stp->st_salscore ? "s " : "", | |
613 stp->st_score, stp->st_altscore); | |
614 else | |
615 vim_snprintf((char *)IObuff, IOSIZE, " (%d)", | |
616 stp->st_score); | |
617 #ifdef FEAT_RIGHTLEFT | |
618 if (cmdmsg_rl) | |
619 // Mirror the numbers, but keep the leading space. | |
620 rl_mirror(IObuff + 1); | |
621 #endif | |
622 msg_advance(30); | |
623 msg_puts((char *)IObuff); | |
624 } | |
625 msg_putchar('\n'); | |
626 } | |
627 | |
628 #ifdef FEAT_RIGHTLEFT | |
629 cmdmsg_rl = FALSE; | |
630 msg_col = 0; | |
631 #endif | |
632 // Ask for choice. | |
633 selected = prompt_for_number(&mouse_used); | |
634 if (mouse_used) | |
635 selected -= lines_left; | |
636 lines_left = Rows; // avoid more prompt | |
637 // don't delay for 'smd' in normal_cmd() | |
638 msg_scroll = msg_scroll_save; | |
639 } | |
640 | |
641 if (selected > 0 && selected <= sug.su_ga.ga_len && u_save_cursor() == OK) | |
642 { | |
643 // Save the from and to text for :spellrepall. | |
644 stp = &SUG(sug.su_ga, selected - 1); | |
645 if (sug.su_badlen > stp->st_orglen) | |
646 { | |
647 // Replacing less than "su_badlen", append the remainder to | |
648 // repl_to. | |
649 repl_from = vim_strnsave(sug.su_badptr, sug.su_badlen); | |
650 vim_snprintf((char *)IObuff, IOSIZE, "%s%.*s", stp->st_word, | |
651 sug.su_badlen - stp->st_orglen, | |
652 sug.su_badptr + stp->st_orglen); | |
653 repl_to = vim_strsave(IObuff); | |
654 } | |
655 else | |
656 { | |
657 // Replacing su_badlen or more, use the whole word. | |
658 repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen); | |
659 repl_to = vim_strsave(stp->st_word); | |
660 } | |
661 | |
662 // Replace the word. | |
663 p = alloc(STRLEN(line) - stp->st_orglen + stp->st_wordlen + 1); | |
664 if (p != NULL) | |
665 { | |
666 c = (int)(sug.su_badptr - line); | |
667 mch_memmove(p, line, c); | |
668 STRCPY(p + c, stp->st_word); | |
669 STRCAT(p, sug.su_badptr + stp->st_orglen); | |
670 ml_replace(curwin->w_cursor.lnum, p, FALSE); | |
671 curwin->w_cursor.col = c; | |
672 | |
673 // For redo we use a change-word command. | |
674 ResetRedobuff(); | |
675 AppendToRedobuff((char_u *)"ciw"); | |
676 AppendToRedobuffLit(p + c, | |
677 stp->st_wordlen + sug.su_badlen - stp->st_orglen); | |
678 AppendCharToRedobuff(ESC); | |
679 | |
680 // After this "p" may be invalid. | |
681 changed_bytes(curwin->w_cursor.lnum, c); | |
682 } | |
683 } | |
684 else | |
685 curwin->w_cursor = prev_cursor; | |
686 | |
687 spell_find_cleanup(&sug); | |
688 skip: | |
689 vim_free(line); | |
690 } | |
691 | |
692 /* | |
693 * Find spell suggestions for "word". Return them in the growarray "*gap" as | |
694 * a list of allocated strings. | |
695 */ | |
696 void | |
697 spell_suggest_list( | |
698 garray_T *gap, | |
699 char_u *word, | |
700 int maxcount, // maximum nr of suggestions | |
701 int need_cap, // 'spellcapcheck' matched | |
702 int interactive) | |
703 { | |
704 suginfo_T sug; | |
705 int i; | |
706 suggest_T *stp; | |
707 char_u *wcopy; | |
708 | |
709 spell_find_suggest(word, 0, &sug, maxcount, FALSE, need_cap, interactive); | |
710 | |
711 // Make room in "gap". | |
712 ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1); | |
713 if (ga_grow(gap, sug.su_ga.ga_len) == OK) | |
714 { | |
715 for (i = 0; i < sug.su_ga.ga_len; ++i) | |
716 { | |
717 stp = &SUG(sug.su_ga, i); | |
718 | |
719 // The suggested word may replace only part of "word", add the not | |
720 // replaced part. | |
721 wcopy = alloc(stp->st_wordlen | |
722 + (unsigned)STRLEN(sug.su_badptr + stp->st_orglen) + 1); | |
723 if (wcopy == NULL) | |
724 break; | |
725 STRCPY(wcopy, stp->st_word); | |
726 STRCPY(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen); | |
727 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy; | |
728 } | |
729 } | |
730 | |
731 spell_find_cleanup(&sug); | |
732 } | |
733 | |
734 /* | |
735 * Find spell suggestions for the word at the start of "badptr". | |
736 * Return the suggestions in "su->su_ga". | |
737 * The maximum number of suggestions is "maxcount". | |
738 * Note: does use info for the current window. | |
739 * This is based on the mechanisms of Aspell, but completely reimplemented. | |
740 */ | |
741 static void | |
742 spell_find_suggest( | |
743 char_u *badptr, | |
744 int badlen, // length of bad word or 0 if unknown | |
745 suginfo_T *su, | |
746 int maxcount, | |
747 int banbadword, // don't include badword in suggestions | |
748 int need_cap, // word should start with capital | |
749 int interactive) | |
750 { | |
751 hlf_T attr = HLF_COUNT; | |
752 char_u buf[MAXPATHL]; | |
753 char_u *p; | |
754 int do_combine = FALSE; | |
755 char_u *sps_copy; | |
756 #ifdef FEAT_EVAL | |
757 static int expr_busy = FALSE; | |
758 #endif | |
759 int c; | |
760 int i; | |
761 langp_T *lp; | |
762 | |
763 // Set the info in "*su". | |
764 vim_memset(su, 0, sizeof(suginfo_T)); | |
765 ga_init2(&su->su_ga, (int)sizeof(suggest_T), 10); | |
766 ga_init2(&su->su_sga, (int)sizeof(suggest_T), 10); | |
767 if (*badptr == NUL) | |
768 return; | |
769 hash_init(&su->su_banned); | |
770 | |
771 su->su_badptr = badptr; | |
772 if (badlen != 0) | |
773 su->su_badlen = badlen; | |
774 else | |
775 su->su_badlen = spell_check(curwin, su->su_badptr, &attr, NULL, FALSE); | |
776 su->su_maxcount = maxcount; | |
777 su->su_maxscore = SCORE_MAXINIT; | |
778 | |
779 if (su->su_badlen >= MAXWLEN) | |
780 su->su_badlen = MAXWLEN - 1; // just in case | |
781 vim_strncpy(su->su_badword, su->su_badptr, su->su_badlen); | |
782 (void)spell_casefold(su->su_badptr, su->su_badlen, | |
783 su->su_fbadword, MAXWLEN); | |
784 // TODO: make this work if the case-folded text is longer than the original | |
785 // text. Currently an illegal byte causes wrong pointer computations. | |
786 su->su_fbadword[su->su_badlen] = NUL; | |
787 | |
788 // get caps flags for bad word | |
789 su->su_badflags = badword_captype(su->su_badptr, | |
790 su->su_badptr + su->su_badlen); | |
791 if (need_cap) | |
792 su->su_badflags |= WF_ONECAP; | |
793 | |
794 // Find the default language for sound folding. We simply use the first | |
795 // one in 'spelllang' that supports sound folding. That's good for when | |
796 // using multiple files for one language, it's not that bad when mixing | |
797 // languages (e.g., "pl,en"). | |
798 for (i = 0; i < curbuf->b_s.b_langp.ga_len; ++i) | |
799 { | |
800 lp = LANGP_ENTRY(curbuf->b_s.b_langp, i); | |
801 if (lp->lp_sallang != NULL) | |
802 { | |
803 su->su_sallang = lp->lp_sallang; | |
804 break; | |
805 } | |
806 } | |
807 | |
808 // Soundfold the bad word with the default sound folding, so that we don't | |
809 // have to do this many times. | |
810 if (su->su_sallang != NULL) | |
811 spell_soundfold(su->su_sallang, su->su_fbadword, TRUE, | |
812 su->su_sal_badword); | |
813 | |
814 // If the word is not capitalised and spell_check() doesn't consider the | |
815 // word to be bad then it might need to be capitalised. Add a suggestion | |
816 // for that. | |
817 c = PTR2CHAR(su->su_badptr); | |
818 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT) | |
819 { | |
820 make_case_word(su->su_badword, buf, WF_ONECAP); | |
821 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE, | |
822 0, TRUE, su->su_sallang, FALSE); | |
823 } | |
824 | |
825 // Ban the bad word itself. It may appear in another region. | |
826 if (banbadword) | |
827 add_banned(su, su->su_badword); | |
828 | |
829 // Make a copy of 'spellsuggest', because the expression may change it. | |
830 sps_copy = vim_strsave(p_sps); | |
831 if (sps_copy == NULL) | |
832 return; | |
833 | |
834 // Loop over the items in 'spellsuggest'. | |
835 for (p = sps_copy; *p != NUL; ) | |
836 { | |
837 copy_option_part(&p, buf, MAXPATHL, ","); | |
838 | |
839 if (STRNCMP(buf, "expr:", 5) == 0) | |
840 { | |
841 #ifdef FEAT_EVAL | |
842 // Evaluate an expression. Skip this when called recursively, | |
843 // when using spellsuggest() in the expression. | |
844 if (!expr_busy) | |
845 { | |
846 expr_busy = TRUE; | |
847 spell_suggest_expr(su, buf + 5); | |
848 expr_busy = FALSE; | |
849 } | |
850 #endif | |
851 } | |
852 else if (STRNCMP(buf, "file:", 5) == 0) | |
853 // Use list of suggestions in a file. | |
854 spell_suggest_file(su, buf + 5); | |
855 else | |
856 { | |
857 // Use internal method. | |
858 spell_suggest_intern(su, interactive); | |
859 if (sps_flags & SPS_DOUBLE) | |
860 do_combine = TRUE; | |
861 } | |
862 } | |
863 | |
864 vim_free(sps_copy); | |
865 | |
866 if (do_combine) | |
867 // Combine the two list of suggestions. This must be done last, | |
868 // because sorting changes the order again. | |
869 score_combine(su); | |
870 } | |
871 | |
872 #ifdef FEAT_EVAL | |
873 /* | |
874 * Find suggestions by evaluating expression "expr". | |
875 */ | |
876 static void | |
877 spell_suggest_expr(suginfo_T *su, char_u *expr) | |
878 { | |
879 list_T *list; | |
880 listitem_T *li; | |
881 int score; | |
882 char_u *p; | |
883 | |
884 // The work is split up in a few parts to avoid having to export | |
885 // suginfo_T. | |
886 // First evaluate the expression and get the resulting list. | |
887 list = eval_spell_expr(su->su_badword, expr); | |
888 if (list != NULL) | |
889 { | |
890 // Loop over the items in the list. | |
891 for (li = list->lv_first; li != NULL; li = li->li_next) | |
892 if (li->li_tv.v_type == VAR_LIST) | |
893 { | |
894 // Get the word and the score from the items. | |
895 score = get_spellword(li->li_tv.vval.v_list, &p); | |
896 if (score >= 0 && score <= su->su_maxscore) | |
897 add_suggestion(su, &su->su_ga, p, su->su_badlen, | |
898 score, 0, TRUE, su->su_sallang, FALSE); | |
899 } | |
900 list_unref(list); | |
901 } | |
902 | |
903 // Remove bogus suggestions, sort and truncate at "maxcount". | |
904 check_suggestions(su, &su->su_ga); | |
905 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); | |
906 } | |
907 #endif | |
908 | |
909 /* | |
910 * Find suggestions in file "fname". Used for "file:" in 'spellsuggest'. | |
911 */ | |
912 static void | |
913 spell_suggest_file(suginfo_T *su, char_u *fname) | |
914 { | |
915 FILE *fd; | |
916 char_u line[MAXWLEN * 2]; | |
917 char_u *p; | |
918 int len; | |
919 char_u cword[MAXWLEN]; | |
920 | |
921 // Open the file. | |
922 fd = mch_fopen((char *)fname, "r"); | |
923 if (fd == NULL) | |
924 { | |
925 semsg(_(e_notopen), fname); | |
926 return; | |
927 } | |
928 | |
929 // Read it line by line. | |
930 while (!vim_fgets(line, MAXWLEN * 2, fd) && !got_int) | |
931 { | |
932 line_breakcheck(); | |
933 | |
934 p = vim_strchr(line, '/'); | |
935 if (p == NULL) | |
936 continue; // No Tab found, just skip the line. | |
937 *p++ = NUL; | |
938 if (STRICMP(su->su_badword, line) == 0) | |
939 { | |
940 // Match! Isolate the good word, until CR or NL. | |
941 for (len = 0; p[len] >= ' '; ++len) | |
942 ; | |
943 p[len] = NUL; | |
944 | |
945 // If the suggestion doesn't have specific case duplicate the case | |
946 // of the bad word. | |
947 if (captype(p, NULL) == 0) | |
948 { | |
949 make_case_word(p, cword, su->su_badflags); | |
950 p = cword; | |
951 } | |
952 | |
953 add_suggestion(su, &su->su_ga, p, su->su_badlen, | |
954 SCORE_FILE, 0, TRUE, su->su_sallang, FALSE); | |
955 } | |
956 } | |
957 | |
958 fclose(fd); | |
959 | |
960 // Remove bogus suggestions, sort and truncate at "maxcount". | |
961 check_suggestions(su, &su->su_ga); | |
962 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); | |
963 } | |
964 | |
965 /* | |
966 * Find suggestions for the internal method indicated by "sps_flags". | |
967 */ | |
968 static void | |
969 spell_suggest_intern(suginfo_T *su, int interactive) | |
970 { | |
971 // Load the .sug file(s) that are available and not done yet. | |
972 suggest_load_files(); | |
973 | |
974 // 1. Try special cases, such as repeating a word: "the the" -> "the". | |
975 // | |
976 // Set a maximum score to limit the combination of operations that is | |
977 // tried. | |
978 suggest_try_special(su); | |
979 | |
980 // 2. Try inserting/deleting/swapping/changing a letter, use REP entries | |
981 // from the .aff file and inserting a space (split the word). | |
982 suggest_try_change(su); | |
983 | |
984 // For the resulting top-scorers compute the sound-a-like score. | |
985 if (sps_flags & SPS_DOUBLE) | |
986 score_comp_sal(su); | |
987 | |
988 // 3. Try finding sound-a-like words. | |
989 if ((sps_flags & SPS_FAST) == 0) | |
990 { | |
991 if (sps_flags & SPS_BEST) | |
992 // Adjust the word score for the suggestions found so far for how | |
993 // they sounds like. | |
994 rescore_suggestions(su); | |
995 | |
996 // While going through the soundfold tree "su_maxscore" is the score | |
997 // for the soundfold word, limits the changes that are being tried, | |
998 // and "su_sfmaxscore" the rescored score, which is set by | |
999 // cleanup_suggestions(). | |
1000 // First find words with a small edit distance, because this is much | |
1001 // faster and often already finds the top-N suggestions. If we didn't | |
1002 // find many suggestions try again with a higher edit distance. | |
1003 // "sl_sounddone" is used to avoid doing the same word twice. | |
1004 suggest_try_soundalike_prep(); | |
1005 su->su_maxscore = SCORE_SFMAX1; | |
1006 su->su_sfmaxscore = SCORE_MAXINIT * 3; | |
1007 suggest_try_soundalike(su); | |
1008 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) | |
1009 { | |
1010 // We didn't find enough matches, try again, allowing more | |
1011 // changes to the soundfold word. | |
1012 su->su_maxscore = SCORE_SFMAX2; | |
1013 suggest_try_soundalike(su); | |
1014 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) | |
1015 { | |
1016 // Still didn't find enough matches, try again, allowing even | |
1017 // more changes to the soundfold word. | |
1018 su->su_maxscore = SCORE_SFMAX3; | |
1019 suggest_try_soundalike(su); | |
1020 } | |
1021 } | |
1022 su->su_maxscore = su->su_sfmaxscore; | |
1023 suggest_try_soundalike_finish(); | |
1024 } | |
1025 | |
1026 // When CTRL-C was hit while searching do show the results. Only clear | |
1027 // got_int when using a command, not for spellsuggest(). | |
1028 ui_breakcheck(); | |
1029 if (interactive && got_int) | |
1030 { | |
1031 (void)vgetc(); | |
1032 got_int = FALSE; | |
1033 } | |
1034 | |
1035 if ((sps_flags & SPS_DOUBLE) == 0 && su->su_ga.ga_len != 0) | |
1036 { | |
1037 if (sps_flags & SPS_BEST) | |
1038 // Adjust the word score for how it sounds like. | |
1039 rescore_suggestions(su); | |
1040 | |
1041 // Remove bogus suggestions, sort and truncate at "maxcount". | |
1042 check_suggestions(su, &su->su_ga); | |
1043 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); | |
1044 } | |
1045 } | |
1046 | |
1047 /* | |
1048 * Free the info put in "*su" by spell_find_suggest(). | |
1049 */ | |
1050 static void | |
1051 spell_find_cleanup(suginfo_T *su) | |
1052 { | |
1053 int i; | |
1054 | |
1055 // Free the suggestions. | |
1056 for (i = 0; i < su->su_ga.ga_len; ++i) | |
1057 vim_free(SUG(su->su_ga, i).st_word); | |
1058 ga_clear(&su->su_ga); | |
1059 for (i = 0; i < su->su_sga.ga_len; ++i) | |
1060 vim_free(SUG(su->su_sga, i).st_word); | |
1061 ga_clear(&su->su_sga); | |
1062 | |
1063 // Free the banned words. | |
1064 hash_clear_all(&su->su_banned, 0); | |
1065 } | |
1066 | |
1067 /* | |
1068 * Try finding suggestions by recognizing specific situations. | |
1069 */ | |
1070 static void | |
1071 suggest_try_special(suginfo_T *su) | |
1072 { | |
1073 char_u *p; | |
1074 size_t len; | |
1075 int c; | |
1076 char_u word[MAXWLEN]; | |
1077 | |
1078 // Recognize a word that is repeated: "the the". | |
1079 p = skiptowhite(su->su_fbadword); | |
1080 len = p - su->su_fbadword; | |
1081 p = skipwhite(p); | |
1082 if (STRLEN(p) == len && STRNCMP(su->su_fbadword, p, len) == 0) | |
1083 { | |
1084 // Include badflags: if the badword is onecap or allcap | |
1085 // use that for the goodword too: "The the" -> "The". | |
1086 c = su->su_fbadword[len]; | |
1087 su->su_fbadword[len] = NUL; | |
1088 make_case_word(su->su_fbadword, word, su->su_badflags); | |
1089 su->su_fbadword[len] = c; | |
1090 | |
1091 // Give a soundalike score of 0, compute the score as if deleting one | |
1092 // character. | |
1093 add_suggestion(su, &su->su_ga, word, su->su_badlen, | |
1094 RESCORE(SCORE_REP, 0), 0, TRUE, su->su_sallang, FALSE); | |
1095 } | |
1096 } | |
1097 | |
1098 /* | |
1099 * Change the 0 to 1 to measure how much time is spent in each state. | |
1100 * Output is dumped in "suggestprof". | |
1101 */ | |
1102 #if 0 | |
1103 # define SUGGEST_PROFILE | |
1104 proftime_T current; | |
1105 proftime_T total; | |
1106 proftime_T times[STATE_FINAL + 1]; | |
1107 long counts[STATE_FINAL + 1]; | |
1108 | |
1109 static void | |
1110 prof_init(void) | |
1111 { | |
1112 for (int i = 0; i <= STATE_FINAL; ++i) | |
1113 { | |
1114 profile_zero(×[i]); | |
1115 counts[i] = 0; | |
1116 } | |
1117 profile_start(¤t); | |
1118 profile_start(&total); | |
1119 } | |
1120 | |
1121 // call before changing state | |
1122 static void | |
1123 prof_store(state_T state) | |
1124 { | |
1125 profile_end(¤t); | |
1126 profile_add(×[state], ¤t); | |
1127 ++counts[state]; | |
1128 profile_start(¤t); | |
1129 } | |
1130 # define PROF_STORE(state) prof_store(state); | |
1131 | |
1132 static void | |
1133 prof_report(char *name) | |
1134 { | |
1135 FILE *fd = fopen("suggestprof", "a"); | |
1136 | |
1137 profile_end(&total); | |
1138 fprintf(fd, "-----------------------\n"); | |
1139 fprintf(fd, "%s: %s\n", name, profile_msg(&total)); | |
1140 for (int i = 0; i <= STATE_FINAL; ++i) | |
1141 fprintf(fd, "%d: %s (%ld)\n", i, profile_msg(×[i]), counts[i]); | |
1142 fclose(fd); | |
1143 } | |
1144 #else | |
1145 # define PROF_STORE(state) | |
1146 #endif | |
1147 | |
1148 /* | |
1149 * Try finding suggestions by adding/removing/swapping letters. | |
1150 */ | |
1151 static void | |
1152 suggest_try_change(suginfo_T *su) | |
1153 { | |
1154 char_u fword[MAXWLEN]; // copy of the bad word, case-folded | |
1155 int n; | |
1156 char_u *p; | |
1157 int lpi; | |
1158 langp_T *lp; | |
1159 | |
1160 // We make a copy of the case-folded bad word, so that we can modify it | |
1161 // to find matches (esp. REP items). Append some more text, changing | |
1162 // chars after the bad word may help. | |
1163 STRCPY(fword, su->su_fbadword); | |
1164 n = (int)STRLEN(fword); | |
1165 p = su->su_badptr + su->su_badlen; | |
1166 (void)spell_casefold(p, (int)STRLEN(p), fword + n, MAXWLEN - n); | |
1167 | |
1168 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) | |
1169 { | |
1170 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); | |
1171 | |
1172 // If reloading a spell file fails it's still in the list but | |
1173 // everything has been cleared. | |
1174 if (lp->lp_slang->sl_fbyts == NULL) | |
1175 continue; | |
1176 | |
1177 // Try it for this language. Will add possible suggestions. | |
1178 #ifdef SUGGEST_PROFILE | |
1179 prof_init(); | |
1180 #endif | |
1181 suggest_trie_walk(su, lp, fword, FALSE); | |
1182 #ifdef SUGGEST_PROFILE | |
1183 prof_report("try_change"); | |
1184 #endif | |
1185 } | |
1186 } | |
1187 | |
1188 // Check the maximum score, if we go over it we won't try this change. | |
1189 #define TRY_DEEPER(su, stack, depth, add) \ | |
1190 (stack[depth].ts_score + (add) < su->su_maxscore) | |
1191 | |
1192 /* | |
1193 * Try finding suggestions by adding/removing/swapping letters. | |
1194 * | |
1195 * This uses a state machine. At each node in the tree we try various | |
1196 * operations. When trying if an operation works "depth" is increased and the | |
1197 * stack[] is used to store info. This allows combinations, thus insert one | |
1198 * character, replace one and delete another. The number of changes is | |
1199 * limited by su->su_maxscore. | |
1200 * | |
1201 * After implementing this I noticed an article by Kemal Oflazer that | |
1202 * describes something similar: "Error-tolerant Finite State Recognition with | |
1203 * Applications to Morphological Analysis and Spelling Correction" (1996). | |
1204 * The implementation in the article is simplified and requires a stack of | |
1205 * unknown depth. The implementation here only needs a stack depth equal to | |
1206 * the length of the word. | |
1207 * | |
1208 * This is also used for the sound-folded word, "soundfold" is TRUE then. | |
1209 * The mechanism is the same, but we find a match with a sound-folded word | |
1210 * that comes from one or more original words. Each of these words may be | |
1211 * added, this is done by add_sound_suggest(). | |
1212 * Don't use: | |
1213 * the prefix tree or the keep-case tree | |
1214 * "su->su_badlen" | |
1215 * anything to do with upper and lower case | |
1216 * anything to do with word or non-word characters ("spell_iswordp()") | |
1217 * banned words | |
1218 * word flags (rare, region, compounding) | |
1219 * word splitting for now | |
1220 * "similar_chars()" | |
1221 * use "slang->sl_repsal" instead of "lp->lp_replang->sl_rep" | |
1222 */ | |
1223 static void | |
1224 suggest_trie_walk( | |
1225 suginfo_T *su, | |
1226 langp_T *lp, | |
1227 char_u *fword, | |
1228 int soundfold) | |
1229 { | |
1230 char_u tword[MAXWLEN]; // good word collected so far | |
1231 trystate_T stack[MAXWLEN]; | |
1232 char_u preword[MAXWLEN * 3]; // word found with proper case; | |
1233 // concatenation of prefix compound | |
1234 // words and split word. NUL terminated | |
1235 // when going deeper but not when coming | |
1236 // back. | |
1237 char_u compflags[MAXWLEN]; // compound flags, one for each word | |
1238 trystate_T *sp; | |
1239 int newscore; | |
1240 int score; | |
1241 char_u *byts, *fbyts, *pbyts; | |
1242 idx_T *idxs, *fidxs, *pidxs; | |
1243 int depth; | |
1244 int c, c2, c3; | |
1245 int n = 0; | |
1246 int flags; | |
1247 garray_T *gap; | |
1248 idx_T arridx; | |
1249 int len; | |
1250 char_u *p; | |
1251 fromto_T *ftp; | |
1252 int fl = 0, tl; | |
1253 int repextra = 0; // extra bytes in fword[] from REP item | |
1254 slang_T *slang = lp->lp_slang; | |
1255 int fword_ends; | |
1256 int goodword_ends; | |
1257 #ifdef DEBUG_TRIEWALK | |
1258 // Stores the name of the change made at each level. | |
1259 char_u changename[MAXWLEN][80]; | |
1260 #endif | |
1261 int breakcheckcount = 1000; | |
1262 int compound_ok; | |
1263 | |
1264 // Go through the whole case-fold tree, try changes at each node. | |
1265 // "tword[]" contains the word collected from nodes in the tree. | |
1266 // "fword[]" the word we are trying to match with (initially the bad | |
1267 // word). | |
1268 depth = 0; | |
1269 sp = &stack[0]; | |
1270 vim_memset(sp, 0, sizeof(trystate_T)); | |
1271 sp->ts_curi = 1; | |
1272 | |
1273 if (soundfold) | |
1274 { | |
1275 // Going through the soundfold tree. | |
1276 byts = fbyts = slang->sl_sbyts; | |
1277 idxs = fidxs = slang->sl_sidxs; | |
1278 pbyts = NULL; | |
1279 pidxs = NULL; | |
1280 sp->ts_prefixdepth = PFD_NOPREFIX; | |
1281 sp->ts_state = STATE_START; | |
1282 } | |
1283 else | |
1284 { | |
1285 // When there are postponed prefixes we need to use these first. At | |
1286 // the end of the prefix we continue in the case-fold tree. | |
1287 fbyts = slang->sl_fbyts; | |
1288 fidxs = slang->sl_fidxs; | |
1289 pbyts = slang->sl_pbyts; | |
1290 pidxs = slang->sl_pidxs; | |
1291 if (pbyts != NULL) | |
1292 { | |
1293 byts = pbyts; | |
1294 idxs = pidxs; | |
1295 sp->ts_prefixdepth = PFD_PREFIXTREE; | |
1296 sp->ts_state = STATE_NOPREFIX; // try without prefix first | |
1297 } | |
1298 else | |
1299 { | |
1300 byts = fbyts; | |
1301 idxs = fidxs; | |
1302 sp->ts_prefixdepth = PFD_NOPREFIX; | |
1303 sp->ts_state = STATE_START; | |
1304 } | |
1305 } | |
1306 | |
1307 // Loop to find all suggestions. At each round we either: | |
1308 // - For the current state try one operation, advance "ts_curi", | |
1309 // increase "depth". | |
1310 // - When a state is done go to the next, set "ts_state". | |
1311 // - When all states are tried decrease "depth". | |
1312 while (depth >= 0 && !got_int) | |
1313 { | |
1314 sp = &stack[depth]; | |
1315 switch (sp->ts_state) | |
1316 { | |
1317 case STATE_START: | |
1318 case STATE_NOPREFIX: | |
1319 // Start of node: Deal with NUL bytes, which means | |
1320 // tword[] may end here. | |
1321 arridx = sp->ts_arridx; // current node in the tree | |
1322 len = byts[arridx]; // bytes in this node | |
1323 arridx += sp->ts_curi; // index of current byte | |
1324 | |
1325 if (sp->ts_prefixdepth == PFD_PREFIXTREE) | |
1326 { | |
1327 // Skip over the NUL bytes, we use them later. | |
1328 for (n = 0; n < len && byts[arridx + n] == 0; ++n) | |
1329 ; | |
1330 sp->ts_curi += n; | |
1331 | |
1332 // Always past NUL bytes now. | |
1333 n = (int)sp->ts_state; | |
1334 PROF_STORE(sp->ts_state) | |
1335 sp->ts_state = STATE_ENDNUL; | |
1336 sp->ts_save_badflags = su->su_badflags; | |
1337 | |
1338 // At end of a prefix or at start of prefixtree: check for | |
1339 // following word. | |
1340 if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX) | |
1341 { | |
1342 // Set su->su_badflags to the caps type at this position. | |
1343 // Use the caps type until here for the prefix itself. | |
1344 if (has_mbyte) | |
1345 n = nofold_len(fword, sp->ts_fidx, su->su_badptr); | |
1346 else | |
1347 n = sp->ts_fidx; | |
1348 flags = badword_captype(su->su_badptr, su->su_badptr + n); | |
1349 su->su_badflags = badword_captype(su->su_badptr + n, | |
1350 su->su_badptr + su->su_badlen); | |
1351 #ifdef DEBUG_TRIEWALK | |
1352 sprintf(changename[depth], "prefix"); | |
1353 #endif | |
1354 go_deeper(stack, depth, 0); | |
1355 ++depth; | |
1356 sp = &stack[depth]; | |
1357 sp->ts_prefixdepth = depth - 1; | |
1358 byts = fbyts; | |
1359 idxs = fidxs; | |
1360 sp->ts_arridx = 0; | |
1361 | |
1362 // Move the prefix to preword[] with the right case | |
1363 // and make find_keepcap_word() works. | |
1364 tword[sp->ts_twordlen] = NUL; | |
1365 make_case_word(tword + sp->ts_splitoff, | |
1366 preword + sp->ts_prewordlen, flags); | |
1367 sp->ts_prewordlen = (char_u)STRLEN(preword); | |
1368 sp->ts_splitoff = sp->ts_twordlen; | |
1369 } | |
1370 break; | |
1371 } | |
1372 | |
1373 if (sp->ts_curi > len || byts[arridx] != 0) | |
1374 { | |
1375 // Past bytes in node and/or past NUL bytes. | |
1376 PROF_STORE(sp->ts_state) | |
1377 sp->ts_state = STATE_ENDNUL; | |
1378 sp->ts_save_badflags = su->su_badflags; | |
1379 break; | |
1380 } | |
1381 | |
1382 // End of word in tree. | |
1383 ++sp->ts_curi; // eat one NUL byte | |
1384 | |
1385 flags = (int)idxs[arridx]; | |
1386 | |
1387 // Skip words with the NOSUGGEST flag. | |
1388 if (flags & WF_NOSUGGEST) | |
1389 break; | |
1390 | |
1391 fword_ends = (fword[sp->ts_fidx] == NUL | |
1392 || (soundfold | |
1393 ? VIM_ISWHITE(fword[sp->ts_fidx]) | |
1394 : !spell_iswordp(fword + sp->ts_fidx, curwin))); | |
1395 tword[sp->ts_twordlen] = NUL; | |
1396 | |
1397 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL | |
1398 && (sp->ts_flags & TSF_PREFIXOK) == 0) | |
1399 { | |
1400 // There was a prefix before the word. Check that the prefix | |
1401 // can be used with this word. | |
1402 // Count the length of the NULs in the prefix. If there are | |
1403 // none this must be the first try without a prefix. | |
1404 n = stack[sp->ts_prefixdepth].ts_arridx; | |
1405 len = pbyts[n++]; | |
1406 for (c = 0; c < len && pbyts[n + c] == 0; ++c) | |
1407 ; | |
1408 if (c > 0) | |
1409 { | |
1410 c = valid_word_prefix(c, n, flags, | |
1411 tword + sp->ts_splitoff, slang, FALSE); | |
1412 if (c == 0) | |
1413 break; | |
1414 | |
1415 // Use the WF_RARE flag for a rare prefix. | |
1416 if (c & WF_RAREPFX) | |
1417 flags |= WF_RARE; | |
1418 | |
1419 // Tricky: when checking for both prefix and compounding | |
1420 // we run into the prefix flag first. | |
1421 // Remember that it's OK, so that we accept the prefix | |
1422 // when arriving at a compound flag. | |
1423 sp->ts_flags |= TSF_PREFIXOK; | |
1424 } | |
1425 } | |
1426 | |
1427 // Check NEEDCOMPOUND: can't use word without compounding. Do try | |
1428 // appending another compound word below. | |
1429 if (sp->ts_complen == sp->ts_compsplit && fword_ends | |
1430 && (flags & WF_NEEDCOMP)) | |
1431 goodword_ends = FALSE; | |
1432 else | |
1433 goodword_ends = TRUE; | |
1434 | |
1435 p = NULL; | |
1436 compound_ok = TRUE; | |
1437 if (sp->ts_complen > sp->ts_compsplit) | |
1438 { | |
1439 if (slang->sl_nobreak) | |
1440 { | |
1441 // There was a word before this word. When there was no | |
1442 // change in this word (it was correct) add the first word | |
1443 // as a suggestion. If this word was corrected too, we | |
1444 // need to check if a correct word follows. | |
1445 if (sp->ts_fidx - sp->ts_splitfidx | |
1446 == sp->ts_twordlen - sp->ts_splitoff | |
1447 && STRNCMP(fword + sp->ts_splitfidx, | |
1448 tword + sp->ts_splitoff, | |
1449 sp->ts_fidx - sp->ts_splitfidx) == 0) | |
1450 { | |
1451 preword[sp->ts_prewordlen] = NUL; | |
1452 newscore = score_wordcount_adj(slang, sp->ts_score, | |
1453 preword + sp->ts_prewordlen, | |
1454 sp->ts_prewordlen > 0); | |
1455 // Add the suggestion if the score isn't too bad. | |
1456 if (newscore <= su->su_maxscore) | |
1457 add_suggestion(su, &su->su_ga, preword, | |
1458 sp->ts_splitfidx - repextra, | |
1459 newscore, 0, FALSE, | |
1460 lp->lp_sallang, FALSE); | |
1461 break; | |
1462 } | |
1463 } | |
1464 else | |
1465 { | |
1466 // There was a compound word before this word. If this | |
1467 // word does not support compounding then give up | |
1468 // (splitting is tried for the word without compound | |
1469 // flag). | |
1470 if (((unsigned)flags >> 24) == 0 | |
1471 || sp->ts_twordlen - sp->ts_splitoff | |
1472 < slang->sl_compminlen) | |
1473 break; | |
1474 // For multi-byte chars check character length against | |
1475 // COMPOUNDMIN. | |
1476 if (has_mbyte | |
1477 && slang->sl_compminlen > 0 | |
1478 && mb_charlen(tword + sp->ts_splitoff) | |
1479 < slang->sl_compminlen) | |
1480 break; | |
1481 | |
1482 compflags[sp->ts_complen] = ((unsigned)flags >> 24); | |
1483 compflags[sp->ts_complen + 1] = NUL; | |
1484 vim_strncpy(preword + sp->ts_prewordlen, | |
1485 tword + sp->ts_splitoff, | |
1486 sp->ts_twordlen - sp->ts_splitoff); | |
1487 | |
1488 // Verify CHECKCOMPOUNDPATTERN rules. | |
1489 if (match_checkcompoundpattern(preword, sp->ts_prewordlen, | |
1490 &slang->sl_comppat)) | |
1491 compound_ok = FALSE; | |
1492 | |
1493 if (compound_ok) | |
1494 { | |
1495 p = preword; | |
1496 while (*skiptowhite(p) != NUL) | |
1497 p = skipwhite(skiptowhite(p)); | |
1498 if (fword_ends && !can_compound(slang, p, | |
1499 compflags + sp->ts_compsplit)) | |
1500 // Compound is not allowed. But it may still be | |
1501 // possible if we add another (short) word. | |
1502 compound_ok = FALSE; | |
1503 } | |
1504 | |
1505 // Get pointer to last char of previous word. | |
1506 p = preword + sp->ts_prewordlen; | |
1507 MB_PTR_BACK(preword, p); | |
1508 } | |
1509 } | |
1510 | |
1511 // Form the word with proper case in preword. | |
1512 // If there is a word from a previous split, append. | |
1513 // For the soundfold tree don't change the case, simply append. | |
1514 if (soundfold) | |
1515 STRCPY(preword + sp->ts_prewordlen, tword + sp->ts_splitoff); | |
1516 else if (flags & WF_KEEPCAP) | |
1517 // Must find the word in the keep-case tree. | |
1518 find_keepcap_word(slang, tword + sp->ts_splitoff, | |
1519 preword + sp->ts_prewordlen); | |
1520 else | |
1521 { | |
1522 // Include badflags: If the badword is onecap or allcap | |
1523 // use that for the goodword too. But if the badword is | |
1524 // allcap and it's only one char long use onecap. | |
1525 c = su->su_badflags; | |
1526 if ((c & WF_ALLCAP) | |
1527 && su->su_badlen == (*mb_ptr2len)(su->su_badptr)) | |
1528 c = WF_ONECAP; | |
1529 c |= flags; | |
1530 | |
1531 // When appending a compound word after a word character don't | |
1532 // use Onecap. | |
1533 if (p != NULL && spell_iswordp_nmw(p, curwin)) | |
1534 c &= ~WF_ONECAP; | |
1535 make_case_word(tword + sp->ts_splitoff, | |
1536 preword + sp->ts_prewordlen, c); | |
1537 } | |
1538 | |
1539 if (!soundfold) | |
1540 { | |
1541 // Don't use a banned word. It may appear again as a good | |
1542 // word, thus remember it. | |
1543 if (flags & WF_BANNED) | |
1544 { | |
1545 add_banned(su, preword + sp->ts_prewordlen); | |
1546 break; | |
1547 } | |
1548 if ((sp->ts_complen == sp->ts_compsplit | |
1549 && WAS_BANNED(su, preword + sp->ts_prewordlen)) | |
1550 || WAS_BANNED(su, preword)) | |
1551 { | |
1552 if (slang->sl_compprog == NULL) | |
1553 break; | |
1554 // the word so far was banned but we may try compounding | |
1555 goodword_ends = FALSE; | |
1556 } | |
1557 } | |
1558 | |
1559 newscore = 0; | |
1560 if (!soundfold) // soundfold words don't have flags | |
1561 { | |
1562 if ((flags & WF_REGION) | |
1563 && (((unsigned)flags >> 16) & lp->lp_region) == 0) | |
1564 newscore += SCORE_REGION; | |
1565 if (flags & WF_RARE) | |
1566 newscore += SCORE_RARE; | |
1567 | |
1568 if (!spell_valid_case(su->su_badflags, | |
1569 captype(preword + sp->ts_prewordlen, NULL))) | |
1570 newscore += SCORE_ICASE; | |
1571 } | |
1572 | |
1573 // TODO: how about splitting in the soundfold tree? | |
1574 if (fword_ends | |
1575 && goodword_ends | |
1576 && sp->ts_fidx >= sp->ts_fidxtry | |
1577 && compound_ok) | |
1578 { | |
1579 // The badword also ends: add suggestions. | |
1580 #ifdef DEBUG_TRIEWALK | |
1581 if (soundfold && STRCMP(preword, "smwrd") == 0) | |
1582 { | |
1583 int j; | |
1584 | |
1585 // print the stack of changes that brought us here | |
1586 smsg("------ %s -------", fword); | |
1587 for (j = 0; j < depth; ++j) | |
1588 smsg("%s", changename[j]); | |
1589 } | |
1590 #endif | |
1591 if (soundfold) | |
1592 { | |
1593 // For soundfolded words we need to find the original | |
1594 // words, the edit distance and then add them. | |
1595 add_sound_suggest(su, preword, sp->ts_score, lp); | |
1596 } | |
1597 else if (sp->ts_fidx > 0) | |
1598 { | |
1599 // Give a penalty when changing non-word char to word | |
1600 // char, e.g., "thes," -> "these". | |
1601 p = fword + sp->ts_fidx; | |
1602 MB_PTR_BACK(fword, p); | |
1603 if (!spell_iswordp(p, curwin)) | |
1604 { | |
1605 p = preword + STRLEN(preword); | |
1606 MB_PTR_BACK(preword, p); | |
1607 if (spell_iswordp(p, curwin)) | |
1608 newscore += SCORE_NONWORD; | |
1609 } | |
1610 | |
1611 // Give a bonus to words seen before. | |
1612 score = score_wordcount_adj(slang, | |
1613 sp->ts_score + newscore, | |
1614 preword + sp->ts_prewordlen, | |
1615 sp->ts_prewordlen > 0); | |
1616 | |
1617 // Add the suggestion if the score isn't too bad. | |
1618 if (score <= su->su_maxscore) | |
1619 { | |
1620 add_suggestion(su, &su->su_ga, preword, | |
1621 sp->ts_fidx - repextra, | |
1622 score, 0, FALSE, lp->lp_sallang, FALSE); | |
1623 | |
1624 if (su->su_badflags & WF_MIXCAP) | |
1625 { | |
1626 // We really don't know if the word should be | |
1627 // upper or lower case, add both. | |
1628 c = captype(preword, NULL); | |
1629 if (c == 0 || c == WF_ALLCAP) | |
1630 { | |
1631 make_case_word(tword + sp->ts_splitoff, | |
1632 preword + sp->ts_prewordlen, | |
1633 c == 0 ? WF_ALLCAP : 0); | |
1634 | |
1635 add_suggestion(su, &su->su_ga, preword, | |
1636 sp->ts_fidx - repextra, | |
1637 score + SCORE_ICASE, 0, FALSE, | |
1638 lp->lp_sallang, FALSE); | |
1639 } | |
1640 } | |
1641 } | |
1642 } | |
1643 } | |
1644 | |
1645 // Try word split and/or compounding. | |
1646 if ((sp->ts_fidx >= sp->ts_fidxtry || fword_ends) | |
1647 // Don't split halfway a character. | |
1648 && (!has_mbyte || sp->ts_tcharlen == 0)) | |
1649 { | |
1650 int try_compound; | |
1651 int try_split; | |
1652 | |
1653 // If past the end of the bad word don't try a split. | |
1654 // Otherwise try changing the next word. E.g., find | |
1655 // suggestions for "the the" where the second "the" is | |
1656 // different. It's done like a split. | |
1657 // TODO: word split for soundfold words | |
1658 try_split = (sp->ts_fidx - repextra < su->su_badlen) | |
1659 && !soundfold; | |
1660 | |
1661 // Get here in several situations: | |
1662 // 1. The word in the tree ends: | |
1663 // If the word allows compounding try that. Otherwise try | |
1664 // a split by inserting a space. For both check that a | |
1665 // valid words starts at fword[sp->ts_fidx]. | |
1666 // For NOBREAK do like compounding to be able to check if | |
1667 // the next word is valid. | |
1668 // 2. The badword does end, but it was due to a change (e.g., | |
1669 // a swap). No need to split, but do check that the | |
1670 // following word is valid. | |
1671 // 3. The badword and the word in the tree end. It may still | |
1672 // be possible to compound another (short) word. | |
1673 try_compound = FALSE; | |
1674 if (!soundfold | |
1675 && !slang->sl_nocompoundsugs | |
1676 && slang->sl_compprog != NULL | |
1677 && ((unsigned)flags >> 24) != 0 | |
1678 && sp->ts_twordlen - sp->ts_splitoff | |
1679 >= slang->sl_compminlen | |
1680 && (!has_mbyte | |
1681 || slang->sl_compminlen == 0 | |
1682 || mb_charlen(tword + sp->ts_splitoff) | |
1683 >= slang->sl_compminlen) | |
1684 && (slang->sl_compsylmax < MAXWLEN | |
1685 || sp->ts_complen + 1 - sp->ts_compsplit | |
1686 < slang->sl_compmax) | |
1687 && (can_be_compound(sp, slang, | |
1688 compflags, ((unsigned)flags >> 24)))) | |
1689 | |
1690 { | |
1691 try_compound = TRUE; | |
1692 compflags[sp->ts_complen] = ((unsigned)flags >> 24); | |
1693 compflags[sp->ts_complen + 1] = NUL; | |
1694 } | |
1695 | |
1696 // For NOBREAK we never try splitting, it won't make any word | |
1697 // valid. | |
1698 if (slang->sl_nobreak && !slang->sl_nocompoundsugs) | |
1699 try_compound = TRUE; | |
1700 | |
1701 // If we could add a compound word, and it's also possible to | |
1702 // split at this point, do the split first and set | |
1703 // TSF_DIDSPLIT to avoid doing it again. | |
1704 else if (!fword_ends | |
1705 && try_compound | |
1706 && (sp->ts_flags & TSF_DIDSPLIT) == 0) | |
1707 { | |
1708 try_compound = FALSE; | |
1709 sp->ts_flags |= TSF_DIDSPLIT; | |
1710 --sp->ts_curi; // do the same NUL again | |
1711 compflags[sp->ts_complen] = NUL; | |
1712 } | |
1713 else | |
1714 sp->ts_flags &= ~TSF_DIDSPLIT; | |
1715 | |
1716 if (try_split || try_compound) | |
1717 { | |
1718 if (!try_compound && (!fword_ends || !goodword_ends)) | |
1719 { | |
1720 // If we're going to split need to check that the | |
1721 // words so far are valid for compounding. If there | |
1722 // is only one word it must not have the NEEDCOMPOUND | |
1723 // flag. | |
1724 if (sp->ts_complen == sp->ts_compsplit | |
1725 && (flags & WF_NEEDCOMP)) | |
1726 break; | |
1727 p = preword; | |
1728 while (*skiptowhite(p) != NUL) | |
1729 p = skipwhite(skiptowhite(p)); | |
1730 if (sp->ts_complen > sp->ts_compsplit | |
1731 && !can_compound(slang, p, | |
1732 compflags + sp->ts_compsplit)) | |
1733 break; | |
1734 | |
1735 if (slang->sl_nosplitsugs) | |
1736 newscore += SCORE_SPLIT_NO; | |
1737 else | |
1738 newscore += SCORE_SPLIT; | |
1739 | |
1740 // Give a bonus to words seen before. | |
1741 newscore = score_wordcount_adj(slang, newscore, | |
1742 preword + sp->ts_prewordlen, TRUE); | |
1743 } | |
1744 | |
1745 if (TRY_DEEPER(su, stack, depth, newscore)) | |
1746 { | |
1747 go_deeper(stack, depth, newscore); | |
1748 #ifdef DEBUG_TRIEWALK | |
1749 if (!try_compound && !fword_ends) | |
1750 sprintf(changename[depth], "%.*s-%s: split", | |
1751 sp->ts_twordlen, tword, fword + sp->ts_fidx); | |
1752 else | |
1753 sprintf(changename[depth], "%.*s-%s: compound", | |
1754 sp->ts_twordlen, tword, fword + sp->ts_fidx); | |
1755 #endif | |
1756 // Save things to be restored at STATE_SPLITUNDO. | |
1757 sp->ts_save_badflags = su->su_badflags; | |
1758 PROF_STORE(sp->ts_state) | |
1759 sp->ts_state = STATE_SPLITUNDO; | |
1760 | |
1761 ++depth; | |
1762 sp = &stack[depth]; | |
1763 | |
1764 // Append a space to preword when splitting. | |
1765 if (!try_compound && !fword_ends) | |
1766 STRCAT(preword, " "); | |
1767 sp->ts_prewordlen = (char_u)STRLEN(preword); | |
1768 sp->ts_splitoff = sp->ts_twordlen; | |
1769 sp->ts_splitfidx = sp->ts_fidx; | |
1770 | |
1771 // If the badword has a non-word character at this | |
1772 // position skip it. That means replacing the | |
1773 // non-word character with a space. Always skip a | |
1774 // character when the word ends. But only when the | |
1775 // good word can end. | |
1776 if (((!try_compound && !spell_iswordp_nmw(fword | |
1777 + sp->ts_fidx, | |
1778 curwin)) | |
1779 || fword_ends) | |
1780 && fword[sp->ts_fidx] != NUL | |
1781 && goodword_ends) | |
1782 { | |
1783 int l; | |
1784 | |
1785 l = MB_PTR2LEN(fword + sp->ts_fidx); | |
1786 if (fword_ends) | |
1787 { | |
1788 // Copy the skipped character to preword. | |
1789 mch_memmove(preword + sp->ts_prewordlen, | |
1790 fword + sp->ts_fidx, l); | |
1791 sp->ts_prewordlen += l; | |
1792 preword[sp->ts_prewordlen] = NUL; | |
1793 } | |
1794 else | |
1795 sp->ts_score -= SCORE_SPLIT - SCORE_SUBST; | |
1796 sp->ts_fidx += l; | |
1797 } | |
1798 | |
1799 // When compounding include compound flag in | |
1800 // compflags[] (already set above). When splitting we | |
1801 // may start compounding over again. | |
1802 if (try_compound) | |
1803 ++sp->ts_complen; | |
1804 else | |
1805 sp->ts_compsplit = sp->ts_complen; | |
1806 sp->ts_prefixdepth = PFD_NOPREFIX; | |
1807 | |
1808 // set su->su_badflags to the caps type at this | |
1809 // position | |
1810 if (has_mbyte) | |
1811 n = nofold_len(fword, sp->ts_fidx, su->su_badptr); | |
1812 else | |
1813 n = sp->ts_fidx; | |
1814 su->su_badflags = badword_captype(su->su_badptr + n, | |
1815 su->su_badptr + su->su_badlen); | |
1816 | |
1817 // Restart at top of the tree. | |
1818 sp->ts_arridx = 0; | |
1819 | |
1820 // If there are postponed prefixes, try these too. | |
1821 if (pbyts != NULL) | |
1822 { | |
1823 byts = pbyts; | |
1824 idxs = pidxs; | |
1825 sp->ts_prefixdepth = PFD_PREFIXTREE; | |
1826 PROF_STORE(sp->ts_state) | |
1827 sp->ts_state = STATE_NOPREFIX; | |
1828 } | |
1829 } | |
1830 } | |
1831 } | |
1832 break; | |
1833 | |
1834 case STATE_SPLITUNDO: | |
1835 // Undo the changes done for word split or compound word. | |
1836 su->su_badflags = sp->ts_save_badflags; | |
1837 | |
1838 // Continue looking for NUL bytes. | |
1839 PROF_STORE(sp->ts_state) | |
1840 sp->ts_state = STATE_START; | |
1841 | |
1842 // In case we went into the prefix tree. | |
1843 byts = fbyts; | |
1844 idxs = fidxs; | |
1845 break; | |
1846 | |
1847 case STATE_ENDNUL: | |
1848 // Past the NUL bytes in the node. | |
1849 su->su_badflags = sp->ts_save_badflags; | |
1850 if (fword[sp->ts_fidx] == NUL && sp->ts_tcharlen == 0) | |
1851 { | |
1852 // The badword ends, can't use STATE_PLAIN. | |
1853 PROF_STORE(sp->ts_state) | |
1854 sp->ts_state = STATE_DEL; | |
1855 break; | |
1856 } | |
1857 PROF_STORE(sp->ts_state) | |
1858 sp->ts_state = STATE_PLAIN; | |
1859 // FALLTHROUGH | |
1860 | |
1861 case STATE_PLAIN: | |
1862 // Go over all possible bytes at this node, add each to tword[] | |
1863 // and use child node. "ts_curi" is the index. | |
1864 arridx = sp->ts_arridx; | |
1865 if (sp->ts_curi > byts[arridx]) | |
1866 { | |
1867 // Done all bytes at this node, do next state. When still at | |
1868 // already changed bytes skip the other tricks. | |
1869 PROF_STORE(sp->ts_state) | |
1870 if (sp->ts_fidx >= sp->ts_fidxtry) | |
1871 sp->ts_state = STATE_DEL; | |
1872 else | |
1873 sp->ts_state = STATE_FINAL; | |
1874 } | |
1875 else | |
1876 { | |
1877 arridx += sp->ts_curi++; | |
1878 c = byts[arridx]; | |
1879 | |
1880 // Normal byte, go one level deeper. If it's not equal to the | |
1881 // byte in the bad word adjust the score. But don't even try | |
1882 // when the byte was already changed. And don't try when we | |
1883 // just deleted this byte, accepting it is always cheaper than | |
1884 // delete + substitute. | |
1885 if (c == fword[sp->ts_fidx] | |
1886 || (sp->ts_tcharlen > 0 && sp->ts_isdiff != DIFF_NONE)) | |
1887 newscore = 0; | |
1888 else | |
1889 newscore = SCORE_SUBST; | |
1890 if ((newscore == 0 | |
1891 || (sp->ts_fidx >= sp->ts_fidxtry | |
1892 && ((sp->ts_flags & TSF_DIDDEL) == 0 | |
1893 || c != fword[sp->ts_delidx]))) | |
1894 && TRY_DEEPER(su, stack, depth, newscore)) | |
1895 { | |
1896 go_deeper(stack, depth, newscore); | |
1897 #ifdef DEBUG_TRIEWALK | |
1898 if (newscore > 0) | |
1899 sprintf(changename[depth], "%.*s-%s: subst %c to %c", | |
1900 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
1901 fword[sp->ts_fidx], c); | |
1902 else | |
1903 sprintf(changename[depth], "%.*s-%s: accept %c", | |
1904 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
1905 fword[sp->ts_fidx]); | |
1906 #endif | |
1907 ++depth; | |
1908 sp = &stack[depth]; | |
1909 ++sp->ts_fidx; | |
1910 tword[sp->ts_twordlen++] = c; | |
1911 sp->ts_arridx = idxs[arridx]; | |
1912 if (newscore == SCORE_SUBST) | |
1913 sp->ts_isdiff = DIFF_YES; | |
1914 if (has_mbyte) | |
1915 { | |
1916 // Multi-byte characters are a bit complicated to | |
1917 // handle: They differ when any of the bytes differ | |
1918 // and then their length may also differ. | |
1919 if (sp->ts_tcharlen == 0) | |
1920 { | |
1921 // First byte. | |
1922 sp->ts_tcharidx = 0; | |
1923 sp->ts_tcharlen = MB_BYTE2LEN(c); | |
1924 sp->ts_fcharstart = sp->ts_fidx - 1; | |
1925 sp->ts_isdiff = (newscore != 0) | |
1926 ? DIFF_YES : DIFF_NONE; | |
1927 } | |
1928 else if (sp->ts_isdiff == DIFF_INSERT) | |
1929 // When inserting trail bytes don't advance in the | |
1930 // bad word. | |
1931 --sp->ts_fidx; | |
1932 if (++sp->ts_tcharidx == sp->ts_tcharlen) | |
1933 { | |
1934 // Last byte of character. | |
1935 if (sp->ts_isdiff == DIFF_YES) | |
1936 { | |
1937 // Correct ts_fidx for the byte length of the | |
1938 // character (we didn't check that before). | |
1939 sp->ts_fidx = sp->ts_fcharstart | |
1940 + MB_PTR2LEN( | |
1941 fword + sp->ts_fcharstart); | |
1942 // For changing a composing character adjust | |
1943 // the score from SCORE_SUBST to | |
1944 // SCORE_SUBCOMP. | |
1945 if (enc_utf8 | |
1946 && utf_iscomposing( | |
1947 utf_ptr2char(tword | |
1948 + sp->ts_twordlen | |
1949 - sp->ts_tcharlen)) | |
1950 && utf_iscomposing( | |
1951 utf_ptr2char(fword | |
1952 + sp->ts_fcharstart))) | |
1953 sp->ts_score -= | |
1954 SCORE_SUBST - SCORE_SUBCOMP; | |
1955 | |
1956 // For a similar character adjust score from | |
1957 // SCORE_SUBST to SCORE_SIMILAR. | |
1958 else if (!soundfold | |
1959 && slang->sl_has_map | |
1960 && similar_chars(slang, | |
1961 mb_ptr2char(tword | |
1962 + sp->ts_twordlen | |
1963 - sp->ts_tcharlen), | |
1964 mb_ptr2char(fword | |
1965 + sp->ts_fcharstart))) | |
1966 sp->ts_score -= | |
1967 SCORE_SUBST - SCORE_SIMILAR; | |
1968 } | |
1969 else if (sp->ts_isdiff == DIFF_INSERT | |
1970 && sp->ts_twordlen > sp->ts_tcharlen) | |
1971 { | |
1972 p = tword + sp->ts_twordlen - sp->ts_tcharlen; | |
1973 c = mb_ptr2char(p); | |
1974 if (enc_utf8 && utf_iscomposing(c)) | |
1975 { | |
1976 // Inserting a composing char doesn't | |
1977 // count that much. | |
1978 sp->ts_score -= SCORE_INS - SCORE_INSCOMP; | |
1979 } | |
1980 else | |
1981 { | |
1982 // If the previous character was the same, | |
1983 // thus doubling a character, give a bonus | |
1984 // to the score. Also for the soundfold | |
1985 // tree (might seem illogical but does | |
1986 // give better scores). | |
1987 MB_PTR_BACK(tword, p); | |
1988 if (c == mb_ptr2char(p)) | |
1989 sp->ts_score -= SCORE_INS | |
1990 - SCORE_INSDUP; | |
1991 } | |
1992 } | |
1993 | |
1994 // Starting a new char, reset the length. | |
1995 sp->ts_tcharlen = 0; | |
1996 } | |
1997 } | |
1998 else | |
1999 { | |
2000 // If we found a similar char adjust the score. | |
2001 // We do this after calling go_deeper() because | |
2002 // it's slow. | |
2003 if (newscore != 0 | |
2004 && !soundfold | |
2005 && slang->sl_has_map | |
2006 && similar_chars(slang, | |
2007 c, fword[sp->ts_fidx - 1])) | |
2008 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR; | |
2009 } | |
2010 } | |
2011 } | |
2012 break; | |
2013 | |
2014 case STATE_DEL: | |
2015 // When past the first byte of a multi-byte char don't try | |
2016 // delete/insert/swap a character. | |
2017 if (has_mbyte && sp->ts_tcharlen > 0) | |
2018 { | |
2019 PROF_STORE(sp->ts_state) | |
2020 sp->ts_state = STATE_FINAL; | |
2021 break; | |
2022 } | |
2023 // Try skipping one character in the bad word (delete it). | |
2024 PROF_STORE(sp->ts_state) | |
2025 sp->ts_state = STATE_INS_PREP; | |
2026 sp->ts_curi = 1; | |
2027 if (soundfold && sp->ts_fidx == 0 && fword[sp->ts_fidx] == '*') | |
2028 // Deleting a vowel at the start of a word counts less, see | |
2029 // soundalike_score(). | |
2030 newscore = 2 * SCORE_DEL / 3; | |
2031 else | |
2032 newscore = SCORE_DEL; | |
2033 if (fword[sp->ts_fidx] != NUL | |
2034 && TRY_DEEPER(su, stack, depth, newscore)) | |
2035 { | |
2036 go_deeper(stack, depth, newscore); | |
2037 #ifdef DEBUG_TRIEWALK | |
2038 sprintf(changename[depth], "%.*s-%s: delete %c", | |
2039 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
2040 fword[sp->ts_fidx]); | |
2041 #endif | |
2042 ++depth; | |
2043 | |
2044 // Remember what character we deleted, so that we can avoid | |
2045 // inserting it again. | |
2046 stack[depth].ts_flags |= TSF_DIDDEL; | |
2047 stack[depth].ts_delidx = sp->ts_fidx; | |
2048 | |
2049 // Advance over the character in fword[]. Give a bonus to the | |
2050 // score if the same character is following "nn" -> "n". It's | |
2051 // a bit illogical for soundfold tree but it does give better | |
2052 // results. | |
2053 if (has_mbyte) | |
2054 { | |
2055 c = mb_ptr2char(fword + sp->ts_fidx); | |
2056 stack[depth].ts_fidx += MB_PTR2LEN(fword + sp->ts_fidx); | |
2057 if (enc_utf8 && utf_iscomposing(c)) | |
2058 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP; | |
2059 else if (c == mb_ptr2char(fword + stack[depth].ts_fidx)) | |
2060 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; | |
2061 } | |
2062 else | |
2063 { | |
2064 ++stack[depth].ts_fidx; | |
2065 if (fword[sp->ts_fidx] == fword[sp->ts_fidx + 1]) | |
2066 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; | |
2067 } | |
2068 break; | |
2069 } | |
2070 // FALLTHROUGH | |
2071 | |
2072 case STATE_INS_PREP: | |
2073 if (sp->ts_flags & TSF_DIDDEL) | |
2074 { | |
2075 // If we just deleted a byte then inserting won't make sense, | |
2076 // a substitute is always cheaper. | |
2077 PROF_STORE(sp->ts_state) | |
2078 sp->ts_state = STATE_SWAP; | |
2079 break; | |
2080 } | |
2081 | |
2082 // skip over NUL bytes | |
2083 n = sp->ts_arridx; | |
2084 for (;;) | |
2085 { | |
2086 if (sp->ts_curi > byts[n]) | |
2087 { | |
2088 // Only NUL bytes at this node, go to next state. | |
2089 PROF_STORE(sp->ts_state) | |
2090 sp->ts_state = STATE_SWAP; | |
2091 break; | |
2092 } | |
2093 if (byts[n + sp->ts_curi] != NUL) | |
2094 { | |
2095 // Found a byte to insert. | |
2096 PROF_STORE(sp->ts_state) | |
2097 sp->ts_state = STATE_INS; | |
2098 break; | |
2099 } | |
2100 ++sp->ts_curi; | |
2101 } | |
2102 break; | |
2103 | |
2104 // FALLTHROUGH | |
2105 | |
2106 case STATE_INS: | |
2107 // Insert one byte. Repeat this for each possible byte at this | |
2108 // node. | |
2109 n = sp->ts_arridx; | |
2110 if (sp->ts_curi > byts[n]) | |
2111 { | |
2112 // Done all bytes at this node, go to next state. | |
2113 PROF_STORE(sp->ts_state) | |
2114 sp->ts_state = STATE_SWAP; | |
2115 break; | |
2116 } | |
2117 | |
2118 // Do one more byte at this node, but: | |
2119 // - Skip NUL bytes. | |
2120 // - Skip the byte if it's equal to the byte in the word, | |
2121 // accepting that byte is always better. | |
2122 n += sp->ts_curi++; | |
2123 c = byts[n]; | |
2124 if (soundfold && sp->ts_twordlen == 0 && c == '*') | |
2125 // Inserting a vowel at the start of a word counts less, | |
2126 // see soundalike_score(). | |
2127 newscore = 2 * SCORE_INS / 3; | |
2128 else | |
2129 newscore = SCORE_INS; | |
2130 if (c != fword[sp->ts_fidx] | |
2131 && TRY_DEEPER(su, stack, depth, newscore)) | |
2132 { | |
2133 go_deeper(stack, depth, newscore); | |
2134 #ifdef DEBUG_TRIEWALK | |
2135 sprintf(changename[depth], "%.*s-%s: insert %c", | |
2136 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
2137 c); | |
2138 #endif | |
2139 ++depth; | |
2140 sp = &stack[depth]; | |
2141 tword[sp->ts_twordlen++] = c; | |
2142 sp->ts_arridx = idxs[n]; | |
2143 if (has_mbyte) | |
2144 { | |
2145 fl = MB_BYTE2LEN(c); | |
2146 if (fl > 1) | |
2147 { | |
2148 // There are following bytes for the same character. | |
2149 // We must find all bytes before trying | |
2150 // delete/insert/swap/etc. | |
2151 sp->ts_tcharlen = fl; | |
2152 sp->ts_tcharidx = 1; | |
2153 sp->ts_isdiff = DIFF_INSERT; | |
2154 } | |
2155 } | |
2156 else | |
2157 fl = 1; | |
2158 if (fl == 1) | |
2159 { | |
2160 // If the previous character was the same, thus doubling a | |
2161 // character, give a bonus to the score. Also for | |
2162 // soundfold words (illogical but does give a better | |
2163 // score). | |
2164 if (sp->ts_twordlen >= 2 | |
2165 && tword[sp->ts_twordlen - 2] == c) | |
2166 sp->ts_score -= SCORE_INS - SCORE_INSDUP; | |
2167 } | |
2168 } | |
2169 break; | |
2170 | |
2171 case STATE_SWAP: | |
2172 // Swap two bytes in the bad word: "12" -> "21". | |
2173 // We change "fword" here, it's changed back afterwards at | |
2174 // STATE_UNSWAP. | |
2175 p = fword + sp->ts_fidx; | |
2176 c = *p; | |
2177 if (c == NUL) | |
2178 { | |
2179 // End of word, can't swap or replace. | |
2180 PROF_STORE(sp->ts_state) | |
2181 sp->ts_state = STATE_FINAL; | |
2182 break; | |
2183 } | |
2184 | |
2185 // Don't swap if the first character is not a word character. | |
2186 // SWAP3 etc. also don't make sense then. | |
2187 if (!soundfold && !spell_iswordp(p, curwin)) | |
2188 { | |
2189 PROF_STORE(sp->ts_state) | |
2190 sp->ts_state = STATE_REP_INI; | |
2191 break; | |
2192 } | |
2193 | |
2194 if (has_mbyte) | |
2195 { | |
2196 n = MB_CPTR2LEN(p); | |
2197 c = mb_ptr2char(p); | |
2198 if (p[n] == NUL) | |
2199 c2 = NUL; | |
2200 else if (!soundfold && !spell_iswordp(p + n, curwin)) | |
2201 c2 = c; // don't swap non-word char | |
2202 else | |
2203 c2 = mb_ptr2char(p + n); | |
2204 } | |
2205 else | |
2206 { | |
2207 if (p[1] == NUL) | |
2208 c2 = NUL; | |
2209 else if (!soundfold && !spell_iswordp(p + 1, curwin)) | |
2210 c2 = c; // don't swap non-word char | |
2211 else | |
2212 c2 = p[1]; | |
2213 } | |
2214 | |
2215 // When the second character is NUL we can't swap. | |
2216 if (c2 == NUL) | |
2217 { | |
2218 PROF_STORE(sp->ts_state) | |
2219 sp->ts_state = STATE_REP_INI; | |
2220 break; | |
2221 } | |
2222 | |
2223 // When characters are identical, swap won't do anything. | |
2224 // Also get here if the second char is not a word character. | |
2225 if (c == c2) | |
2226 { | |
2227 PROF_STORE(sp->ts_state) | |
2228 sp->ts_state = STATE_SWAP3; | |
2229 break; | |
2230 } | |
2231 if (c2 != NUL && TRY_DEEPER(su, stack, depth, SCORE_SWAP)) | |
2232 { | |
2233 go_deeper(stack, depth, SCORE_SWAP); | |
2234 #ifdef DEBUG_TRIEWALK | |
2235 sprintf(changename[depth], "%.*s-%s: swap %c and %c", | |
2236 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
2237 c, c2); | |
2238 #endif | |
2239 PROF_STORE(sp->ts_state) | |
2240 sp->ts_state = STATE_UNSWAP; | |
2241 ++depth; | |
2242 if (has_mbyte) | |
2243 { | |
2244 fl = mb_char2len(c2); | |
2245 mch_memmove(p, p + n, fl); | |
2246 mb_char2bytes(c, p + fl); | |
2247 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; | |
2248 } | |
2249 else | |
2250 { | |
2251 p[0] = c2; | |
2252 p[1] = c; | |
2253 stack[depth].ts_fidxtry = sp->ts_fidx + 2; | |
2254 } | |
2255 } | |
2256 else | |
2257 { | |
2258 // If this swap doesn't work then SWAP3 won't either. | |
2259 PROF_STORE(sp->ts_state) | |
2260 sp->ts_state = STATE_REP_INI; | |
2261 } | |
2262 break; | |
2263 | |
2264 case STATE_UNSWAP: | |
2265 // Undo the STATE_SWAP swap: "21" -> "12". | |
2266 p = fword + sp->ts_fidx; | |
2267 if (has_mbyte) | |
2268 { | |
2269 n = MB_PTR2LEN(p); | |
2270 c = mb_ptr2char(p + n); | |
2271 mch_memmove(p + MB_PTR2LEN(p + n), p, n); | |
2272 mb_char2bytes(c, p); | |
2273 } | |
2274 else | |
2275 { | |
2276 c = *p; | |
2277 *p = p[1]; | |
2278 p[1] = c; | |
2279 } | |
2280 // FALLTHROUGH | |
2281 | |
2282 case STATE_SWAP3: | |
2283 // Swap two bytes, skipping one: "123" -> "321". We change | |
2284 // "fword" here, it's changed back afterwards at STATE_UNSWAP3. | |
2285 p = fword + sp->ts_fidx; | |
2286 if (has_mbyte) | |
2287 { | |
2288 n = MB_CPTR2LEN(p); | |
2289 c = mb_ptr2char(p); | |
2290 fl = MB_CPTR2LEN(p + n); | |
2291 c2 = mb_ptr2char(p + n); | |
2292 if (!soundfold && !spell_iswordp(p + n + fl, curwin)) | |
2293 c3 = c; // don't swap non-word char | |
2294 else | |
2295 c3 = mb_ptr2char(p + n + fl); | |
2296 } | |
2297 else | |
2298 { | |
2299 c = *p; | |
2300 c2 = p[1]; | |
2301 if (!soundfold && !spell_iswordp(p + 2, curwin)) | |
2302 c3 = c; // don't swap non-word char | |
2303 else | |
2304 c3 = p[2]; | |
2305 } | |
2306 | |
2307 // When characters are identical: "121" then SWAP3 result is | |
2308 // identical, ROT3L result is same as SWAP: "211", ROT3L result is | |
2309 // same as SWAP on next char: "112". Thus skip all swapping. | |
2310 // Also skip when c3 is NUL. | |
2311 // Also get here when the third character is not a word character. | |
2312 // Second character may any char: "a.b" -> "b.a" | |
2313 if (c == c3 || c3 == NUL) | |
2314 { | |
2315 PROF_STORE(sp->ts_state) | |
2316 sp->ts_state = STATE_REP_INI; | |
2317 break; | |
2318 } | |
2319 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) | |
2320 { | |
2321 go_deeper(stack, depth, SCORE_SWAP3); | |
2322 #ifdef DEBUG_TRIEWALK | |
2323 sprintf(changename[depth], "%.*s-%s: swap3 %c and %c", | |
2324 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
2325 c, c3); | |
2326 #endif | |
2327 PROF_STORE(sp->ts_state) | |
2328 sp->ts_state = STATE_UNSWAP3; | |
2329 ++depth; | |
2330 if (has_mbyte) | |
2331 { | |
2332 tl = mb_char2len(c3); | |
2333 mch_memmove(p, p + n + fl, tl); | |
2334 mb_char2bytes(c2, p + tl); | |
2335 mb_char2bytes(c, p + fl + tl); | |
2336 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl; | |
2337 } | |
2338 else | |
2339 { | |
2340 p[0] = p[2]; | |
2341 p[2] = c; | |
2342 stack[depth].ts_fidxtry = sp->ts_fidx + 3; | |
2343 } | |
2344 } | |
2345 else | |
2346 { | |
2347 PROF_STORE(sp->ts_state) | |
2348 sp->ts_state = STATE_REP_INI; | |
2349 } | |
2350 break; | |
2351 | |
2352 case STATE_UNSWAP3: | |
2353 // Undo STATE_SWAP3: "321" -> "123" | |
2354 p = fword + sp->ts_fidx; | |
2355 if (has_mbyte) | |
2356 { | |
2357 n = MB_PTR2LEN(p); | |
2358 c2 = mb_ptr2char(p + n); | |
2359 fl = MB_PTR2LEN(p + n); | |
2360 c = mb_ptr2char(p + n + fl); | |
2361 tl = MB_PTR2LEN(p + n + fl); | |
2362 mch_memmove(p + fl + tl, p, n); | |
2363 mb_char2bytes(c, p); | |
2364 mb_char2bytes(c2, p + tl); | |
2365 p = p + tl; | |
2366 } | |
2367 else | |
2368 { | |
2369 c = *p; | |
2370 *p = p[2]; | |
2371 p[2] = c; | |
2372 ++p; | |
2373 } | |
2374 | |
2375 if (!soundfold && !spell_iswordp(p, curwin)) | |
2376 { | |
2377 // Middle char is not a word char, skip the rotate. First and | |
2378 // third char were already checked at swap and swap3. | |
2379 PROF_STORE(sp->ts_state) | |
2380 sp->ts_state = STATE_REP_INI; | |
2381 break; | |
2382 } | |
2383 | |
2384 // Rotate three characters left: "123" -> "231". We change | |
2385 // "fword" here, it's changed back afterwards at STATE_UNROT3L. | |
2386 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) | |
2387 { | |
2388 go_deeper(stack, depth, SCORE_SWAP3); | |
2389 #ifdef DEBUG_TRIEWALK | |
2390 p = fword + sp->ts_fidx; | |
2391 sprintf(changename[depth], "%.*s-%s: rotate left %c%c%c", | |
2392 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
2393 p[0], p[1], p[2]); | |
2394 #endif | |
2395 PROF_STORE(sp->ts_state) | |
2396 sp->ts_state = STATE_UNROT3L; | |
2397 ++depth; | |
2398 p = fword + sp->ts_fidx; | |
2399 if (has_mbyte) | |
2400 { | |
2401 n = MB_CPTR2LEN(p); | |
2402 c = mb_ptr2char(p); | |
2403 fl = MB_CPTR2LEN(p + n); | |
2404 fl += MB_CPTR2LEN(p + n + fl); | |
2405 mch_memmove(p, p + n, fl); | |
2406 mb_char2bytes(c, p + fl); | |
2407 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; | |
2408 } | |
2409 else | |
2410 { | |
2411 c = *p; | |
2412 *p = p[1]; | |
2413 p[1] = p[2]; | |
2414 p[2] = c; | |
2415 stack[depth].ts_fidxtry = sp->ts_fidx + 3; | |
2416 } | |
2417 } | |
2418 else | |
2419 { | |
2420 PROF_STORE(sp->ts_state) | |
2421 sp->ts_state = STATE_REP_INI; | |
2422 } | |
2423 break; | |
2424 | |
2425 case STATE_UNROT3L: | |
2426 // Undo ROT3L: "231" -> "123" | |
2427 p = fword + sp->ts_fidx; | |
2428 if (has_mbyte) | |
2429 { | |
2430 n = MB_PTR2LEN(p); | |
2431 n += MB_PTR2LEN(p + n); | |
2432 c = mb_ptr2char(p + n); | |
2433 tl = MB_PTR2LEN(p + n); | |
2434 mch_memmove(p + tl, p, n); | |
2435 mb_char2bytes(c, p); | |
2436 } | |
2437 else | |
2438 { | |
2439 c = p[2]; | |
2440 p[2] = p[1]; | |
2441 p[1] = *p; | |
2442 *p = c; | |
2443 } | |
2444 | |
2445 // Rotate three bytes right: "123" -> "312". We change "fword" | |
2446 // here, it's changed back afterwards at STATE_UNROT3R. | |
2447 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) | |
2448 { | |
2449 go_deeper(stack, depth, SCORE_SWAP3); | |
2450 #ifdef DEBUG_TRIEWALK | |
2451 p = fword + sp->ts_fidx; | |
2452 sprintf(changename[depth], "%.*s-%s: rotate right %c%c%c", | |
2453 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
2454 p[0], p[1], p[2]); | |
2455 #endif | |
2456 PROF_STORE(sp->ts_state) | |
2457 sp->ts_state = STATE_UNROT3R; | |
2458 ++depth; | |
2459 p = fword + sp->ts_fidx; | |
2460 if (has_mbyte) | |
2461 { | |
2462 n = MB_CPTR2LEN(p); | |
2463 n += MB_CPTR2LEN(p + n); | |
2464 c = mb_ptr2char(p + n); | |
2465 tl = MB_CPTR2LEN(p + n); | |
2466 mch_memmove(p + tl, p, n); | |
2467 mb_char2bytes(c, p); | |
2468 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl; | |
2469 } | |
2470 else | |
2471 { | |
2472 c = p[2]; | |
2473 p[2] = p[1]; | |
2474 p[1] = *p; | |
2475 *p = c; | |
2476 stack[depth].ts_fidxtry = sp->ts_fidx + 3; | |
2477 } | |
2478 } | |
2479 else | |
2480 { | |
2481 PROF_STORE(sp->ts_state) | |
2482 sp->ts_state = STATE_REP_INI; | |
2483 } | |
2484 break; | |
2485 | |
2486 case STATE_UNROT3R: | |
2487 // Undo ROT3R: "312" -> "123" | |
2488 p = fword + sp->ts_fidx; | |
2489 if (has_mbyte) | |
2490 { | |
2491 c = mb_ptr2char(p); | |
2492 tl = MB_PTR2LEN(p); | |
2493 n = MB_PTR2LEN(p + tl); | |
2494 n += MB_PTR2LEN(p + tl + n); | |
2495 mch_memmove(p, p + tl, n); | |
2496 mb_char2bytes(c, p + n); | |
2497 } | |
2498 else | |
2499 { | |
2500 c = *p; | |
2501 *p = p[1]; | |
2502 p[1] = p[2]; | |
2503 p[2] = c; | |
2504 } | |
2505 // FALLTHROUGH | |
2506 | |
2507 case STATE_REP_INI: | |
2508 // Check if matching with REP items from the .aff file would work. | |
2509 // Quickly skip if: | |
2510 // - there are no REP items and we are not in the soundfold trie | |
2511 // - the score is going to be too high anyway | |
2512 // - already applied a REP item or swapped here | |
2513 if ((lp->lp_replang == NULL && !soundfold) | |
2514 || sp->ts_score + SCORE_REP >= su->su_maxscore | |
2515 || sp->ts_fidx < sp->ts_fidxtry) | |
2516 { | |
2517 PROF_STORE(sp->ts_state) | |
2518 sp->ts_state = STATE_FINAL; | |
2519 break; | |
2520 } | |
2521 | |
2522 // Use the first byte to quickly find the first entry that may | |
2523 // match. If the index is -1 there is none. | |
2524 if (soundfold) | |
2525 sp->ts_curi = slang->sl_repsal_first[fword[sp->ts_fidx]]; | |
2526 else | |
2527 sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]]; | |
2528 | |
2529 if (sp->ts_curi < 0) | |
2530 { | |
2531 PROF_STORE(sp->ts_state) | |
2532 sp->ts_state = STATE_FINAL; | |
2533 break; | |
2534 } | |
2535 | |
2536 PROF_STORE(sp->ts_state) | |
2537 sp->ts_state = STATE_REP; | |
2538 // FALLTHROUGH | |
2539 | |
2540 case STATE_REP: | |
2541 // Try matching with REP items from the .aff file. For each match | |
2542 // replace the characters and check if the resulting word is | |
2543 // valid. | |
2544 p = fword + sp->ts_fidx; | |
2545 | |
2546 if (soundfold) | |
2547 gap = &slang->sl_repsal; | |
2548 else | |
2549 gap = &lp->lp_replang->sl_rep; | |
2550 while (sp->ts_curi < gap->ga_len) | |
2551 { | |
2552 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++; | |
2553 if (*ftp->ft_from != *p) | |
2554 { | |
2555 // past possible matching entries | |
2556 sp->ts_curi = gap->ga_len; | |
2557 break; | |
2558 } | |
2559 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0 | |
2560 && TRY_DEEPER(su, stack, depth, SCORE_REP)) | |
2561 { | |
2562 go_deeper(stack, depth, SCORE_REP); | |
2563 #ifdef DEBUG_TRIEWALK | |
2564 sprintf(changename[depth], "%.*s-%s: replace %s with %s", | |
2565 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
2566 ftp->ft_from, ftp->ft_to); | |
2567 #endif | |
2568 // Need to undo this afterwards. | |
2569 PROF_STORE(sp->ts_state) | |
2570 sp->ts_state = STATE_REP_UNDO; | |
2571 | |
2572 // Change the "from" to the "to" string. | |
2573 ++depth; | |
2574 fl = (int)STRLEN(ftp->ft_from); | |
2575 tl = (int)STRLEN(ftp->ft_to); | |
2576 if (fl != tl) | |
2577 { | |
2578 STRMOVE(p + tl, p + fl); | |
2579 repextra += tl - fl; | |
2580 } | |
2581 mch_memmove(p, ftp->ft_to, tl); | |
2582 stack[depth].ts_fidxtry = sp->ts_fidx + tl; | |
2583 stack[depth].ts_tcharlen = 0; | |
2584 break; | |
2585 } | |
2586 } | |
2587 | |
2588 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP) | |
2589 { | |
2590 // No (more) matches. | |
2591 PROF_STORE(sp->ts_state) | |
2592 sp->ts_state = STATE_FINAL; | |
2593 } | |
2594 | |
2595 break; | |
2596 | |
2597 case STATE_REP_UNDO: | |
2598 // Undo a REP replacement and continue with the next one. | |
2599 if (soundfold) | |
2600 gap = &slang->sl_repsal; | |
2601 else | |
2602 gap = &lp->lp_replang->sl_rep; | |
2603 ftp = (fromto_T *)gap->ga_data + sp->ts_curi - 1; | |
2604 fl = (int)STRLEN(ftp->ft_from); | |
2605 tl = (int)STRLEN(ftp->ft_to); | |
2606 p = fword + sp->ts_fidx; | |
2607 if (fl != tl) | |
2608 { | |
2609 STRMOVE(p + fl, p + tl); | |
2610 repextra -= tl - fl; | |
2611 } | |
2612 mch_memmove(p, ftp->ft_from, fl); | |
2613 PROF_STORE(sp->ts_state) | |
2614 sp->ts_state = STATE_REP; | |
2615 break; | |
2616 | |
2617 default: | |
2618 // Did all possible states at this level, go up one level. | |
2619 --depth; | |
2620 | |
2621 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE) | |
2622 { | |
2623 // Continue in or go back to the prefix tree. | |
2624 byts = pbyts; | |
2625 idxs = pidxs; | |
2626 } | |
2627 | |
2628 // Don't check for CTRL-C too often, it takes time. | |
2629 if (--breakcheckcount == 0) | |
2630 { | |
2631 ui_breakcheck(); | |
2632 breakcheckcount = 1000; | |
2633 } | |
2634 } | |
2635 } | |
2636 } | |
2637 | |
2638 | |
2639 /* | |
2640 * Go one level deeper in the tree. | |
2641 */ | |
2642 static void | |
2643 go_deeper(trystate_T *stack, int depth, int score_add) | |
2644 { | |
2645 stack[depth + 1] = stack[depth]; | |
2646 stack[depth + 1].ts_state = STATE_START; | |
2647 stack[depth + 1].ts_score = stack[depth].ts_score + score_add; | |
2648 stack[depth + 1].ts_curi = 1; // start just after length byte | |
2649 stack[depth + 1].ts_flags = 0; | |
2650 } | |
2651 | |
2652 /* | |
2653 * "fword" is a good word with case folded. Find the matching keep-case | |
2654 * words and put it in "kword". | |
2655 * Theoretically there could be several keep-case words that result in the | |
2656 * same case-folded word, but we only find one... | |
2657 */ | |
2658 static void | |
2659 find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword) | |
2660 { | |
2661 char_u uword[MAXWLEN]; // "fword" in upper-case | |
2662 int depth; | |
2663 idx_T tryidx; | |
2664 | |
2665 // The following arrays are used at each depth in the tree. | |
2666 idx_T arridx[MAXWLEN]; | |
2667 int round[MAXWLEN]; | |
2668 int fwordidx[MAXWLEN]; | |
2669 int uwordidx[MAXWLEN]; | |
2670 int kwordlen[MAXWLEN]; | |
2671 | |
2672 int flen, ulen; | |
2673 int l; | |
2674 int len; | |
2675 int c; | |
2676 idx_T lo, hi, m; | |
2677 char_u *p; | |
2678 char_u *byts = slang->sl_kbyts; // array with bytes of the words | |
2679 idx_T *idxs = slang->sl_kidxs; // array with indexes | |
2680 | |
2681 if (byts == NULL) | |
2682 { | |
2683 // array is empty: "cannot happen" | |
2684 *kword = NUL; | |
2685 return; | |
2686 } | |
2687 | |
2688 // Make an all-cap version of "fword". | |
2689 allcap_copy(fword, uword); | |
2690 | |
2691 // Each character needs to be tried both case-folded and upper-case. | |
2692 // All this gets very complicated if we keep in mind that changing case | |
2693 // may change the byte length of a multi-byte character... | |
2694 depth = 0; | |
2695 arridx[0] = 0; | |
2696 round[0] = 0; | |
2697 fwordidx[0] = 0; | |
2698 uwordidx[0] = 0; | |
2699 kwordlen[0] = 0; | |
2700 while (depth >= 0) | |
2701 { | |
2702 if (fword[fwordidx[depth]] == NUL) | |
2703 { | |
2704 // We are at the end of "fword". If the tree allows a word to end | |
2705 // here we have found a match. | |
2706 if (byts[arridx[depth] + 1] == 0) | |
2707 { | |
2708 kword[kwordlen[depth]] = NUL; | |
2709 return; | |
2710 } | |
2711 | |
2712 // kword is getting too long, continue one level up | |
2713 --depth; | |
2714 } | |
2715 else if (++round[depth] > 2) | |
2716 { | |
2717 // tried both fold-case and upper-case character, continue one | |
2718 // level up | |
2719 --depth; | |
2720 } | |
2721 else | |
2722 { | |
2723 // round[depth] == 1: Try using the folded-case character. | |
2724 // round[depth] == 2: Try using the upper-case character. | |
2725 if (has_mbyte) | |
2726 { | |
2727 flen = MB_CPTR2LEN(fword + fwordidx[depth]); | |
2728 ulen = MB_CPTR2LEN(uword + uwordidx[depth]); | |
2729 } | |
2730 else | |
2731 ulen = flen = 1; | |
2732 if (round[depth] == 1) | |
2733 { | |
2734 p = fword + fwordidx[depth]; | |
2735 l = flen; | |
2736 } | |
2737 else | |
2738 { | |
2739 p = uword + uwordidx[depth]; | |
2740 l = ulen; | |
2741 } | |
2742 | |
2743 for (tryidx = arridx[depth]; l > 0; --l) | |
2744 { | |
2745 // Perform a binary search in the list of accepted bytes. | |
2746 len = byts[tryidx++]; | |
2747 c = *p++; | |
2748 lo = tryidx; | |
2749 hi = tryidx + len - 1; | |
2750 while (lo < hi) | |
2751 { | |
2752 m = (lo + hi) / 2; | |
2753 if (byts[m] > c) | |
2754 hi = m - 1; | |
2755 else if (byts[m] < c) | |
2756 lo = m + 1; | |
2757 else | |
2758 { | |
2759 lo = hi = m; | |
2760 break; | |
2761 } | |
2762 } | |
2763 | |
2764 // Stop if there is no matching byte. | |
2765 if (hi < lo || byts[lo] != c) | |
2766 break; | |
2767 | |
2768 // Continue at the child (if there is one). | |
2769 tryidx = idxs[lo]; | |
2770 } | |
2771 | |
2772 if (l == 0) | |
2773 { | |
2774 // Found the matching char. Copy it to "kword" and go a | |
2775 // level deeper. | |
2776 if (round[depth] == 1) | |
2777 { | |
2778 STRNCPY(kword + kwordlen[depth], fword + fwordidx[depth], | |
2779 flen); | |
2780 kwordlen[depth + 1] = kwordlen[depth] + flen; | |
2781 } | |
2782 else | |
2783 { | |
2784 STRNCPY(kword + kwordlen[depth], uword + uwordidx[depth], | |
2785 ulen); | |
2786 kwordlen[depth + 1] = kwordlen[depth] + ulen; | |
2787 } | |
2788 fwordidx[depth + 1] = fwordidx[depth] + flen; | |
2789 uwordidx[depth + 1] = uwordidx[depth] + ulen; | |
2790 | |
2791 ++depth; | |
2792 arridx[depth] = tryidx; | |
2793 round[depth] = 0; | |
2794 } | |
2795 } | |
2796 } | |
2797 | |
2798 // Didn't find it: "cannot happen". | |
2799 *kword = NUL; | |
2800 } | |
2801 | |
2802 /* | |
2803 * Compute the sound-a-like score for suggestions in su->su_ga and add them to | |
2804 * su->su_sga. | |
2805 */ | |
2806 static void | |
2807 score_comp_sal(suginfo_T *su) | |
2808 { | |
2809 langp_T *lp; | |
2810 char_u badsound[MAXWLEN]; | |
2811 int i; | |
2812 suggest_T *stp; | |
2813 suggest_T *sstp; | |
2814 int score; | |
2815 int lpi; | |
2816 | |
2817 if (ga_grow(&su->su_sga, su->su_ga.ga_len) == FAIL) | |
2818 return; | |
2819 | |
2820 // Use the sound-folding of the first language that supports it. | |
2821 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) | |
2822 { | |
2823 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); | |
2824 if (lp->lp_slang->sl_sal.ga_len > 0) | |
2825 { | |
2826 // soundfold the bad word | |
2827 spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, badsound); | |
2828 | |
2829 for (i = 0; i < su->su_ga.ga_len; ++i) | |
2830 { | |
2831 stp = &SUG(su->su_ga, i); | |
2832 | |
2833 // Case-fold the suggested word, sound-fold it and compute the | |
2834 // sound-a-like score. | |
2835 score = stp_sal_score(stp, su, lp->lp_slang, badsound); | |
2836 if (score < SCORE_MAXMAX) | |
2837 { | |
2838 // Add the suggestion. | |
2839 sstp = &SUG(su->su_sga, su->su_sga.ga_len); | |
2840 sstp->st_word = vim_strsave(stp->st_word); | |
2841 if (sstp->st_word != NULL) | |
2842 { | |
2843 sstp->st_wordlen = stp->st_wordlen; | |
2844 sstp->st_score = score; | |
2845 sstp->st_altscore = 0; | |
2846 sstp->st_orglen = stp->st_orglen; | |
2847 ++su->su_sga.ga_len; | |
2848 } | |
2849 } | |
2850 } | |
2851 break; | |
2852 } | |
2853 } | |
2854 } | |
2855 | |
2856 /* | |
2857 * Combine the list of suggestions in su->su_ga and su->su_sga. | |
2858 * They are entwined. | |
2859 */ | |
2860 static void | |
2861 score_combine(suginfo_T *su) | |
2862 { | |
2863 int i; | |
2864 int j; | |
2865 garray_T ga; | |
2866 garray_T *gap; | |
2867 langp_T *lp; | |
2868 suggest_T *stp; | |
2869 char_u *p; | |
2870 char_u badsound[MAXWLEN]; | |
2871 int round; | |
2872 int lpi; | |
2873 slang_T *slang = NULL; | |
2874 | |
2875 // Add the alternate score to su_ga. | |
2876 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) | |
2877 { | |
2878 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); | |
2879 if (lp->lp_slang->sl_sal.ga_len > 0) | |
2880 { | |
2881 // soundfold the bad word | |
2882 slang = lp->lp_slang; | |
2883 spell_soundfold(slang, su->su_fbadword, TRUE, badsound); | |
2884 | |
2885 for (i = 0; i < su->su_ga.ga_len; ++i) | |
2886 { | |
2887 stp = &SUG(su->su_ga, i); | |
2888 stp->st_altscore = stp_sal_score(stp, su, slang, badsound); | |
2889 if (stp->st_altscore == SCORE_MAXMAX) | |
2890 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4; | |
2891 else | |
2892 stp->st_score = (stp->st_score * 3 | |
2893 + stp->st_altscore) / 4; | |
2894 stp->st_salscore = FALSE; | |
2895 } | |
2896 break; | |
2897 } | |
2898 } | |
2899 | |
2900 if (slang == NULL) // Using "double" without sound folding. | |
2901 { | |
2902 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, | |
2903 su->su_maxcount); | |
2904 return; | |
2905 } | |
2906 | |
2907 // Add the alternate score to su_sga. | |
2908 for (i = 0; i < su->su_sga.ga_len; ++i) | |
2909 { | |
2910 stp = &SUG(su->su_sga, i); | |
2911 stp->st_altscore = spell_edit_score(slang, | |
2912 su->su_badword, stp->st_word); | |
2913 if (stp->st_score == SCORE_MAXMAX) | |
2914 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8; | |
2915 else | |
2916 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8; | |
2917 stp->st_salscore = TRUE; | |
2918 } | |
2919 | |
2920 // Remove bad suggestions, sort the suggestions and truncate at "maxcount" | |
2921 // for both lists. | |
2922 check_suggestions(su, &su->su_ga); | |
2923 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); | |
2924 check_suggestions(su, &su->su_sga); | |
2925 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount); | |
2926 | |
2927 ga_init2(&ga, (int)sizeof(suginfo_T), 1); | |
2928 if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL) | |
2929 return; | |
2930 | |
2931 stp = &SUG(ga, 0); | |
2932 for (i = 0; i < su->su_ga.ga_len || i < su->su_sga.ga_len; ++i) | |
2933 { | |
2934 // round 1: get a suggestion from su_ga | |
2935 // round 2: get a suggestion from su_sga | |
2936 for (round = 1; round <= 2; ++round) | |
2937 { | |
2938 gap = round == 1 ? &su->su_ga : &su->su_sga; | |
2939 if (i < gap->ga_len) | |
2940 { | |
2941 // Don't add a word if it's already there. | |
2942 p = SUG(*gap, i).st_word; | |
2943 for (j = 0; j < ga.ga_len; ++j) | |
2944 if (STRCMP(stp[j].st_word, p) == 0) | |
2945 break; | |
2946 if (j == ga.ga_len) | |
2947 stp[ga.ga_len++] = SUG(*gap, i); | |
2948 else | |
2949 vim_free(p); | |
2950 } | |
2951 } | |
2952 } | |
2953 | |
2954 ga_clear(&su->su_ga); | |
2955 ga_clear(&su->su_sga); | |
2956 | |
2957 // Truncate the list to the number of suggestions that will be displayed. | |
2958 if (ga.ga_len > su->su_maxcount) | |
2959 { | |
2960 for (i = su->su_maxcount; i < ga.ga_len; ++i) | |
2961 vim_free(stp[i].st_word); | |
2962 ga.ga_len = su->su_maxcount; | |
2963 } | |
2964 | |
2965 su->su_ga = ga; | |
2966 } | |
2967 | |
2968 /* | |
2969 * For the goodword in "stp" compute the soundalike score compared to the | |
2970 * badword. | |
2971 */ | |
2972 static int | |
2973 stp_sal_score( | |
2974 suggest_T *stp, | |
2975 suginfo_T *su, | |
2976 slang_T *slang, | |
2977 char_u *badsound) // sound-folded badword | |
2978 { | |
2979 char_u *p; | |
2980 char_u *pbad; | |
2981 char_u *pgood; | |
2982 char_u badsound2[MAXWLEN]; | |
2983 char_u fword[MAXWLEN]; | |
2984 char_u goodsound[MAXWLEN]; | |
2985 char_u goodword[MAXWLEN]; | |
2986 int lendiff; | |
2987 | |
2988 lendiff = (int)(su->su_badlen - stp->st_orglen); | |
2989 if (lendiff >= 0) | |
2990 pbad = badsound; | |
2991 else | |
2992 { | |
2993 // soundfold the bad word with more characters following | |
2994 (void)spell_casefold(su->su_badptr, stp->st_orglen, fword, MAXWLEN); | |
2995 | |
2996 // When joining two words the sound often changes a lot. E.g., "t he" | |
2997 // sounds like "t h" while "the" sounds like "@". Avoid that by | |
2998 // removing the space. Don't do it when the good word also contains a | |
2999 // space. | |
3000 if (VIM_ISWHITE(su->su_badptr[su->su_badlen]) | |
3001 && *skiptowhite(stp->st_word) == NUL) | |
3002 for (p = fword; *(p = skiptowhite(p)) != NUL; ) | |
3003 STRMOVE(p, p + 1); | |
3004 | |
3005 spell_soundfold(slang, fword, TRUE, badsound2); | |
3006 pbad = badsound2; | |
3007 } | |
3008 | |
3009 if (lendiff > 0 && stp->st_wordlen + lendiff < MAXWLEN) | |
3010 { | |
3011 // Add part of the bad word to the good word, so that we soundfold | |
3012 // what replaces the bad word. | |
3013 STRCPY(goodword, stp->st_word); | |
3014 vim_strncpy(goodword + stp->st_wordlen, | |
3015 su->su_badptr + su->su_badlen - lendiff, lendiff); | |
3016 pgood = goodword; | |
3017 } | |
3018 else | |
3019 pgood = stp->st_word; | |
3020 | |
3021 // Sound-fold the word and compute the score for the difference. | |
3022 spell_soundfold(slang, pgood, FALSE, goodsound); | |
3023 | |
3024 return soundalike_score(goodsound, pbad); | |
3025 } | |
3026 | |
3027 // structure used to store soundfolded words that add_sound_suggest() has | |
3028 // handled already. | |
3029 typedef struct | |
3030 { | |
3031 short sft_score; // lowest score used | |
3032 char_u sft_word[1]; // soundfolded word, actually longer | |
3033 } sftword_T; | |
3034 | |
3035 static sftword_T dumsft; | |
3036 #define HIKEY2SFT(p) ((sftword_T *)(p - (dumsft.sft_word - (char_u *)&dumsft))) | |
3037 #define HI2SFT(hi) HIKEY2SFT((hi)->hi_key) | |
3038 | |
3039 /* | |
3040 * Prepare for calling suggest_try_soundalike(). | |
3041 */ | |
3042 static void | |
3043 suggest_try_soundalike_prep(void) | |
3044 { | |
3045 langp_T *lp; | |
3046 int lpi; | |
3047 slang_T *slang; | |
3048 | |
3049 // Do this for all languages that support sound folding and for which a | |
3050 // .sug file has been loaded. | |
3051 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) | |
3052 { | |
3053 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); | |
3054 slang = lp->lp_slang; | |
3055 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) | |
3056 // prepare the hashtable used by add_sound_suggest() | |
3057 hash_init(&slang->sl_sounddone); | |
3058 } | |
3059 } | |
3060 | |
3061 /* | |
3062 * Find suggestions by comparing the word in a sound-a-like form. | |
3063 * Note: This doesn't support postponed prefixes. | |
3064 */ | |
3065 static void | |
3066 suggest_try_soundalike(suginfo_T *su) | |
3067 { | |
3068 char_u salword[MAXWLEN]; | |
3069 langp_T *lp; | |
3070 int lpi; | |
3071 slang_T *slang; | |
3072 | |
3073 // Do this for all languages that support sound folding and for which a | |
3074 // .sug file has been loaded. | |
3075 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) | |
3076 { | |
3077 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); | |
3078 slang = lp->lp_slang; | |
3079 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) | |
3080 { | |
3081 // soundfold the bad word | |
3082 spell_soundfold(slang, su->su_fbadword, TRUE, salword); | |
3083 | |
3084 // try all kinds of inserts/deletes/swaps/etc. | |
3085 // TODO: also soundfold the next words, so that we can try joining | |
3086 // and splitting | |
3087 #ifdef SUGGEST_PROFILE | |
3088 prof_init(); | |
3089 #endif | |
3090 suggest_trie_walk(su, lp, salword, TRUE); | |
3091 #ifdef SUGGEST_PROFILE | |
3092 prof_report("soundalike"); | |
3093 #endif | |
3094 } | |
3095 } | |
3096 } | |
3097 | |
3098 /* | |
3099 * Finish up after calling suggest_try_soundalike(). | |
3100 */ | |
3101 static void | |
3102 suggest_try_soundalike_finish(void) | |
3103 { | |
3104 langp_T *lp; | |
3105 int lpi; | |
3106 slang_T *slang; | |
3107 int todo; | |
3108 hashitem_T *hi; | |
3109 | |
3110 // Do this for all languages that support sound folding and for which a | |
3111 // .sug file has been loaded. | |
3112 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) | |
3113 { | |
3114 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); | |
3115 slang = lp->lp_slang; | |
3116 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) | |
3117 { | |
3118 // Free the info about handled words. | |
3119 todo = (int)slang->sl_sounddone.ht_used; | |
3120 for (hi = slang->sl_sounddone.ht_array; todo > 0; ++hi) | |
3121 if (!HASHITEM_EMPTY(hi)) | |
3122 { | |
3123 vim_free(HI2SFT(hi)); | |
3124 --todo; | |
3125 } | |
3126 | |
3127 // Clear the hashtable, it may also be used by another region. | |
3128 hash_clear(&slang->sl_sounddone); | |
3129 hash_init(&slang->sl_sounddone); | |
3130 } | |
3131 } | |
3132 } | |
3133 | |
3134 /* | |
3135 * A match with a soundfolded word is found. Add the good word(s) that | |
3136 * produce this soundfolded word. | |
3137 */ | |
3138 static void | |
3139 add_sound_suggest( | |
3140 suginfo_T *su, | |
3141 char_u *goodword, | |
3142 int score, // soundfold score | |
3143 langp_T *lp) | |
3144 { | |
3145 slang_T *slang = lp->lp_slang; // language for sound folding | |
3146 int sfwordnr; | |
3147 char_u *nrline; | |
3148 int orgnr; | |
3149 char_u theword[MAXWLEN]; | |
3150 int i; | |
3151 int wlen; | |
3152 char_u *byts; | |
3153 idx_T *idxs; | |
3154 int n; | |
3155 int wordcount; | |
3156 int wc; | |
3157 int goodscore; | |
3158 hash_T hash; | |
3159 hashitem_T *hi; | |
3160 sftword_T *sft; | |
3161 int bc, gc; | |
3162 int limit; | |
3163 | |
3164 // It's very well possible that the same soundfold word is found several | |
3165 // times with different scores. Since the following is quite slow only do | |
3166 // the words that have a better score than before. Use a hashtable to | |
3167 // remember the words that have been done. | |
3168 hash = hash_hash(goodword); | |
3169 hi = hash_lookup(&slang->sl_sounddone, goodword, hash); | |
3170 if (HASHITEM_EMPTY(hi)) | |
3171 { | |
3172 sft = alloc(sizeof(sftword_T) + STRLEN(goodword)); | |
3173 if (sft != NULL) | |
3174 { | |
3175 sft->sft_score = score; | |
3176 STRCPY(sft->sft_word, goodword); | |
3177 hash_add_item(&slang->sl_sounddone, hi, sft->sft_word, hash); | |
3178 } | |
3179 } | |
3180 else | |
3181 { | |
3182 sft = HI2SFT(hi); | |
3183 if (score >= sft->sft_score) | |
3184 return; | |
3185 sft->sft_score = score; | |
3186 } | |
3187 | |
3188 // Find the word nr in the soundfold tree. | |
3189 sfwordnr = soundfold_find(slang, goodword); | |
3190 if (sfwordnr < 0) | |
3191 { | |
3192 internal_error("add_sound_suggest()"); | |
3193 return; | |
3194 } | |
3195 | |
3196 // go over the list of good words that produce this soundfold word | |
3197 nrline = ml_get_buf(slang->sl_sugbuf, (linenr_T)(sfwordnr + 1), FALSE); | |
3198 orgnr = 0; | |
3199 while (*nrline != NUL) | |
3200 { | |
3201 // The wordnr was stored in a minimal nr of bytes as an offset to the | |
3202 // previous wordnr. | |
3203 orgnr += bytes2offset(&nrline); | |
3204 | |
3205 byts = slang->sl_fbyts; | |
3206 idxs = slang->sl_fidxs; | |
3207 | |
3208 // Lookup the word "orgnr" one of the two tries. | |
3209 n = 0; | |
3210 wordcount = 0; | |
3211 for (wlen = 0; wlen < MAXWLEN - 3; ++wlen) | |
3212 { | |
3213 i = 1; | |
3214 if (wordcount == orgnr && byts[n + 1] == NUL) | |
3215 break; // found end of word | |
3216 | |
3217 if (byts[n + 1] == NUL) | |
3218 ++wordcount; | |
3219 | |
3220 // skip over the NUL bytes | |
3221 for ( ; byts[n + i] == NUL; ++i) | |
3222 if (i > byts[n]) // safety check | |
3223 { | |
3224 STRCPY(theword + wlen, "BAD"); | |
3225 wlen += 3; | |
3226 goto badword; | |
3227 } | |
3228 | |
3229 // One of the siblings must have the word. | |
3230 for ( ; i < byts[n]; ++i) | |
3231 { | |
3232 wc = idxs[idxs[n + i]]; // nr of words under this byte | |
3233 if (wordcount + wc > orgnr) | |
3234 break; | |
3235 wordcount += wc; | |
3236 } | |
3237 | |
3238 theword[wlen] = byts[n + i]; | |
3239 n = idxs[n + i]; | |
3240 } | |
3241 badword: | |
3242 theword[wlen] = NUL; | |
3243 | |
3244 // Go over the possible flags and regions. | |
3245 for (; i <= byts[n] && byts[n + i] == NUL; ++i) | |
3246 { | |
3247 char_u cword[MAXWLEN]; | |
3248 char_u *p; | |
3249 int flags = (int)idxs[n + i]; | |
3250 | |
3251 // Skip words with the NOSUGGEST flag | |
3252 if (flags & WF_NOSUGGEST) | |
3253 continue; | |
3254 | |
3255 if (flags & WF_KEEPCAP) | |
3256 { | |
3257 // Must find the word in the keep-case tree. | |
3258 find_keepcap_word(slang, theword, cword); | |
3259 p = cword; | |
3260 } | |
3261 else | |
3262 { | |
3263 flags |= su->su_badflags; | |
3264 if ((flags & WF_CAPMASK) != 0) | |
3265 { | |
3266 // Need to fix case according to "flags". | |
3267 make_case_word(theword, cword, flags); | |
3268 p = cword; | |
3269 } | |
3270 else | |
3271 p = theword; | |
3272 } | |
3273 | |
3274 // Add the suggestion. | |
3275 if (sps_flags & SPS_DOUBLE) | |
3276 { | |
3277 // Add the suggestion if the score isn't too bad. | |
3278 if (score <= su->su_maxscore) | |
3279 add_suggestion(su, &su->su_sga, p, su->su_badlen, | |
3280 score, 0, FALSE, slang, FALSE); | |
3281 } | |
3282 else | |
3283 { | |
3284 // Add a penalty for words in another region. | |
3285 if ((flags & WF_REGION) | |
3286 && (((unsigned)flags >> 16) & lp->lp_region) == 0) | |
3287 goodscore = SCORE_REGION; | |
3288 else | |
3289 goodscore = 0; | |
3290 | |
3291 // Add a small penalty for changing the first letter from | |
3292 // lower to upper case. Helps for "tath" -> "Kath", which is | |
3293 // less common than "tath" -> "path". Don't do it when the | |
3294 // letter is the same, that has already been counted. | |
3295 gc = PTR2CHAR(p); | |
3296 if (SPELL_ISUPPER(gc)) | |
3297 { | |
3298 bc = PTR2CHAR(su->su_badword); | |
3299 if (!SPELL_ISUPPER(bc) | |
3300 && SPELL_TOFOLD(bc) != SPELL_TOFOLD(gc)) | |
3301 goodscore += SCORE_ICASE / 2; | |
3302 } | |
3303 | |
3304 // Compute the score for the good word. This only does letter | |
3305 // insert/delete/swap/replace. REP items are not considered, | |
3306 // which may make the score a bit higher. | |
3307 // Use a limit for the score to make it work faster. Use | |
3308 // MAXSCORE(), because RESCORE() will change the score. | |
3309 // If the limit is very high then the iterative method is | |
3310 // inefficient, using an array is quicker. | |
3311 limit = MAXSCORE(su->su_sfmaxscore - goodscore, score); | |
3312 if (limit > SCORE_LIMITMAX) | |
3313 goodscore += spell_edit_score(slang, su->su_badword, p); | |
3314 else | |
3315 goodscore += spell_edit_score_limit(slang, su->su_badword, | |
3316 p, limit); | |
3317 | |
3318 // When going over the limit don't bother to do the rest. | |
3319 if (goodscore < SCORE_MAXMAX) | |
3320 { | |
3321 // Give a bonus to words seen before. | |
3322 goodscore = score_wordcount_adj(slang, goodscore, p, FALSE); | |
3323 | |
3324 // Add the suggestion if the score isn't too bad. | |
3325 goodscore = RESCORE(goodscore, score); | |
3326 if (goodscore <= su->su_sfmaxscore) | |
3327 add_suggestion(su, &su->su_ga, p, su->su_badlen, | |
3328 goodscore, score, TRUE, slang, TRUE); | |
3329 } | |
3330 } | |
3331 } | |
3332 // smsg("word %s (%d): %s (%d)", sftword, sftnr, theword, orgnr); | |
3333 } | |
3334 } | |
3335 | |
3336 /* | |
3337 * Find word "word" in fold-case tree for "slang" and return the word number. | |
3338 */ | |
3339 static int | |
3340 soundfold_find(slang_T *slang, char_u *word) | |
3341 { | |
3342 idx_T arridx = 0; | |
3343 int len; | |
3344 int wlen = 0; | |
3345 int c; | |
3346 char_u *ptr = word; | |
3347 char_u *byts; | |
3348 idx_T *idxs; | |
3349 int wordnr = 0; | |
3350 | |
3351 byts = slang->sl_sbyts; | |
3352 idxs = slang->sl_sidxs; | |
3353 | |
3354 for (;;) | |
3355 { | |
3356 // First byte is the number of possible bytes. | |
3357 len = byts[arridx++]; | |
3358 | |
3359 // If the first possible byte is a zero the word could end here. | |
3360 // If the word ends we found the word. If not skip the NUL bytes. | |
3361 c = ptr[wlen]; | |
3362 if (byts[arridx] == NUL) | |
3363 { | |
3364 if (c == NUL) | |
3365 break; | |
3366 | |
3367 // Skip over the zeros, there can be several. | |
3368 while (len > 0 && byts[arridx] == NUL) | |
3369 { | |
3370 ++arridx; | |
3371 --len; | |
3372 } | |
3373 if (len == 0) | |
3374 return -1; // no children, word should have ended here | |
3375 ++wordnr; | |
3376 } | |
3377 | |
3378 // If the word ends we didn't find it. | |
3379 if (c == NUL) | |
3380 return -1; | |
3381 | |
3382 // Perform a binary search in the list of accepted bytes. | |
3383 if (c == TAB) // <Tab> is handled like <Space> | |
3384 c = ' '; | |
3385 while (byts[arridx] < c) | |
3386 { | |
3387 // The word count is in the first idxs[] entry of the child. | |
3388 wordnr += idxs[idxs[arridx]]; | |
3389 ++arridx; | |
3390 if (--len == 0) // end of the bytes, didn't find it | |
3391 return -1; | |
3392 } | |
3393 if (byts[arridx] != c) // didn't find the byte | |
3394 return -1; | |
3395 | |
3396 // Continue at the child (if there is one). | |
3397 arridx = idxs[arridx]; | |
3398 ++wlen; | |
3399 | |
3400 // One space in the good word may stand for several spaces in the | |
3401 // checked word. | |
3402 if (c == ' ') | |
3403 while (ptr[wlen] == ' ' || ptr[wlen] == TAB) | |
3404 ++wlen; | |
3405 } | |
3406 | |
3407 return wordnr; | |
3408 } | |
3409 | |
3410 /* | |
3411 * Return TRUE if "c1" and "c2" are similar characters according to the MAP | |
3412 * lines in the .aff file. | |
3413 */ | |
3414 static int | |
3415 similar_chars(slang_T *slang, int c1, int c2) | |
3416 { | |
3417 int m1, m2; | |
3418 char_u buf[MB_MAXBYTES + 1]; | |
3419 hashitem_T *hi; | |
3420 | |
3421 if (c1 >= 256) | |
3422 { | |
3423 buf[mb_char2bytes(c1, buf)] = 0; | |
3424 hi = hash_find(&slang->sl_map_hash, buf); | |
3425 if (HASHITEM_EMPTY(hi)) | |
3426 m1 = 0; | |
3427 else | |
3428 m1 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1); | |
3429 } | |
3430 else | |
3431 m1 = slang->sl_map_array[c1]; | |
3432 if (m1 == 0) | |
3433 return FALSE; | |
3434 | |
3435 | |
3436 if (c2 >= 256) | |
3437 { | |
3438 buf[mb_char2bytes(c2, buf)] = 0; | |
3439 hi = hash_find(&slang->sl_map_hash, buf); | |
3440 if (HASHITEM_EMPTY(hi)) | |
3441 m2 = 0; | |
3442 else | |
3443 m2 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1); | |
3444 } | |
3445 else | |
3446 m2 = slang->sl_map_array[c2]; | |
3447 | |
3448 return m1 == m2; | |
3449 } | |
3450 | |
3451 /* | |
3452 * Add a suggestion to the list of suggestions. | |
3453 * For a suggestion that is already in the list the lowest score is remembered. | |
3454 */ | |
3455 static void | |
3456 add_suggestion( | |
3457 suginfo_T *su, | |
3458 garray_T *gap, // either su_ga or su_sga | |
3459 char_u *goodword, | |
3460 int badlenarg, // len of bad word replaced with "goodword" | |
3461 int score, | |
3462 int altscore, | |
3463 int had_bonus, // value for st_had_bonus | |
3464 slang_T *slang, // language for sound folding | |
3465 int maxsf) // su_maxscore applies to soundfold score, | |
3466 // su_sfmaxscore to the total score. | |
3467 { | |
3468 int goodlen; // len of goodword changed | |
3469 int badlen; // len of bad word changed | |
3470 suggest_T *stp; | |
3471 suggest_T new_sug; | |
3472 int i; | |
3473 char_u *pgood, *pbad; | |
3474 | |
3475 // Minimize "badlen" for consistency. Avoids that changing "the the" to | |
3476 // "thee the" is added next to changing the first "the" the "thee". | |
3477 pgood = goodword + STRLEN(goodword); | |
3478 pbad = su->su_badptr + badlenarg; | |
3479 for (;;) | |
3480 { | |
3481 goodlen = (int)(pgood - goodword); | |
3482 badlen = (int)(pbad - su->su_badptr); | |
3483 if (goodlen <= 0 || badlen <= 0) | |
3484 break; | |
3485 MB_PTR_BACK(goodword, pgood); | |
3486 MB_PTR_BACK(su->su_badptr, pbad); | |
3487 if (has_mbyte) | |
3488 { | |
3489 if (mb_ptr2char(pgood) != mb_ptr2char(pbad)) | |
3490 break; | |
3491 } | |
3492 else if (*pgood != *pbad) | |
3493 break; | |
3494 } | |
3495 | |
3496 if (badlen == 0 && goodlen == 0) | |
3497 // goodword doesn't change anything; may happen for "the the" changing | |
3498 // the first "the" to itself. | |
3499 return; | |
3500 | |
3501 if (gap->ga_len == 0) | |
3502 i = -1; | |
3503 else | |
3504 { | |
3505 // Check if the word is already there. Also check the length that is | |
3506 // being replaced "thes," -> "these" is a different suggestion from | |
3507 // "thes" -> "these". | |
3508 stp = &SUG(*gap, 0); | |
3509 for (i = gap->ga_len; --i >= 0; ++stp) | |
3510 if (stp->st_wordlen == goodlen | |
3511 && stp->st_orglen == badlen | |
3512 && STRNCMP(stp->st_word, goodword, goodlen) == 0) | |
3513 { | |
3514 // Found it. Remember the word with the lowest score. | |
3515 if (stp->st_slang == NULL) | |
3516 stp->st_slang = slang; | |
3517 | |
3518 new_sug.st_score = score; | |
3519 new_sug.st_altscore = altscore; | |
3520 new_sug.st_had_bonus = had_bonus; | |
3521 | |
3522 if (stp->st_had_bonus != had_bonus) | |
3523 { | |
3524 // Only one of the two had the soundalike score computed. | |
3525 // Need to do that for the other one now, otherwise the | |
3526 // scores can't be compared. This happens because | |
3527 // suggest_try_change() doesn't compute the soundalike | |
3528 // word to keep it fast, while some special methods set | |
3529 // the soundalike score to zero. | |
3530 if (had_bonus) | |
3531 rescore_one(su, stp); | |
3532 else | |
3533 { | |
3534 new_sug.st_word = stp->st_word; | |
3535 new_sug.st_wordlen = stp->st_wordlen; | |
3536 new_sug.st_slang = stp->st_slang; | |
3537 new_sug.st_orglen = badlen; | |
3538 rescore_one(su, &new_sug); | |
3539 } | |
3540 } | |
3541 | |
3542 if (stp->st_score > new_sug.st_score) | |
3543 { | |
3544 stp->st_score = new_sug.st_score; | |
3545 stp->st_altscore = new_sug.st_altscore; | |
3546 stp->st_had_bonus = new_sug.st_had_bonus; | |
3547 } | |
3548 break; | |
3549 } | |
3550 } | |
3551 | |
3552 if (i < 0 && ga_grow(gap, 1) == OK) | |
3553 { | |
3554 // Add a suggestion. | |
3555 stp = &SUG(*gap, gap->ga_len); | |
3556 stp->st_word = vim_strnsave(goodword, goodlen); | |
3557 if (stp->st_word != NULL) | |
3558 { | |
3559 stp->st_wordlen = goodlen; | |
3560 stp->st_score = score; | |
3561 stp->st_altscore = altscore; | |
3562 stp->st_had_bonus = had_bonus; | |
3563 stp->st_orglen = badlen; | |
3564 stp->st_slang = slang; | |
3565 ++gap->ga_len; | |
3566 | |
3567 // If we have too many suggestions now, sort the list and keep | |
3568 // the best suggestions. | |
3569 if (gap->ga_len > SUG_MAX_COUNT(su)) | |
3570 { | |
3571 if (maxsf) | |
3572 su->su_sfmaxscore = cleanup_suggestions(gap, | |
3573 su->su_sfmaxscore, SUG_CLEAN_COUNT(su)); | |
3574 else | |
3575 su->su_maxscore = cleanup_suggestions(gap, | |
3576 su->su_maxscore, SUG_CLEAN_COUNT(su)); | |
3577 } | |
3578 } | |
3579 } | |
3580 } | |
3581 | |
3582 /* | |
3583 * Suggestions may in fact be flagged as errors. Esp. for banned words and | |
3584 * for split words, such as "the the". Remove these from the list here. | |
3585 */ | |
3586 static void | |
3587 check_suggestions( | |
3588 suginfo_T *su, | |
3589 garray_T *gap) // either su_ga or su_sga | |
3590 { | |
3591 suggest_T *stp; | |
3592 int i; | |
3593 char_u longword[MAXWLEN + 1]; | |
3594 int len; | |
3595 hlf_T attr; | |
3596 | |
3597 stp = &SUG(*gap, 0); | |
3598 for (i = gap->ga_len - 1; i >= 0; --i) | |
3599 { | |
3600 // Need to append what follows to check for "the the". | |
3601 vim_strncpy(longword, stp[i].st_word, MAXWLEN); | |
3602 len = stp[i].st_wordlen; | |
3603 vim_strncpy(longword + len, su->su_badptr + stp[i].st_orglen, | |
3604 MAXWLEN - len); | |
3605 attr = HLF_COUNT; | |
3606 (void)spell_check(curwin, longword, &attr, NULL, FALSE); | |
3607 if (attr != HLF_COUNT) | |
3608 { | |
3609 // Remove this entry. | |
3610 vim_free(stp[i].st_word); | |
3611 --gap->ga_len; | |
3612 if (i < gap->ga_len) | |
3613 mch_memmove(stp + i, stp + i + 1, | |
3614 sizeof(suggest_T) * (gap->ga_len - i)); | |
3615 } | |
3616 } | |
3617 } | |
3618 | |
3619 | |
3620 /* | |
3621 * Add a word to be banned. | |
3622 */ | |
3623 static void | |
3624 add_banned( | |
3625 suginfo_T *su, | |
3626 char_u *word) | |
3627 { | |
3628 char_u *s; | |
3629 hash_T hash; | |
3630 hashitem_T *hi; | |
3631 | |
3632 hash = hash_hash(word); | |
3633 hi = hash_lookup(&su->su_banned, word, hash); | |
3634 if (HASHITEM_EMPTY(hi)) | |
3635 { | |
3636 s = vim_strsave(word); | |
3637 if (s != NULL) | |
3638 hash_add_item(&su->su_banned, hi, s, hash); | |
3639 } | |
3640 } | |
3641 | |
3642 /* | |
3643 * Recompute the score for all suggestions if sound-folding is possible. This | |
3644 * is slow, thus only done for the final results. | |
3645 */ | |
3646 static void | |
3647 rescore_suggestions(suginfo_T *su) | |
3648 { | |
3649 int i; | |
3650 | |
3651 if (su->su_sallang != NULL) | |
3652 for (i = 0; i < su->su_ga.ga_len; ++i) | |
3653 rescore_one(su, &SUG(su->su_ga, i)); | |
3654 } | |
3655 | |
3656 /* | |
3657 * Recompute the score for one suggestion if sound-folding is possible. | |
3658 */ | |
3659 static void | |
3660 rescore_one(suginfo_T *su, suggest_T *stp) | |
3661 { | |
3662 slang_T *slang = stp->st_slang; | |
3663 char_u sal_badword[MAXWLEN]; | |
3664 char_u *p; | |
3665 | |
3666 // Only rescore suggestions that have no sal score yet and do have a | |
3667 // language. | |
3668 if (slang != NULL && slang->sl_sal.ga_len > 0 && !stp->st_had_bonus) | |
3669 { | |
3670 if (slang == su->su_sallang) | |
3671 p = su->su_sal_badword; | |
3672 else | |
3673 { | |
3674 spell_soundfold(slang, su->su_fbadword, TRUE, sal_badword); | |
3675 p = sal_badword; | |
3676 } | |
3677 | |
3678 stp->st_altscore = stp_sal_score(stp, su, slang, p); | |
3679 if (stp->st_altscore == SCORE_MAXMAX) | |
3680 stp->st_altscore = SCORE_BIG; | |
3681 stp->st_score = RESCORE(stp->st_score, stp->st_altscore); | |
3682 stp->st_had_bonus = TRUE; | |
3683 } | |
3684 } | |
3685 | |
3686 static int sug_compare(const void *s1, const void *s2); | |
3687 | |
3688 /* | |
3689 * Function given to qsort() to sort the suggestions on st_score. | |
3690 * First on "st_score", then "st_altscore" then alphabetically. | |
3691 */ | |
3692 static int | |
3693 sug_compare(const void *s1, const void *s2) | |
3694 { | |
3695 suggest_T *p1 = (suggest_T *)s1; | |
3696 suggest_T *p2 = (suggest_T *)s2; | |
3697 int n = p1->st_score - p2->st_score; | |
3698 | |
3699 if (n == 0) | |
3700 { | |
3701 n = p1->st_altscore - p2->st_altscore; | |
3702 if (n == 0) | |
3703 n = STRICMP(p1->st_word, p2->st_word); | |
3704 } | |
3705 return n; | |
3706 } | |
3707 | |
3708 /* | |
3709 * Cleanup the suggestions: | |
3710 * - Sort on score. | |
3711 * - Remove words that won't be displayed. | |
3712 * Returns the maximum score in the list or "maxscore" unmodified. | |
3713 */ | |
3714 static int | |
3715 cleanup_suggestions( | |
3716 garray_T *gap, | |
3717 int maxscore, | |
3718 int keep) // nr of suggestions to keep | |
3719 { | |
3720 suggest_T *stp = &SUG(*gap, 0); | |
3721 int i; | |
3722 | |
3723 // Sort the list. | |
3724 qsort(gap->ga_data, (size_t)gap->ga_len, sizeof(suggest_T), sug_compare); | |
3725 | |
3726 // Truncate the list to the number of suggestions that will be displayed. | |
3727 if (gap->ga_len > keep) | |
3728 { | |
3729 for (i = keep; i < gap->ga_len; ++i) | |
3730 vim_free(stp[i].st_word); | |
3731 gap->ga_len = keep; | |
3732 return stp[keep - 1].st_score; | |
3733 } | |
3734 return maxscore; | |
3735 } | |
3736 | |
3737 /* | |
3738 * Compute a score for two sound-a-like words. | |
3739 * This permits up to two inserts/deletes/swaps/etc. to keep things fast. | |
3740 * Instead of a generic loop we write out the code. That keeps it fast by | |
3741 * avoiding checks that will not be possible. | |
3742 */ | |
3743 static int | |
3744 soundalike_score( | |
3745 char_u *goodstart, // sound-folded good word | |
3746 char_u *badstart) // sound-folded bad word | |
3747 { | |
3748 char_u *goodsound = goodstart; | |
3749 char_u *badsound = badstart; | |
3750 int goodlen; | |
3751 int badlen; | |
3752 int n; | |
3753 char_u *pl, *ps; | |
3754 char_u *pl2, *ps2; | |
3755 int score = 0; | |
3756 | |
3757 // Adding/inserting "*" at the start (word starts with vowel) shouldn't be | |
3758 // counted so much, vowels halfway the word aren't counted at all. | |
3759 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound) | |
3760 { | |
3761 if ((badsound[0] == NUL && goodsound[1] == NUL) | |
3762 || (goodsound[0] == NUL && badsound[1] == NUL)) | |
3763 // changing word with vowel to word without a sound | |
3764 return SCORE_DEL; | |
3765 if (badsound[0] == NUL || goodsound[0] == NUL) | |
3766 // more than two changes | |
3767 return SCORE_MAXMAX; | |
3768 | |
3769 if (badsound[1] == goodsound[1] | |
3770 || (badsound[1] != NUL | |
3771 && goodsound[1] != NUL | |
3772 && badsound[2] == goodsound[2])) | |
3773 { | |
3774 // handle like a substitute | |
3775 } | |
3776 else | |
3777 { | |
3778 score = 2 * SCORE_DEL / 3; | |
3779 if (*badsound == '*') | |
3780 ++badsound; | |
3781 else | |
3782 ++goodsound; | |
3783 } | |
3784 } | |
3785 | |
3786 goodlen = (int)STRLEN(goodsound); | |
3787 badlen = (int)STRLEN(badsound); | |
3788 | |
3789 // Return quickly if the lengths are too different to be fixed by two | |
3790 // changes. | |
3791 n = goodlen - badlen; | |
3792 if (n < -2 || n > 2) | |
3793 return SCORE_MAXMAX; | |
3794 | |
3795 if (n > 0) | |
3796 { | |
3797 pl = goodsound; // goodsound is longest | |
3798 ps = badsound; | |
3799 } | |
3800 else | |
3801 { | |
3802 pl = badsound; // badsound is longest | |
3803 ps = goodsound; | |
3804 } | |
3805 | |
3806 // Skip over the identical part. | |
3807 while (*pl == *ps && *pl != NUL) | |
3808 { | |
3809 ++pl; | |
3810 ++ps; | |
3811 } | |
3812 | |
3813 switch (n) | |
3814 { | |
3815 case -2: | |
3816 case 2: | |
3817 // Must delete two characters from "pl". | |
3818 ++pl; // first delete | |
3819 while (*pl == *ps) | |
3820 { | |
3821 ++pl; | |
3822 ++ps; | |
3823 } | |
3824 // strings must be equal after second delete | |
3825 if (STRCMP(pl + 1, ps) == 0) | |
3826 return score + SCORE_DEL * 2; | |
3827 | |
3828 // Failed to compare. | |
3829 break; | |
3830 | |
3831 case -1: | |
3832 case 1: | |
3833 // Minimal one delete from "pl" required. | |
3834 | |
3835 // 1: delete | |
3836 pl2 = pl + 1; | |
3837 ps2 = ps; | |
3838 while (*pl2 == *ps2) | |
3839 { | |
3840 if (*pl2 == NUL) // reached the end | |
3841 return score + SCORE_DEL; | |
3842 ++pl2; | |
3843 ++ps2; | |
3844 } | |
3845 | |
3846 // 2: delete then swap, then rest must be equal | |
3847 if (pl2[0] == ps2[1] && pl2[1] == ps2[0] | |
3848 && STRCMP(pl2 + 2, ps2 + 2) == 0) | |
3849 return score + SCORE_DEL + SCORE_SWAP; | |
3850 | |
3851 // 3: delete then substitute, then the rest must be equal | |
3852 if (STRCMP(pl2 + 1, ps2 + 1) == 0) | |
3853 return score + SCORE_DEL + SCORE_SUBST; | |
3854 | |
3855 // 4: first swap then delete | |
3856 if (pl[0] == ps[1] && pl[1] == ps[0]) | |
3857 { | |
3858 pl2 = pl + 2; // swap, skip two chars | |
3859 ps2 = ps + 2; | |
3860 while (*pl2 == *ps2) | |
3861 { | |
3862 ++pl2; | |
3863 ++ps2; | |
3864 } | |
3865 // delete a char and then strings must be equal | |
3866 if (STRCMP(pl2 + 1, ps2) == 0) | |
3867 return score + SCORE_SWAP + SCORE_DEL; | |
3868 } | |
3869 | |
3870 // 5: first substitute then delete | |
3871 pl2 = pl + 1; // substitute, skip one char | |
3872 ps2 = ps + 1; | |
3873 while (*pl2 == *ps2) | |
3874 { | |
3875 ++pl2; | |
3876 ++ps2; | |
3877 } | |
3878 // delete a char and then strings must be equal | |
3879 if (STRCMP(pl2 + 1, ps2) == 0) | |
3880 return score + SCORE_SUBST + SCORE_DEL; | |
3881 | |
3882 // Failed to compare. | |
3883 break; | |
3884 | |
3885 case 0: | |
3886 // Lengths are equal, thus changes must result in same length: An | |
3887 // insert is only possible in combination with a delete. | |
3888 // 1: check if for identical strings | |
3889 if (*pl == NUL) | |
3890 return score; | |
3891 | |
3892 // 2: swap | |
3893 if (pl[0] == ps[1] && pl[1] == ps[0]) | |
3894 { | |
3895 pl2 = pl + 2; // swap, skip two chars | |
3896 ps2 = ps + 2; | |
3897 while (*pl2 == *ps2) | |
3898 { | |
3899 if (*pl2 == NUL) // reached the end | |
3900 return score + SCORE_SWAP; | |
3901 ++pl2; | |
3902 ++ps2; | |
3903 } | |
3904 // 3: swap and swap again | |
3905 if (pl2[0] == ps2[1] && pl2[1] == ps2[0] | |
3906 && STRCMP(pl2 + 2, ps2 + 2) == 0) | |
3907 return score + SCORE_SWAP + SCORE_SWAP; | |
3908 | |
3909 // 4: swap and substitute | |
3910 if (STRCMP(pl2 + 1, ps2 + 1) == 0) | |
3911 return score + SCORE_SWAP + SCORE_SUBST; | |
3912 } | |
3913 | |
3914 // 5: substitute | |
3915 pl2 = pl + 1; | |
3916 ps2 = ps + 1; | |
3917 while (*pl2 == *ps2) | |
3918 { | |
3919 if (*pl2 == NUL) // reached the end | |
3920 return score + SCORE_SUBST; | |
3921 ++pl2; | |
3922 ++ps2; | |
3923 } | |
3924 | |
3925 // 6: substitute and swap | |
3926 if (pl2[0] == ps2[1] && pl2[1] == ps2[0] | |
3927 && STRCMP(pl2 + 2, ps2 + 2) == 0) | |
3928 return score + SCORE_SUBST + SCORE_SWAP; | |
3929 | |
3930 // 7: substitute and substitute | |
3931 if (STRCMP(pl2 + 1, ps2 + 1) == 0) | |
3932 return score + SCORE_SUBST + SCORE_SUBST; | |
3933 | |
3934 // 8: insert then delete | |
3935 pl2 = pl; | |
3936 ps2 = ps + 1; | |
3937 while (*pl2 == *ps2) | |
3938 { | |
3939 ++pl2; | |
3940 ++ps2; | |
3941 } | |
3942 if (STRCMP(pl2 + 1, ps2) == 0) | |
3943 return score + SCORE_INS + SCORE_DEL; | |
3944 | |
3945 // 9: delete then insert | |
3946 pl2 = pl + 1; | |
3947 ps2 = ps; | |
3948 while (*pl2 == *ps2) | |
3949 { | |
3950 ++pl2; | |
3951 ++ps2; | |
3952 } | |
3953 if (STRCMP(pl2, ps2 + 1) == 0) | |
3954 return score + SCORE_INS + SCORE_DEL; | |
3955 | |
3956 // Failed to compare. | |
3957 break; | |
3958 } | |
3959 | |
3960 return SCORE_MAXMAX; | |
3961 } | |
3962 | |
3963 /* | |
3964 * Compute the "edit distance" to turn "badword" into "goodword". The less | |
3965 * deletes/inserts/substitutes/swaps are required the lower the score. | |
3966 * | |
3967 * The algorithm is described by Du and Chang, 1992. | |
3968 * The implementation of the algorithm comes from Aspell editdist.cpp, | |
3969 * edit_distance(). It has been converted from C++ to C and modified to | |
3970 * support multi-byte characters. | |
3971 */ | |
3972 static int | |
3973 spell_edit_score( | |
3974 slang_T *slang, | |
3975 char_u *badword, | |
3976 char_u *goodword) | |
3977 { | |
3978 int *cnt; | |
3979 int badlen, goodlen; // lengths including NUL | |
3980 int j, i; | |
3981 int t; | |
3982 int bc, gc; | |
3983 int pbc, pgc; | |
3984 char_u *p; | |
3985 int wbadword[MAXWLEN]; | |
3986 int wgoodword[MAXWLEN]; | |
3987 | |
3988 if (has_mbyte) | |
3989 { | |
3990 // Get the characters from the multi-byte strings and put them in an | |
3991 // int array for easy access. | |
3992 for (p = badword, badlen = 0; *p != NUL; ) | |
3993 wbadword[badlen++] = mb_cptr2char_adv(&p); | |
3994 wbadword[badlen++] = 0; | |
3995 for (p = goodword, goodlen = 0; *p != NUL; ) | |
3996 wgoodword[goodlen++] = mb_cptr2char_adv(&p); | |
3997 wgoodword[goodlen++] = 0; | |
3998 } | |
3999 else | |
4000 { | |
4001 badlen = (int)STRLEN(badword) + 1; | |
4002 goodlen = (int)STRLEN(goodword) + 1; | |
4003 } | |
4004 | |
4005 // We use "cnt" as an array: CNT(badword_idx, goodword_idx). | |
4006 #define CNT(a, b) cnt[(a) + (b) * (badlen + 1)] | |
4007 cnt = ALLOC_MULT(int, (badlen + 1) * (goodlen + 1)); | |
4008 if (cnt == NULL) | |
4009 return 0; // out of memory | |
4010 | |
4011 CNT(0, 0) = 0; | |
4012 for (j = 1; j <= goodlen; ++j) | |
4013 CNT(0, j) = CNT(0, j - 1) + SCORE_INS; | |
4014 | |
4015 for (i = 1; i <= badlen; ++i) | |
4016 { | |
4017 CNT(i, 0) = CNT(i - 1, 0) + SCORE_DEL; | |
4018 for (j = 1; j <= goodlen; ++j) | |
4019 { | |
4020 if (has_mbyte) | |
4021 { | |
4022 bc = wbadword[i - 1]; | |
4023 gc = wgoodword[j - 1]; | |
4024 } | |
4025 else | |
4026 { | |
4027 bc = badword[i - 1]; | |
4028 gc = goodword[j - 1]; | |
4029 } | |
4030 if (bc == gc) | |
4031 CNT(i, j) = CNT(i - 1, j - 1); | |
4032 else | |
4033 { | |
4034 // Use a better score when there is only a case difference. | |
4035 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) | |
4036 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1); | |
4037 else | |
4038 { | |
4039 // For a similar character use SCORE_SIMILAR. | |
4040 if (slang != NULL | |
4041 && slang->sl_has_map | |
4042 && similar_chars(slang, gc, bc)) | |
4043 CNT(i, j) = SCORE_SIMILAR + CNT(i - 1, j - 1); | |
4044 else | |
4045 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1); | |
4046 } | |
4047 | |
4048 if (i > 1 && j > 1) | |
4049 { | |
4050 if (has_mbyte) | |
4051 { | |
4052 pbc = wbadword[i - 2]; | |
4053 pgc = wgoodword[j - 2]; | |
4054 } | |
4055 else | |
4056 { | |
4057 pbc = badword[i - 2]; | |
4058 pgc = goodword[j - 2]; | |
4059 } | |
4060 if (bc == pgc && pbc == gc) | |
4061 { | |
4062 t = SCORE_SWAP + CNT(i - 2, j - 2); | |
4063 if (t < CNT(i, j)) | |
4064 CNT(i, j) = t; | |
4065 } | |
4066 } | |
4067 t = SCORE_DEL + CNT(i - 1, j); | |
4068 if (t < CNT(i, j)) | |
4069 CNT(i, j) = t; | |
4070 t = SCORE_INS + CNT(i, j - 1); | |
4071 if (t < CNT(i, j)) | |
4072 CNT(i, j) = t; | |
4073 } | |
4074 } | |
4075 } | |
4076 | |
4077 i = CNT(badlen - 1, goodlen - 1); | |
4078 vim_free(cnt); | |
4079 return i; | |
4080 } | |
4081 | |
4082 typedef struct | |
4083 { | |
4084 int badi; | |
4085 int goodi; | |
4086 int score; | |
4087 } limitscore_T; | |
4088 | |
4089 /* | |
4090 * Like spell_edit_score(), but with a limit on the score to make it faster. | |
4091 * May return SCORE_MAXMAX when the score is higher than "limit". | |
4092 * | |
4093 * This uses a stack for the edits still to be tried. | |
4094 * The idea comes from Aspell leditdist.cpp. Rewritten in C and added support | |
4095 * for multi-byte characters. | |
4096 */ | |
4097 static int | |
4098 spell_edit_score_limit( | |
4099 slang_T *slang, | |
4100 char_u *badword, | |
4101 char_u *goodword, | |
4102 int limit) | |
4103 { | |
4104 limitscore_T stack[10]; // allow for over 3 * 2 edits | |
4105 int stackidx; | |
4106 int bi, gi; | |
4107 int bi2, gi2; | |
4108 int bc, gc; | |
4109 int score; | |
4110 int score_off; | |
4111 int minscore; | |
4112 int round; | |
4113 | |
4114 // Multi-byte characters require a bit more work, use a different function | |
4115 // to avoid testing "has_mbyte" quite often. | |
4116 if (has_mbyte) | |
4117 return spell_edit_score_limit_w(slang, badword, goodword, limit); | |
4118 | |
4119 // The idea is to go from start to end over the words. So long as | |
4120 // characters are equal just continue, this always gives the lowest score. | |
4121 // When there is a difference try several alternatives. Each alternative | |
4122 // increases "score" for the edit distance. Some of the alternatives are | |
4123 // pushed unto a stack and tried later, some are tried right away. At the | |
4124 // end of the word the score for one alternative is known. The lowest | |
4125 // possible score is stored in "minscore". | |
4126 stackidx = 0; | |
4127 bi = 0; | |
4128 gi = 0; | |
4129 score = 0; | |
4130 minscore = limit + 1; | |
4131 | |
4132 for (;;) | |
4133 { | |
4134 // Skip over an equal part, score remains the same. | |
4135 for (;;) | |
4136 { | |
4137 bc = badword[bi]; | |
4138 gc = goodword[gi]; | |
4139 if (bc != gc) // stop at a char that's different | |
4140 break; | |
4141 if (bc == NUL) // both words end | |
4142 { | |
4143 if (score < minscore) | |
4144 minscore = score; | |
4145 goto pop; // do next alternative | |
4146 } | |
4147 ++bi; | |
4148 ++gi; | |
4149 } | |
4150 | |
4151 if (gc == NUL) // goodword ends, delete badword chars | |
4152 { | |
4153 do | |
4154 { | |
4155 if ((score += SCORE_DEL) >= minscore) | |
4156 goto pop; // do next alternative | |
4157 } while (badword[++bi] != NUL); | |
4158 minscore = score; | |
4159 } | |
4160 else if (bc == NUL) // badword ends, insert badword chars | |
4161 { | |
4162 do | |
4163 { | |
4164 if ((score += SCORE_INS) >= minscore) | |
4165 goto pop; // do next alternative | |
4166 } while (goodword[++gi] != NUL); | |
4167 minscore = score; | |
4168 } | |
4169 else // both words continue | |
4170 { | |
4171 // If not close to the limit, perform a change. Only try changes | |
4172 // that may lead to a lower score than "minscore". | |
4173 // round 0: try deleting a char from badword | |
4174 // round 1: try inserting a char in badword | |
4175 for (round = 0; round <= 1; ++round) | |
4176 { | |
4177 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS); | |
4178 if (score_off < minscore) | |
4179 { | |
4180 if (score_off + SCORE_EDIT_MIN >= minscore) | |
4181 { | |
4182 // Near the limit, rest of the words must match. We | |
4183 // can check that right now, no need to push an item | |
4184 // onto the stack. | |
4185 bi2 = bi + 1 - round; | |
4186 gi2 = gi + round; | |
4187 while (goodword[gi2] == badword[bi2]) | |
4188 { | |
4189 if (goodword[gi2] == NUL) | |
4190 { | |
4191 minscore = score_off; | |
4192 break; | |
4193 } | |
4194 ++bi2; | |
4195 ++gi2; | |
4196 } | |
4197 } | |
4198 else | |
4199 { | |
4200 // try deleting/inserting a character later | |
4201 stack[stackidx].badi = bi + 1 - round; | |
4202 stack[stackidx].goodi = gi + round; | |
4203 stack[stackidx].score = score_off; | |
4204 ++stackidx; | |
4205 } | |
4206 } | |
4207 } | |
4208 | |
4209 if (score + SCORE_SWAP < minscore) | |
4210 { | |
4211 // If swapping two characters makes a match then the | |
4212 // substitution is more expensive, thus there is no need to | |
4213 // try both. | |
4214 if (gc == badword[bi + 1] && bc == goodword[gi + 1]) | |
4215 { | |
4216 // Swap two characters, that is: skip them. | |
4217 gi += 2; | |
4218 bi += 2; | |
4219 score += SCORE_SWAP; | |
4220 continue; | |
4221 } | |
4222 } | |
4223 | |
4224 // Substitute one character for another which is the same | |
4225 // thing as deleting a character from both goodword and badword. | |
4226 // Use a better score when there is only a case difference. | |
4227 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) | |
4228 score += SCORE_ICASE; | |
4229 else | |
4230 { | |
4231 // For a similar character use SCORE_SIMILAR. | |
4232 if (slang != NULL | |
4233 && slang->sl_has_map | |
4234 && similar_chars(slang, gc, bc)) | |
4235 score += SCORE_SIMILAR; | |
4236 else | |
4237 score += SCORE_SUBST; | |
4238 } | |
4239 | |
4240 if (score < minscore) | |
4241 { | |
4242 // Do the substitution. | |
4243 ++gi; | |
4244 ++bi; | |
4245 continue; | |
4246 } | |
4247 } | |
4248 pop: | |
4249 // Get here to try the next alternative, pop it from the stack. | |
4250 if (stackidx == 0) // stack is empty, finished | |
4251 break; | |
4252 | |
4253 // pop an item from the stack | |
4254 --stackidx; | |
4255 gi = stack[stackidx].goodi; | |
4256 bi = stack[stackidx].badi; | |
4257 score = stack[stackidx].score; | |
4258 } | |
4259 | |
4260 // When the score goes over "limit" it may actually be much higher. | |
4261 // Return a very large number to avoid going below the limit when giving a | |
4262 // bonus. | |
4263 if (minscore > limit) | |
4264 return SCORE_MAXMAX; | |
4265 return minscore; | |
4266 } | |
4267 | |
4268 /* | |
4269 * Multi-byte version of spell_edit_score_limit(). | |
4270 * Keep it in sync with the above! | |
4271 */ | |
4272 static int | |
4273 spell_edit_score_limit_w( | |
4274 slang_T *slang, | |
4275 char_u *badword, | |
4276 char_u *goodword, | |
4277 int limit) | |
4278 { | |
4279 limitscore_T stack[10]; // allow for over 3 * 2 edits | |
4280 int stackidx; | |
4281 int bi, gi; | |
4282 int bi2, gi2; | |
4283 int bc, gc; | |
4284 int score; | |
4285 int score_off; | |
4286 int minscore; | |
4287 int round; | |
4288 char_u *p; | |
4289 int wbadword[MAXWLEN]; | |
4290 int wgoodword[MAXWLEN]; | |
4291 | |
4292 // Get the characters from the multi-byte strings and put them in an | |
4293 // int array for easy access. | |
4294 bi = 0; | |
4295 for (p = badword; *p != NUL; ) | |
4296 wbadword[bi++] = mb_cptr2char_adv(&p); | |
4297 wbadword[bi++] = 0; | |
4298 gi = 0; | |
4299 for (p = goodword; *p != NUL; ) | |
4300 wgoodword[gi++] = mb_cptr2char_adv(&p); | |
4301 wgoodword[gi++] = 0; | |
4302 | |
4303 // The idea is to go from start to end over the words. So long as | |
4304 // characters are equal just continue, this always gives the lowest score. | |
4305 // When there is a difference try several alternatives. Each alternative | |
4306 // increases "score" for the edit distance. Some of the alternatives are | |
4307 // pushed unto a stack and tried later, some are tried right away. At the | |
4308 // end of the word the score for one alternative is known. The lowest | |
4309 // possible score is stored in "minscore". | |
4310 stackidx = 0; | |
4311 bi = 0; | |
4312 gi = 0; | |
4313 score = 0; | |
4314 minscore = limit + 1; | |
4315 | |
4316 for (;;) | |
4317 { | |
4318 // Skip over an equal part, score remains the same. | |
4319 for (;;) | |
4320 { | |
4321 bc = wbadword[bi]; | |
4322 gc = wgoodword[gi]; | |
4323 | |
4324 if (bc != gc) // stop at a char that's different | |
4325 break; | |
4326 if (bc == NUL) // both words end | |
4327 { | |
4328 if (score < minscore) | |
4329 minscore = score; | |
4330 goto pop; // do next alternative | |
4331 } | |
4332 ++bi; | |
4333 ++gi; | |
4334 } | |
4335 | |
4336 if (gc == NUL) // goodword ends, delete badword chars | |
4337 { | |
4338 do | |
4339 { | |
4340 if ((score += SCORE_DEL) >= minscore) | |
4341 goto pop; // do next alternative | |
4342 } while (wbadword[++bi] != NUL); | |
4343 minscore = score; | |
4344 } | |
4345 else if (bc == NUL) // badword ends, insert badword chars | |
4346 { | |
4347 do | |
4348 { | |
4349 if ((score += SCORE_INS) >= minscore) | |
4350 goto pop; // do next alternative | |
4351 } while (wgoodword[++gi] != NUL); | |
4352 minscore = score; | |
4353 } | |
4354 else // both words continue | |
4355 { | |
4356 // If not close to the limit, perform a change. Only try changes | |
4357 // that may lead to a lower score than "minscore". | |
4358 // round 0: try deleting a char from badword | |
4359 // round 1: try inserting a char in badword | |
4360 for (round = 0; round <= 1; ++round) | |
4361 { | |
4362 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS); | |
4363 if (score_off < minscore) | |
4364 { | |
4365 if (score_off + SCORE_EDIT_MIN >= minscore) | |
4366 { | |
4367 // Near the limit, rest of the words must match. We | |
4368 // can check that right now, no need to push an item | |
4369 // onto the stack. | |
4370 bi2 = bi + 1 - round; | |
4371 gi2 = gi + round; | |
4372 while (wgoodword[gi2] == wbadword[bi2]) | |
4373 { | |
4374 if (wgoodword[gi2] == NUL) | |
4375 { | |
4376 minscore = score_off; | |
4377 break; | |
4378 } | |
4379 ++bi2; | |
4380 ++gi2; | |
4381 } | |
4382 } | |
4383 else | |
4384 { | |
4385 // try deleting a character from badword later | |
4386 stack[stackidx].badi = bi + 1 - round; | |
4387 stack[stackidx].goodi = gi + round; | |
4388 stack[stackidx].score = score_off; | |
4389 ++stackidx; | |
4390 } | |
4391 } | |
4392 } | |
4393 | |
4394 if (score + SCORE_SWAP < minscore) | |
4395 { | |
4396 // If swapping two characters makes a match then the | |
4397 // substitution is more expensive, thus there is no need to | |
4398 // try both. | |
4399 if (gc == wbadword[bi + 1] && bc == wgoodword[gi + 1]) | |
4400 { | |
4401 // Swap two characters, that is: skip them. | |
4402 gi += 2; | |
4403 bi += 2; | |
4404 score += SCORE_SWAP; | |
4405 continue; | |
4406 } | |
4407 } | |
4408 | |
4409 // Substitute one character for another which is the same | |
4410 // thing as deleting a character from both goodword and badword. | |
4411 // Use a better score when there is only a case difference. | |
4412 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) | |
4413 score += SCORE_ICASE; | |
4414 else | |
4415 { | |
4416 // For a similar character use SCORE_SIMILAR. | |
4417 if (slang != NULL | |
4418 && slang->sl_has_map | |
4419 && similar_chars(slang, gc, bc)) | |
4420 score += SCORE_SIMILAR; | |
4421 else | |
4422 score += SCORE_SUBST; | |
4423 } | |
4424 | |
4425 if (score < minscore) | |
4426 { | |
4427 // Do the substitution. | |
4428 ++gi; | |
4429 ++bi; | |
4430 continue; | |
4431 } | |
4432 } | |
4433 pop: | |
4434 // Get here to try the next alternative, pop it from the stack. | |
4435 if (stackidx == 0) // stack is empty, finished | |
4436 break; | |
4437 | |
4438 // pop an item from the stack | |
4439 --stackidx; | |
4440 gi = stack[stackidx].goodi; | |
4441 bi = stack[stackidx].badi; | |
4442 score = stack[stackidx].score; | |
4443 } | |
4444 | |
4445 // When the score goes over "limit" it may actually be much higher. | |
4446 // Return a very large number to avoid going below the limit when giving a | |
4447 // bonus. | |
4448 if (minscore > limit) | |
4449 return SCORE_MAXMAX; | |
4450 return minscore; | |
4451 } | |
4452 #endif // FEAT_SPELL |