comparison src/spellsuggest.c @ 18172:6e53d83e021d v8.1.2081

patch 8.1.2081: the spell.c file is too big Commit: https://github.com/vim/vim/commit/46a426c9acfdd3d6c0fa134a17681634b9325bee Author: Bram Moolenaar <Bram@vim.org> Date: Fri Sep 27 12:41:56 2019 +0200 patch 8.1.2081: the spell.c file is too big Problem: The spell.c file is too big. Solution: Move the code for spell suggestions to a separate file. (Yegappan Lakshmanan, closes #4988)
author Bram Moolenaar <Bram@vim.org>
date Fri, 27 Sep 2019 12:45:08 +0200
parents
children c8a53c0daeed
comparison
equal deleted inserted replaced
18171:b6145168fb4b 18172:6e53d83e021d
1 /* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * VIM - Vi IMproved by Bram Moolenaar
4 *
5 * Do ":help uganda" in Vim to read copying and usage conditions.
6 * Do ":help credits" in Vim to see a list of people who contributed.
7 * See README.txt for an overview of the Vim source code.
8 */
9
10 /*
11 * spellsuggest.c: functions for spelling suggestions
12 */
13
14 #include "vim.h"
15
16 #if defined(FEAT_SPELL) || defined(PROTO)
17
18 /*
19 * Use this to adjust the score after finding suggestions, based on the
20 * suggested word sounding like the bad word. This is much faster than doing
21 * it for every possible suggestion.
22 * Disadvantage: When "the" is typed as "hte" it sounds quite different ("@"
23 * vs "ht") and goes down in the list.
24 * Used when 'spellsuggest' is set to "best".
25 */
26 #define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4)
27
28 /*
29 * Do the opposite: based on a maximum end score and a known sound score,
30 * compute the maximum word score that can be used.
31 */
32 #define MAXSCORE(word_score, sound_score) ((4 * word_score - sound_score) / 3)
33
34 // only used for su_badflags
35 #define WF_MIXCAP 0x20 // mix of upper and lower case: macaRONI
36
37 /*
38 * Information used when looking for suggestions.
39 */
40 typedef struct suginfo_S
41 {
42 garray_T su_ga; // suggestions, contains "suggest_T"
43 int su_maxcount; // max. number of suggestions displayed
44 int su_maxscore; // maximum score for adding to su_ga
45 int su_sfmaxscore; // idem, for when doing soundfold words
46 garray_T su_sga; // like su_ga, sound-folded scoring
47 char_u *su_badptr; // start of bad word in line
48 int su_badlen; // length of detected bad word in line
49 int su_badflags; // caps flags for bad word
50 char_u su_badword[MAXWLEN]; // bad word truncated at su_badlen
51 char_u su_fbadword[MAXWLEN]; // su_badword case-folded
52 char_u su_sal_badword[MAXWLEN]; // su_badword soundfolded
53 hashtab_T su_banned; // table with banned words
54 slang_T *su_sallang; // default language for sound folding
55 } suginfo_T;
56
57 // One word suggestion. Used in "si_ga".
58 typedef struct suggest_S
59 {
60 char_u *st_word; // suggested word, allocated string
61 int st_wordlen; // STRLEN(st_word)
62 int st_orglen; // length of replaced text
63 int st_score; // lower is better
64 int st_altscore; // used when st_score compares equal
65 int st_salscore; // st_score is for soundalike
66 int st_had_bonus; // bonus already included in score
67 slang_T *st_slang; // language used for sound folding
68 } suggest_T;
69
70 #define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i])
71
72 // TRUE if a word appears in the list of banned words.
73 #define WAS_BANNED(su, word) (!HASHITEM_EMPTY(hash_find(&su->su_banned, word)))
74
75 // Number of suggestions kept when cleaning up. We need to keep more than
76 // what is displayed, because when rescore_suggestions() is called the score
77 // may change and wrong suggestions may be removed later.
78 #define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 130 ? 150 : (su)->su_maxcount + 20)
79
80 // Threshold for sorting and cleaning up suggestions. Don't want to keep lots
81 // of suggestions that are not going to be displayed.
82 #define SUG_MAX_COUNT(su) (SUG_CLEAN_COUNT(su) + 50)
83
84 // score for various changes
85 #define SCORE_SPLIT 149 // split bad word
86 #define SCORE_SPLIT_NO 249 // split bad word with NOSPLITSUGS
87 #define SCORE_ICASE 52 // slightly different case
88 #define SCORE_REGION 200 // word is for different region
89 #define SCORE_RARE 180 // rare word
90 #define SCORE_SWAP 75 // swap two characters
91 #define SCORE_SWAP3 110 // swap two characters in three
92 #define SCORE_REP 65 // REP replacement
93 #define SCORE_SUBST 93 // substitute a character
94 #define SCORE_SIMILAR 33 // substitute a similar character
95 #define SCORE_SUBCOMP 33 // substitute a composing character
96 #define SCORE_DEL 94 // delete a character
97 #define SCORE_DELDUP 66 // delete a duplicated character
98 #define SCORE_DELCOMP 28 // delete a composing character
99 #define SCORE_INS 96 // insert a character
100 #define SCORE_INSDUP 67 // insert a duplicate character
101 #define SCORE_INSCOMP 30 // insert a composing character
102 #define SCORE_NONWORD 103 // change non-word to word char
103
104 #define SCORE_FILE 30 // suggestion from a file
105 #define SCORE_MAXINIT 350 // Initial maximum score: higher == slower.
106 // 350 allows for about three changes.
107
108 #define SCORE_COMMON1 30 // subtracted for words seen before
109 #define SCORE_COMMON2 40 // subtracted for words often seen
110 #define SCORE_COMMON3 50 // subtracted for words very often seen
111 #define SCORE_THRES2 10 // word count threshold for COMMON2
112 #define SCORE_THRES3 100 // word count threshold for COMMON3
113
114 // When trying changed soundfold words it becomes slow when trying more than
115 // two changes. With less then two changes it's slightly faster but we miss a
116 // few good suggestions. In rare cases we need to try three of four changes.
117 #define SCORE_SFMAX1 200 // maximum score for first try
118 #define SCORE_SFMAX2 300 // maximum score for second try
119 #define SCORE_SFMAX3 400 // maximum score for third try
120
121 #define SCORE_BIG SCORE_INS * 3 // big difference
122 #define SCORE_MAXMAX 999999 // accept any score
123 #define SCORE_LIMITMAX 350 // for spell_edit_score_limit()
124
125 // for spell_edit_score_limit() we need to know the minimum value of
126 // SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS
127 #define SCORE_EDIT_MIN SCORE_SIMILAR
128
129 /*
130 * For finding suggestions: At each node in the tree these states are tried:
131 */
132 typedef enum
133 {
134 STATE_START = 0, // At start of node check for NUL bytes (goodword
135 // ends); if badword ends there is a match, otherwise
136 // try splitting word.
137 STATE_NOPREFIX, // try without prefix
138 STATE_SPLITUNDO, // Undo splitting.
139 STATE_ENDNUL, // Past NUL bytes at start of the node.
140 STATE_PLAIN, // Use each byte of the node.
141 STATE_DEL, // Delete a byte from the bad word.
142 STATE_INS_PREP, // Prepare for inserting bytes.
143 STATE_INS, // Insert a byte in the bad word.
144 STATE_SWAP, // Swap two bytes.
145 STATE_UNSWAP, // Undo swap two characters.
146 STATE_SWAP3, // Swap two characters over three.
147 STATE_UNSWAP3, // Undo Swap two characters over three.
148 STATE_UNROT3L, // Undo rotate three characters left
149 STATE_UNROT3R, // Undo rotate three characters right
150 STATE_REP_INI, // Prepare for using REP items.
151 STATE_REP, // Use matching REP items from the .aff file.
152 STATE_REP_UNDO, // Undo a REP item replacement.
153 STATE_FINAL // End of this node.
154 } state_T;
155
156 /*
157 * Struct to keep the state at each level in suggest_try_change().
158 */
159 typedef struct trystate_S
160 {
161 state_T ts_state; // state at this level, STATE_
162 int ts_score; // score
163 idx_T ts_arridx; // index in tree array, start of node
164 short ts_curi; // index in list of child nodes
165 char_u ts_fidx; // index in fword[], case-folded bad word
166 char_u ts_fidxtry; // ts_fidx at which bytes may be changed
167 char_u ts_twordlen; // valid length of tword[]
168 char_u ts_prefixdepth; // stack depth for end of prefix or
169 // PFD_PREFIXTREE or PFD_NOPREFIX
170 char_u ts_flags; // TSF_ flags
171 char_u ts_tcharlen; // number of bytes in tword character
172 char_u ts_tcharidx; // current byte index in tword character
173 char_u ts_isdiff; // DIFF_ values
174 char_u ts_fcharstart; // index in fword where badword char started
175 char_u ts_prewordlen; // length of word in "preword[]"
176 char_u ts_splitoff; // index in "tword" after last split
177 char_u ts_splitfidx; // "ts_fidx" at word split
178 char_u ts_complen; // nr of compound words used
179 char_u ts_compsplit; // index for "compflags" where word was spit
180 char_u ts_save_badflags; // su_badflags saved here
181 char_u ts_delidx; // index in fword for char that was deleted,
182 // valid when "ts_flags" has TSF_DIDDEL
183 } trystate_T;
184
185 // values for ts_isdiff
186 #define DIFF_NONE 0 // no different byte (yet)
187 #define DIFF_YES 1 // different byte found
188 #define DIFF_INSERT 2 // inserting character
189
190 // values for ts_flags
191 #define TSF_PREFIXOK 1 // already checked that prefix is OK
192 #define TSF_DIDSPLIT 2 // tried split at this point
193 #define TSF_DIDDEL 4 // did a delete, "ts_delidx" has index
194
195 // special values ts_prefixdepth
196 #define PFD_NOPREFIX 0xff // not using prefixes
197 #define PFD_PREFIXTREE 0xfe // walking through the prefix tree
198 #define PFD_NOTSPECIAL 0xfd // highest value that's not special
199
200 static void spell_find_suggest(char_u *badptr, int badlen, suginfo_T *su, int maxcount, int banbadword, int need_cap, int interactive);
201 #ifdef FEAT_EVAL
202 static void spell_suggest_expr(suginfo_T *su, char_u *expr);
203 #endif
204 static void spell_suggest_file(suginfo_T *su, char_u *fname);
205 static void spell_suggest_intern(suginfo_T *su, int interactive);
206 static void spell_find_cleanup(suginfo_T *su);
207 static void suggest_try_special(suginfo_T *su);
208 static void suggest_try_change(suginfo_T *su);
209 static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char_u *fword, int soundfold);
210 static void go_deeper(trystate_T *stack, int depth, int score_add);
211 static void find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword);
212 static void score_comp_sal(suginfo_T *su);
213 static void score_combine(suginfo_T *su);
214 static int stp_sal_score(suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound);
215 static void suggest_try_soundalike_prep(void);
216 static void suggest_try_soundalike(suginfo_T *su);
217 static void suggest_try_soundalike_finish(void);
218 static void add_sound_suggest(suginfo_T *su, char_u *goodword, int score, langp_T *lp);
219 static int soundfold_find(slang_T *slang, char_u *word);
220 static int similar_chars(slang_T *slang, int c1, int c2);
221 static void add_suggestion(suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus, slang_T *slang, int maxsf);
222 static void check_suggestions(suginfo_T *su, garray_T *gap);
223 static void add_banned(suginfo_T *su, char_u *word);
224 static void rescore_suggestions(suginfo_T *su);
225 static void rescore_one(suginfo_T *su, suggest_T *stp);
226 static int cleanup_suggestions(garray_T *gap, int maxscore, int keep);
227 static int soundalike_score(char_u *goodsound, char_u *badsound);
228 static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword);
229 static int spell_edit_score_limit(slang_T *slang, char_u *badword, char_u *goodword, int limit);
230 static int spell_edit_score_limit_w(slang_T *slang, char_u *badword, char_u *goodword, int limit);
231
232 /*
233 * Return TRUE when the sequence of flags in "compflags" plus "flag" can
234 * possibly form a valid compounded word. This also checks the COMPOUNDRULE
235 * lines if they don't contain wildcards.
236 */
237 static int
238 can_be_compound(
239 trystate_T *sp,
240 slang_T *slang,
241 char_u *compflags,
242 int flag)
243 {
244 // If the flag doesn't appear in sl_compstartflags or sl_compallflags
245 // then it can't possibly compound.
246 if (!byte_in_str(sp->ts_complen == sp->ts_compsplit
247 ? slang->sl_compstartflags : slang->sl_compallflags, flag))
248 return FALSE;
249
250 // If there are no wildcards, we can check if the flags collected so far
251 // possibly can form a match with COMPOUNDRULE patterns. This only
252 // makes sense when we have two or more words.
253 if (slang->sl_comprules != NULL && sp->ts_complen > sp->ts_compsplit)
254 {
255 int v;
256
257 compflags[sp->ts_complen] = flag;
258 compflags[sp->ts_complen + 1] = NUL;
259 v = match_compoundrule(slang, compflags + sp->ts_compsplit);
260 compflags[sp->ts_complen] = NUL;
261 return v;
262 }
263
264 return TRUE;
265 }
266
267 /*
268 * Adjust the score of common words.
269 */
270 static int
271 score_wordcount_adj(
272 slang_T *slang,
273 int score,
274 char_u *word,
275 int split) // word was split, less bonus
276 {
277 hashitem_T *hi;
278 wordcount_T *wc;
279 int bonus;
280 int newscore;
281
282 hi = hash_find(&slang->sl_wordcount, word);
283 if (!HASHITEM_EMPTY(hi))
284 {
285 wc = HI2WC(hi);
286 if (wc->wc_count < SCORE_THRES2)
287 bonus = SCORE_COMMON1;
288 else if (wc->wc_count < SCORE_THRES3)
289 bonus = SCORE_COMMON2;
290 else
291 bonus = SCORE_COMMON3;
292 if (split)
293 newscore = score - bonus / 2;
294 else
295 newscore = score - bonus;
296 if (newscore < 0)
297 return 0;
298 return newscore;
299 }
300 return score;
301 }
302
303 /*
304 * Like captype() but for a KEEPCAP word add ONECAP if the word starts with a
305 * capital. So that make_case_word() can turn WOrd into Word.
306 * Add ALLCAP for "WOrD".
307 */
308 static int
309 badword_captype(char_u *word, char_u *end)
310 {
311 int flags = captype(word, end);
312 int c;
313 int l, u;
314 int first;
315 char_u *p;
316
317 if (flags & WF_KEEPCAP)
318 {
319 // Count the number of UPPER and lower case letters.
320 l = u = 0;
321 first = FALSE;
322 for (p = word; p < end; MB_PTR_ADV(p))
323 {
324 c = PTR2CHAR(p);
325 if (SPELL_ISUPPER(c))
326 {
327 ++u;
328 if (p == word)
329 first = TRUE;
330 }
331 else
332 ++l;
333 }
334
335 // If there are more UPPER than lower case letters suggest an
336 // ALLCAP word. Otherwise, if the first letter is UPPER then
337 // suggest ONECAP. Exception: "ALl" most likely should be "All",
338 // require three upper case letters.
339 if (u > l && u > 2)
340 flags |= WF_ALLCAP;
341 else if (first)
342 flags |= WF_ONECAP;
343
344 if (u >= 2 && l >= 2) // maCARONI maCAroni
345 flags |= WF_MIXCAP;
346 }
347 return flags;
348 }
349
350 /*
351 * Opposite of offset2bytes().
352 * "pp" points to the bytes and is advanced over it.
353 * Returns the offset.
354 */
355 static int
356 bytes2offset(char_u **pp)
357 {
358 char_u *p = *pp;
359 int nr;
360 int c;
361
362 c = *p++;
363 if ((c & 0x80) == 0x00) // 1 byte
364 {
365 nr = c - 1;
366 }
367 else if ((c & 0xc0) == 0x80) // 2 bytes
368 {
369 nr = (c & 0x3f) - 1;
370 nr = nr * 255 + (*p++ - 1);
371 }
372 else if ((c & 0xe0) == 0xc0) // 3 bytes
373 {
374 nr = (c & 0x1f) - 1;
375 nr = nr * 255 + (*p++ - 1);
376 nr = nr * 255 + (*p++ - 1);
377 }
378 else // 4 bytes
379 {
380 nr = (c & 0x0f) - 1;
381 nr = nr * 255 + (*p++ - 1);
382 nr = nr * 255 + (*p++ - 1);
383 nr = nr * 255 + (*p++ - 1);
384 }
385
386 *pp = p;
387 return nr;
388 }
389
390 // values for sps_flags
391 #define SPS_BEST 1
392 #define SPS_FAST 2
393 #define SPS_DOUBLE 4
394
395 static int sps_flags = SPS_BEST; // flags from 'spellsuggest'
396 static int sps_limit = 9999; // max nr of suggestions given
397
398 /*
399 * Check the 'spellsuggest' option. Return FAIL if it's wrong.
400 * Sets "sps_flags" and "sps_limit".
401 */
402 int
403 spell_check_sps(void)
404 {
405 char_u *p;
406 char_u *s;
407 char_u buf[MAXPATHL];
408 int f;
409
410 sps_flags = 0;
411 sps_limit = 9999;
412
413 for (p = p_sps; *p != NUL; )
414 {
415 copy_option_part(&p, buf, MAXPATHL, ",");
416
417 f = 0;
418 if (VIM_ISDIGIT(*buf))
419 {
420 s = buf;
421 sps_limit = getdigits(&s);
422 if (*s != NUL && !VIM_ISDIGIT(*s))
423 f = -1;
424 }
425 else if (STRCMP(buf, "best") == 0)
426 f = SPS_BEST;
427 else if (STRCMP(buf, "fast") == 0)
428 f = SPS_FAST;
429 else if (STRCMP(buf, "double") == 0)
430 f = SPS_DOUBLE;
431 else if (STRNCMP(buf, "expr:", 5) != 0
432 && STRNCMP(buf, "file:", 5) != 0)
433 f = -1;
434
435 if (f == -1 || (sps_flags != 0 && f != 0))
436 {
437 sps_flags = SPS_BEST;
438 sps_limit = 9999;
439 return FAIL;
440 }
441 if (f != 0)
442 sps_flags = f;
443 }
444
445 if (sps_flags == 0)
446 sps_flags = SPS_BEST;
447
448 return OK;
449 }
450
451 /*
452 * "z=": Find badly spelled word under or after the cursor.
453 * Give suggestions for the properly spelled word.
454 * In Visual mode use the highlighted word as the bad word.
455 * When "count" is non-zero use that suggestion.
456 */
457 void
458 spell_suggest(int count)
459 {
460 char_u *line;
461 pos_T prev_cursor = curwin->w_cursor;
462 char_u wcopy[MAXWLEN + 2];
463 char_u *p;
464 int i;
465 int c;
466 suginfo_T sug;
467 suggest_T *stp;
468 int mouse_used;
469 int need_cap;
470 int limit;
471 int selected = count;
472 int badlen = 0;
473 int msg_scroll_save = msg_scroll;
474
475 if (no_spell_checking(curwin))
476 return;
477
478 if (VIsual_active)
479 {
480 // Use the Visually selected text as the bad word. But reject
481 // a multi-line selection.
482 if (curwin->w_cursor.lnum != VIsual.lnum)
483 {
484 vim_beep(BO_SPELL);
485 return;
486 }
487 badlen = (int)curwin->w_cursor.col - (int)VIsual.col;
488 if (badlen < 0)
489 badlen = -badlen;
490 else
491 curwin->w_cursor.col = VIsual.col;
492 ++badlen;
493 end_visual_mode();
494 }
495 // Find the start of the badly spelled word.
496 else if (spell_move_to(curwin, FORWARD, TRUE, TRUE, NULL) == 0
497 || curwin->w_cursor.col > prev_cursor.col)
498 {
499 // No bad word or it starts after the cursor: use the word under the
500 // cursor.
501 curwin->w_cursor = prev_cursor;
502 line = ml_get_curline();
503 p = line + curwin->w_cursor.col;
504 // Backup to before start of word.
505 while (p > line && spell_iswordp_nmw(p, curwin))
506 MB_PTR_BACK(line, p);
507 // Forward to start of word.
508 while (*p != NUL && !spell_iswordp_nmw(p, curwin))
509 MB_PTR_ADV(p);
510
511 if (!spell_iswordp_nmw(p, curwin)) // No word found.
512 {
513 beep_flush();
514 return;
515 }
516 curwin->w_cursor.col = (colnr_T)(p - line);
517 }
518
519 // Get the word and its length.
520
521 // Figure out if the word should be capitalised.
522 need_cap = check_need_cap(curwin->w_cursor.lnum, curwin->w_cursor.col);
523
524 // Make a copy of current line since autocommands may free the line.
525 line = vim_strsave(ml_get_curline());
526 if (line == NULL)
527 goto skip;
528
529 // Get the list of suggestions. Limit to 'lines' - 2 or the number in
530 // 'spellsuggest', whatever is smaller.
531 if (sps_limit > (int)Rows - 2)
532 limit = (int)Rows - 2;
533 else
534 limit = sps_limit;
535 spell_find_suggest(line + curwin->w_cursor.col, badlen, &sug, limit,
536 TRUE, need_cap, TRUE);
537
538 if (sug.su_ga.ga_len == 0)
539 msg(_("Sorry, no suggestions"));
540 else if (count > 0)
541 {
542 if (count > sug.su_ga.ga_len)
543 smsg(_("Sorry, only %ld suggestions"),
544 (long)sug.su_ga.ga_len);
545 }
546 else
547 {
548 VIM_CLEAR(repl_from);
549 VIM_CLEAR(repl_to);
550
551 #ifdef FEAT_RIGHTLEFT
552 // When 'rightleft' is set the list is drawn right-left.
553 cmdmsg_rl = curwin->w_p_rl;
554 if (cmdmsg_rl)
555 msg_col = Columns - 1;
556 #endif
557
558 // List the suggestions.
559 msg_start();
560 msg_row = Rows - 1; // for when 'cmdheight' > 1
561 lines_left = Rows; // avoid more prompt
562 vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"),
563 sug.su_badlen, sug.su_badptr);
564 #ifdef FEAT_RIGHTLEFT
565 if (cmdmsg_rl && STRNCMP(IObuff, "Change", 6) == 0)
566 {
567 // And now the rabbit from the high hat: Avoid showing the
568 // untranslated message rightleft.
569 vim_snprintf((char *)IObuff, IOSIZE, ":ot \"%.*s\" egnahC",
570 sug.su_badlen, sug.su_badptr);
571 }
572 #endif
573 msg_puts((char *)IObuff);
574 msg_clr_eos();
575 msg_putchar('\n');
576
577 msg_scroll = TRUE;
578 for (i = 0; i < sug.su_ga.ga_len; ++i)
579 {
580 stp = &SUG(sug.su_ga, i);
581
582 // The suggested word may replace only part of the bad word, add
583 // the not replaced part.
584 vim_strncpy(wcopy, stp->st_word, MAXWLEN);
585 if (sug.su_badlen > stp->st_orglen)
586 vim_strncpy(wcopy + stp->st_wordlen,
587 sug.su_badptr + stp->st_orglen,
588 sug.su_badlen - stp->st_orglen);
589 vim_snprintf((char *)IObuff, IOSIZE, "%2d", i + 1);
590 #ifdef FEAT_RIGHTLEFT
591 if (cmdmsg_rl)
592 rl_mirror(IObuff);
593 #endif
594 msg_puts((char *)IObuff);
595
596 vim_snprintf((char *)IObuff, IOSIZE, " \"%s\"", wcopy);
597 msg_puts((char *)IObuff);
598
599 // The word may replace more than "su_badlen".
600 if (sug.su_badlen < stp->st_orglen)
601 {
602 vim_snprintf((char *)IObuff, IOSIZE, _(" < \"%.*s\""),
603 stp->st_orglen, sug.su_badptr);
604 msg_puts((char *)IObuff);
605 }
606
607 if (p_verbose > 0)
608 {
609 // Add the score.
610 if (sps_flags & (SPS_DOUBLE | SPS_BEST))
611 vim_snprintf((char *)IObuff, IOSIZE, " (%s%d - %d)",
612 stp->st_salscore ? "s " : "",
613 stp->st_score, stp->st_altscore);
614 else
615 vim_snprintf((char *)IObuff, IOSIZE, " (%d)",
616 stp->st_score);
617 #ifdef FEAT_RIGHTLEFT
618 if (cmdmsg_rl)
619 // Mirror the numbers, but keep the leading space.
620 rl_mirror(IObuff + 1);
621 #endif
622 msg_advance(30);
623 msg_puts((char *)IObuff);
624 }
625 msg_putchar('\n');
626 }
627
628 #ifdef FEAT_RIGHTLEFT
629 cmdmsg_rl = FALSE;
630 msg_col = 0;
631 #endif
632 // Ask for choice.
633 selected = prompt_for_number(&mouse_used);
634 if (mouse_used)
635 selected -= lines_left;
636 lines_left = Rows; // avoid more prompt
637 // don't delay for 'smd' in normal_cmd()
638 msg_scroll = msg_scroll_save;
639 }
640
641 if (selected > 0 && selected <= sug.su_ga.ga_len && u_save_cursor() == OK)
642 {
643 // Save the from and to text for :spellrepall.
644 stp = &SUG(sug.su_ga, selected - 1);
645 if (sug.su_badlen > stp->st_orglen)
646 {
647 // Replacing less than "su_badlen", append the remainder to
648 // repl_to.
649 repl_from = vim_strnsave(sug.su_badptr, sug.su_badlen);
650 vim_snprintf((char *)IObuff, IOSIZE, "%s%.*s", stp->st_word,
651 sug.su_badlen - stp->st_orglen,
652 sug.su_badptr + stp->st_orglen);
653 repl_to = vim_strsave(IObuff);
654 }
655 else
656 {
657 // Replacing su_badlen or more, use the whole word.
658 repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen);
659 repl_to = vim_strsave(stp->st_word);
660 }
661
662 // Replace the word.
663 p = alloc(STRLEN(line) - stp->st_orglen + stp->st_wordlen + 1);
664 if (p != NULL)
665 {
666 c = (int)(sug.su_badptr - line);
667 mch_memmove(p, line, c);
668 STRCPY(p + c, stp->st_word);
669 STRCAT(p, sug.su_badptr + stp->st_orglen);
670 ml_replace(curwin->w_cursor.lnum, p, FALSE);
671 curwin->w_cursor.col = c;
672
673 // For redo we use a change-word command.
674 ResetRedobuff();
675 AppendToRedobuff((char_u *)"ciw");
676 AppendToRedobuffLit(p + c,
677 stp->st_wordlen + sug.su_badlen - stp->st_orglen);
678 AppendCharToRedobuff(ESC);
679
680 // After this "p" may be invalid.
681 changed_bytes(curwin->w_cursor.lnum, c);
682 }
683 }
684 else
685 curwin->w_cursor = prev_cursor;
686
687 spell_find_cleanup(&sug);
688 skip:
689 vim_free(line);
690 }
691
692 /*
693 * Find spell suggestions for "word". Return them in the growarray "*gap" as
694 * a list of allocated strings.
695 */
696 void
697 spell_suggest_list(
698 garray_T *gap,
699 char_u *word,
700 int maxcount, // maximum nr of suggestions
701 int need_cap, // 'spellcapcheck' matched
702 int interactive)
703 {
704 suginfo_T sug;
705 int i;
706 suggest_T *stp;
707 char_u *wcopy;
708
709 spell_find_suggest(word, 0, &sug, maxcount, FALSE, need_cap, interactive);
710
711 // Make room in "gap".
712 ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1);
713 if (ga_grow(gap, sug.su_ga.ga_len) == OK)
714 {
715 for (i = 0; i < sug.su_ga.ga_len; ++i)
716 {
717 stp = &SUG(sug.su_ga, i);
718
719 // The suggested word may replace only part of "word", add the not
720 // replaced part.
721 wcopy = alloc(stp->st_wordlen
722 + (unsigned)STRLEN(sug.su_badptr + stp->st_orglen) + 1);
723 if (wcopy == NULL)
724 break;
725 STRCPY(wcopy, stp->st_word);
726 STRCPY(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen);
727 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy;
728 }
729 }
730
731 spell_find_cleanup(&sug);
732 }
733
734 /*
735 * Find spell suggestions for the word at the start of "badptr".
736 * Return the suggestions in "su->su_ga".
737 * The maximum number of suggestions is "maxcount".
738 * Note: does use info for the current window.
739 * This is based on the mechanisms of Aspell, but completely reimplemented.
740 */
741 static void
742 spell_find_suggest(
743 char_u *badptr,
744 int badlen, // length of bad word or 0 if unknown
745 suginfo_T *su,
746 int maxcount,
747 int banbadword, // don't include badword in suggestions
748 int need_cap, // word should start with capital
749 int interactive)
750 {
751 hlf_T attr = HLF_COUNT;
752 char_u buf[MAXPATHL];
753 char_u *p;
754 int do_combine = FALSE;
755 char_u *sps_copy;
756 #ifdef FEAT_EVAL
757 static int expr_busy = FALSE;
758 #endif
759 int c;
760 int i;
761 langp_T *lp;
762
763 // Set the info in "*su".
764 vim_memset(su, 0, sizeof(suginfo_T));
765 ga_init2(&su->su_ga, (int)sizeof(suggest_T), 10);
766 ga_init2(&su->su_sga, (int)sizeof(suggest_T), 10);
767 if (*badptr == NUL)
768 return;
769 hash_init(&su->su_banned);
770
771 su->su_badptr = badptr;
772 if (badlen != 0)
773 su->su_badlen = badlen;
774 else
775 su->su_badlen = spell_check(curwin, su->su_badptr, &attr, NULL, FALSE);
776 su->su_maxcount = maxcount;
777 su->su_maxscore = SCORE_MAXINIT;
778
779 if (su->su_badlen >= MAXWLEN)
780 su->su_badlen = MAXWLEN - 1; // just in case
781 vim_strncpy(su->su_badword, su->su_badptr, su->su_badlen);
782 (void)spell_casefold(su->su_badptr, su->su_badlen,
783 su->su_fbadword, MAXWLEN);
784 // TODO: make this work if the case-folded text is longer than the original
785 // text. Currently an illegal byte causes wrong pointer computations.
786 su->su_fbadword[su->su_badlen] = NUL;
787
788 // get caps flags for bad word
789 su->su_badflags = badword_captype(su->su_badptr,
790 su->su_badptr + su->su_badlen);
791 if (need_cap)
792 su->su_badflags |= WF_ONECAP;
793
794 // Find the default language for sound folding. We simply use the first
795 // one in 'spelllang' that supports sound folding. That's good for when
796 // using multiple files for one language, it's not that bad when mixing
797 // languages (e.g., "pl,en").
798 for (i = 0; i < curbuf->b_s.b_langp.ga_len; ++i)
799 {
800 lp = LANGP_ENTRY(curbuf->b_s.b_langp, i);
801 if (lp->lp_sallang != NULL)
802 {
803 su->su_sallang = lp->lp_sallang;
804 break;
805 }
806 }
807
808 // Soundfold the bad word with the default sound folding, so that we don't
809 // have to do this many times.
810 if (su->su_sallang != NULL)
811 spell_soundfold(su->su_sallang, su->su_fbadword, TRUE,
812 su->su_sal_badword);
813
814 // If the word is not capitalised and spell_check() doesn't consider the
815 // word to be bad then it might need to be capitalised. Add a suggestion
816 // for that.
817 c = PTR2CHAR(su->su_badptr);
818 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT)
819 {
820 make_case_word(su->su_badword, buf, WF_ONECAP);
821 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE,
822 0, TRUE, su->su_sallang, FALSE);
823 }
824
825 // Ban the bad word itself. It may appear in another region.
826 if (banbadword)
827 add_banned(su, su->su_badword);
828
829 // Make a copy of 'spellsuggest', because the expression may change it.
830 sps_copy = vim_strsave(p_sps);
831 if (sps_copy == NULL)
832 return;
833
834 // Loop over the items in 'spellsuggest'.
835 for (p = sps_copy; *p != NUL; )
836 {
837 copy_option_part(&p, buf, MAXPATHL, ",");
838
839 if (STRNCMP(buf, "expr:", 5) == 0)
840 {
841 #ifdef FEAT_EVAL
842 // Evaluate an expression. Skip this when called recursively,
843 // when using spellsuggest() in the expression.
844 if (!expr_busy)
845 {
846 expr_busy = TRUE;
847 spell_suggest_expr(su, buf + 5);
848 expr_busy = FALSE;
849 }
850 #endif
851 }
852 else if (STRNCMP(buf, "file:", 5) == 0)
853 // Use list of suggestions in a file.
854 spell_suggest_file(su, buf + 5);
855 else
856 {
857 // Use internal method.
858 spell_suggest_intern(su, interactive);
859 if (sps_flags & SPS_DOUBLE)
860 do_combine = TRUE;
861 }
862 }
863
864 vim_free(sps_copy);
865
866 if (do_combine)
867 // Combine the two list of suggestions. This must be done last,
868 // because sorting changes the order again.
869 score_combine(su);
870 }
871
872 #ifdef FEAT_EVAL
873 /*
874 * Find suggestions by evaluating expression "expr".
875 */
876 static void
877 spell_suggest_expr(suginfo_T *su, char_u *expr)
878 {
879 list_T *list;
880 listitem_T *li;
881 int score;
882 char_u *p;
883
884 // The work is split up in a few parts to avoid having to export
885 // suginfo_T.
886 // First evaluate the expression and get the resulting list.
887 list = eval_spell_expr(su->su_badword, expr);
888 if (list != NULL)
889 {
890 // Loop over the items in the list.
891 for (li = list->lv_first; li != NULL; li = li->li_next)
892 if (li->li_tv.v_type == VAR_LIST)
893 {
894 // Get the word and the score from the items.
895 score = get_spellword(li->li_tv.vval.v_list, &p);
896 if (score >= 0 && score <= su->su_maxscore)
897 add_suggestion(su, &su->su_ga, p, su->su_badlen,
898 score, 0, TRUE, su->su_sallang, FALSE);
899 }
900 list_unref(list);
901 }
902
903 // Remove bogus suggestions, sort and truncate at "maxcount".
904 check_suggestions(su, &su->su_ga);
905 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
906 }
907 #endif
908
909 /*
910 * Find suggestions in file "fname". Used for "file:" in 'spellsuggest'.
911 */
912 static void
913 spell_suggest_file(suginfo_T *su, char_u *fname)
914 {
915 FILE *fd;
916 char_u line[MAXWLEN * 2];
917 char_u *p;
918 int len;
919 char_u cword[MAXWLEN];
920
921 // Open the file.
922 fd = mch_fopen((char *)fname, "r");
923 if (fd == NULL)
924 {
925 semsg(_(e_notopen), fname);
926 return;
927 }
928
929 // Read it line by line.
930 while (!vim_fgets(line, MAXWLEN * 2, fd) && !got_int)
931 {
932 line_breakcheck();
933
934 p = vim_strchr(line, '/');
935 if (p == NULL)
936 continue; // No Tab found, just skip the line.
937 *p++ = NUL;
938 if (STRICMP(su->su_badword, line) == 0)
939 {
940 // Match! Isolate the good word, until CR or NL.
941 for (len = 0; p[len] >= ' '; ++len)
942 ;
943 p[len] = NUL;
944
945 // If the suggestion doesn't have specific case duplicate the case
946 // of the bad word.
947 if (captype(p, NULL) == 0)
948 {
949 make_case_word(p, cword, su->su_badflags);
950 p = cword;
951 }
952
953 add_suggestion(su, &su->su_ga, p, su->su_badlen,
954 SCORE_FILE, 0, TRUE, su->su_sallang, FALSE);
955 }
956 }
957
958 fclose(fd);
959
960 // Remove bogus suggestions, sort and truncate at "maxcount".
961 check_suggestions(su, &su->su_ga);
962 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
963 }
964
965 /*
966 * Find suggestions for the internal method indicated by "sps_flags".
967 */
968 static void
969 spell_suggest_intern(suginfo_T *su, int interactive)
970 {
971 // Load the .sug file(s) that are available and not done yet.
972 suggest_load_files();
973
974 // 1. Try special cases, such as repeating a word: "the the" -> "the".
975 //
976 // Set a maximum score to limit the combination of operations that is
977 // tried.
978 suggest_try_special(su);
979
980 // 2. Try inserting/deleting/swapping/changing a letter, use REP entries
981 // from the .aff file and inserting a space (split the word).
982 suggest_try_change(su);
983
984 // For the resulting top-scorers compute the sound-a-like score.
985 if (sps_flags & SPS_DOUBLE)
986 score_comp_sal(su);
987
988 // 3. Try finding sound-a-like words.
989 if ((sps_flags & SPS_FAST) == 0)
990 {
991 if (sps_flags & SPS_BEST)
992 // Adjust the word score for the suggestions found so far for how
993 // they sounds like.
994 rescore_suggestions(su);
995
996 // While going through the soundfold tree "su_maxscore" is the score
997 // for the soundfold word, limits the changes that are being tried,
998 // and "su_sfmaxscore" the rescored score, which is set by
999 // cleanup_suggestions().
1000 // First find words with a small edit distance, because this is much
1001 // faster and often already finds the top-N suggestions. If we didn't
1002 // find many suggestions try again with a higher edit distance.
1003 // "sl_sounddone" is used to avoid doing the same word twice.
1004 suggest_try_soundalike_prep();
1005 su->su_maxscore = SCORE_SFMAX1;
1006 su->su_sfmaxscore = SCORE_MAXINIT * 3;
1007 suggest_try_soundalike(su);
1008 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su))
1009 {
1010 // We didn't find enough matches, try again, allowing more
1011 // changes to the soundfold word.
1012 su->su_maxscore = SCORE_SFMAX2;
1013 suggest_try_soundalike(su);
1014 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su))
1015 {
1016 // Still didn't find enough matches, try again, allowing even
1017 // more changes to the soundfold word.
1018 su->su_maxscore = SCORE_SFMAX3;
1019 suggest_try_soundalike(su);
1020 }
1021 }
1022 su->su_maxscore = su->su_sfmaxscore;
1023 suggest_try_soundalike_finish();
1024 }
1025
1026 // When CTRL-C was hit while searching do show the results. Only clear
1027 // got_int when using a command, not for spellsuggest().
1028 ui_breakcheck();
1029 if (interactive && got_int)
1030 {
1031 (void)vgetc();
1032 got_int = FALSE;
1033 }
1034
1035 if ((sps_flags & SPS_DOUBLE) == 0 && su->su_ga.ga_len != 0)
1036 {
1037 if (sps_flags & SPS_BEST)
1038 // Adjust the word score for how it sounds like.
1039 rescore_suggestions(su);
1040
1041 // Remove bogus suggestions, sort and truncate at "maxcount".
1042 check_suggestions(su, &su->su_ga);
1043 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
1044 }
1045 }
1046
1047 /*
1048 * Free the info put in "*su" by spell_find_suggest().
1049 */
1050 static void
1051 spell_find_cleanup(suginfo_T *su)
1052 {
1053 int i;
1054
1055 // Free the suggestions.
1056 for (i = 0; i < su->su_ga.ga_len; ++i)
1057 vim_free(SUG(su->su_ga, i).st_word);
1058 ga_clear(&su->su_ga);
1059 for (i = 0; i < su->su_sga.ga_len; ++i)
1060 vim_free(SUG(su->su_sga, i).st_word);
1061 ga_clear(&su->su_sga);
1062
1063 // Free the banned words.
1064 hash_clear_all(&su->su_banned, 0);
1065 }
1066
1067 /*
1068 * Try finding suggestions by recognizing specific situations.
1069 */
1070 static void
1071 suggest_try_special(suginfo_T *su)
1072 {
1073 char_u *p;
1074 size_t len;
1075 int c;
1076 char_u word[MAXWLEN];
1077
1078 // Recognize a word that is repeated: "the the".
1079 p = skiptowhite(su->su_fbadword);
1080 len = p - su->su_fbadword;
1081 p = skipwhite(p);
1082 if (STRLEN(p) == len && STRNCMP(su->su_fbadword, p, len) == 0)
1083 {
1084 // Include badflags: if the badword is onecap or allcap
1085 // use that for the goodword too: "The the" -> "The".
1086 c = su->su_fbadword[len];
1087 su->su_fbadword[len] = NUL;
1088 make_case_word(su->su_fbadword, word, su->su_badflags);
1089 su->su_fbadword[len] = c;
1090
1091 // Give a soundalike score of 0, compute the score as if deleting one
1092 // character.
1093 add_suggestion(su, &su->su_ga, word, su->su_badlen,
1094 RESCORE(SCORE_REP, 0), 0, TRUE, su->su_sallang, FALSE);
1095 }
1096 }
1097
1098 /*
1099 * Change the 0 to 1 to measure how much time is spent in each state.
1100 * Output is dumped in "suggestprof".
1101 */
1102 #if 0
1103 # define SUGGEST_PROFILE
1104 proftime_T current;
1105 proftime_T total;
1106 proftime_T times[STATE_FINAL + 1];
1107 long counts[STATE_FINAL + 1];
1108
1109 static void
1110 prof_init(void)
1111 {
1112 for (int i = 0; i <= STATE_FINAL; ++i)
1113 {
1114 profile_zero(&times[i]);
1115 counts[i] = 0;
1116 }
1117 profile_start(&current);
1118 profile_start(&total);
1119 }
1120
1121 // call before changing state
1122 static void
1123 prof_store(state_T state)
1124 {
1125 profile_end(&current);
1126 profile_add(&times[state], &current);
1127 ++counts[state];
1128 profile_start(&current);
1129 }
1130 # define PROF_STORE(state) prof_store(state);
1131
1132 static void
1133 prof_report(char *name)
1134 {
1135 FILE *fd = fopen("suggestprof", "a");
1136
1137 profile_end(&total);
1138 fprintf(fd, "-----------------------\n");
1139 fprintf(fd, "%s: %s\n", name, profile_msg(&total));
1140 for (int i = 0; i <= STATE_FINAL; ++i)
1141 fprintf(fd, "%d: %s (%ld)\n", i, profile_msg(&times[i]), counts[i]);
1142 fclose(fd);
1143 }
1144 #else
1145 # define PROF_STORE(state)
1146 #endif
1147
1148 /*
1149 * Try finding suggestions by adding/removing/swapping letters.
1150 */
1151 static void
1152 suggest_try_change(suginfo_T *su)
1153 {
1154 char_u fword[MAXWLEN]; // copy of the bad word, case-folded
1155 int n;
1156 char_u *p;
1157 int lpi;
1158 langp_T *lp;
1159
1160 // We make a copy of the case-folded bad word, so that we can modify it
1161 // to find matches (esp. REP items). Append some more text, changing
1162 // chars after the bad word may help.
1163 STRCPY(fword, su->su_fbadword);
1164 n = (int)STRLEN(fword);
1165 p = su->su_badptr + su->su_badlen;
1166 (void)spell_casefold(p, (int)STRLEN(p), fword + n, MAXWLEN - n);
1167
1168 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
1169 {
1170 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
1171
1172 // If reloading a spell file fails it's still in the list but
1173 // everything has been cleared.
1174 if (lp->lp_slang->sl_fbyts == NULL)
1175 continue;
1176
1177 // Try it for this language. Will add possible suggestions.
1178 #ifdef SUGGEST_PROFILE
1179 prof_init();
1180 #endif
1181 suggest_trie_walk(su, lp, fword, FALSE);
1182 #ifdef SUGGEST_PROFILE
1183 prof_report("try_change");
1184 #endif
1185 }
1186 }
1187
1188 // Check the maximum score, if we go over it we won't try this change.
1189 #define TRY_DEEPER(su, stack, depth, add) \
1190 (stack[depth].ts_score + (add) < su->su_maxscore)
1191
1192 /*
1193 * Try finding suggestions by adding/removing/swapping letters.
1194 *
1195 * This uses a state machine. At each node in the tree we try various
1196 * operations. When trying if an operation works "depth" is increased and the
1197 * stack[] is used to store info. This allows combinations, thus insert one
1198 * character, replace one and delete another. The number of changes is
1199 * limited by su->su_maxscore.
1200 *
1201 * After implementing this I noticed an article by Kemal Oflazer that
1202 * describes something similar: "Error-tolerant Finite State Recognition with
1203 * Applications to Morphological Analysis and Spelling Correction" (1996).
1204 * The implementation in the article is simplified and requires a stack of
1205 * unknown depth. The implementation here only needs a stack depth equal to
1206 * the length of the word.
1207 *
1208 * This is also used for the sound-folded word, "soundfold" is TRUE then.
1209 * The mechanism is the same, but we find a match with a sound-folded word
1210 * that comes from one or more original words. Each of these words may be
1211 * added, this is done by add_sound_suggest().
1212 * Don't use:
1213 * the prefix tree or the keep-case tree
1214 * "su->su_badlen"
1215 * anything to do with upper and lower case
1216 * anything to do with word or non-word characters ("spell_iswordp()")
1217 * banned words
1218 * word flags (rare, region, compounding)
1219 * word splitting for now
1220 * "similar_chars()"
1221 * use "slang->sl_repsal" instead of "lp->lp_replang->sl_rep"
1222 */
1223 static void
1224 suggest_trie_walk(
1225 suginfo_T *su,
1226 langp_T *lp,
1227 char_u *fword,
1228 int soundfold)
1229 {
1230 char_u tword[MAXWLEN]; // good word collected so far
1231 trystate_T stack[MAXWLEN];
1232 char_u preword[MAXWLEN * 3]; // word found with proper case;
1233 // concatenation of prefix compound
1234 // words and split word. NUL terminated
1235 // when going deeper but not when coming
1236 // back.
1237 char_u compflags[MAXWLEN]; // compound flags, one for each word
1238 trystate_T *sp;
1239 int newscore;
1240 int score;
1241 char_u *byts, *fbyts, *pbyts;
1242 idx_T *idxs, *fidxs, *pidxs;
1243 int depth;
1244 int c, c2, c3;
1245 int n = 0;
1246 int flags;
1247 garray_T *gap;
1248 idx_T arridx;
1249 int len;
1250 char_u *p;
1251 fromto_T *ftp;
1252 int fl = 0, tl;
1253 int repextra = 0; // extra bytes in fword[] from REP item
1254 slang_T *slang = lp->lp_slang;
1255 int fword_ends;
1256 int goodword_ends;
1257 #ifdef DEBUG_TRIEWALK
1258 // Stores the name of the change made at each level.
1259 char_u changename[MAXWLEN][80];
1260 #endif
1261 int breakcheckcount = 1000;
1262 int compound_ok;
1263
1264 // Go through the whole case-fold tree, try changes at each node.
1265 // "tword[]" contains the word collected from nodes in the tree.
1266 // "fword[]" the word we are trying to match with (initially the bad
1267 // word).
1268 depth = 0;
1269 sp = &stack[0];
1270 vim_memset(sp, 0, sizeof(trystate_T));
1271 sp->ts_curi = 1;
1272
1273 if (soundfold)
1274 {
1275 // Going through the soundfold tree.
1276 byts = fbyts = slang->sl_sbyts;
1277 idxs = fidxs = slang->sl_sidxs;
1278 pbyts = NULL;
1279 pidxs = NULL;
1280 sp->ts_prefixdepth = PFD_NOPREFIX;
1281 sp->ts_state = STATE_START;
1282 }
1283 else
1284 {
1285 // When there are postponed prefixes we need to use these first. At
1286 // the end of the prefix we continue in the case-fold tree.
1287 fbyts = slang->sl_fbyts;
1288 fidxs = slang->sl_fidxs;
1289 pbyts = slang->sl_pbyts;
1290 pidxs = slang->sl_pidxs;
1291 if (pbyts != NULL)
1292 {
1293 byts = pbyts;
1294 idxs = pidxs;
1295 sp->ts_prefixdepth = PFD_PREFIXTREE;
1296 sp->ts_state = STATE_NOPREFIX; // try without prefix first
1297 }
1298 else
1299 {
1300 byts = fbyts;
1301 idxs = fidxs;
1302 sp->ts_prefixdepth = PFD_NOPREFIX;
1303 sp->ts_state = STATE_START;
1304 }
1305 }
1306
1307 // Loop to find all suggestions. At each round we either:
1308 // - For the current state try one operation, advance "ts_curi",
1309 // increase "depth".
1310 // - When a state is done go to the next, set "ts_state".
1311 // - When all states are tried decrease "depth".
1312 while (depth >= 0 && !got_int)
1313 {
1314 sp = &stack[depth];
1315 switch (sp->ts_state)
1316 {
1317 case STATE_START:
1318 case STATE_NOPREFIX:
1319 // Start of node: Deal with NUL bytes, which means
1320 // tword[] may end here.
1321 arridx = sp->ts_arridx; // current node in the tree
1322 len = byts[arridx]; // bytes in this node
1323 arridx += sp->ts_curi; // index of current byte
1324
1325 if (sp->ts_prefixdepth == PFD_PREFIXTREE)
1326 {
1327 // Skip over the NUL bytes, we use them later.
1328 for (n = 0; n < len && byts[arridx + n] == 0; ++n)
1329 ;
1330 sp->ts_curi += n;
1331
1332 // Always past NUL bytes now.
1333 n = (int)sp->ts_state;
1334 PROF_STORE(sp->ts_state)
1335 sp->ts_state = STATE_ENDNUL;
1336 sp->ts_save_badflags = su->su_badflags;
1337
1338 // At end of a prefix or at start of prefixtree: check for
1339 // following word.
1340 if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX)
1341 {
1342 // Set su->su_badflags to the caps type at this position.
1343 // Use the caps type until here for the prefix itself.
1344 if (has_mbyte)
1345 n = nofold_len(fword, sp->ts_fidx, su->su_badptr);
1346 else
1347 n = sp->ts_fidx;
1348 flags = badword_captype(su->su_badptr, su->su_badptr + n);
1349 su->su_badflags = badword_captype(su->su_badptr + n,
1350 su->su_badptr + su->su_badlen);
1351 #ifdef DEBUG_TRIEWALK
1352 sprintf(changename[depth], "prefix");
1353 #endif
1354 go_deeper(stack, depth, 0);
1355 ++depth;
1356 sp = &stack[depth];
1357 sp->ts_prefixdepth = depth - 1;
1358 byts = fbyts;
1359 idxs = fidxs;
1360 sp->ts_arridx = 0;
1361
1362 // Move the prefix to preword[] with the right case
1363 // and make find_keepcap_word() works.
1364 tword[sp->ts_twordlen] = NUL;
1365 make_case_word(tword + sp->ts_splitoff,
1366 preword + sp->ts_prewordlen, flags);
1367 sp->ts_prewordlen = (char_u)STRLEN(preword);
1368 sp->ts_splitoff = sp->ts_twordlen;
1369 }
1370 break;
1371 }
1372
1373 if (sp->ts_curi > len || byts[arridx] != 0)
1374 {
1375 // Past bytes in node and/or past NUL bytes.
1376 PROF_STORE(sp->ts_state)
1377 sp->ts_state = STATE_ENDNUL;
1378 sp->ts_save_badflags = su->su_badflags;
1379 break;
1380 }
1381
1382 // End of word in tree.
1383 ++sp->ts_curi; // eat one NUL byte
1384
1385 flags = (int)idxs[arridx];
1386
1387 // Skip words with the NOSUGGEST flag.
1388 if (flags & WF_NOSUGGEST)
1389 break;
1390
1391 fword_ends = (fword[sp->ts_fidx] == NUL
1392 || (soundfold
1393 ? VIM_ISWHITE(fword[sp->ts_fidx])
1394 : !spell_iswordp(fword + sp->ts_fidx, curwin)));
1395 tword[sp->ts_twordlen] = NUL;
1396
1397 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL
1398 && (sp->ts_flags & TSF_PREFIXOK) == 0)
1399 {
1400 // There was a prefix before the word. Check that the prefix
1401 // can be used with this word.
1402 // Count the length of the NULs in the prefix. If there are
1403 // none this must be the first try without a prefix.
1404 n = stack[sp->ts_prefixdepth].ts_arridx;
1405 len = pbyts[n++];
1406 for (c = 0; c < len && pbyts[n + c] == 0; ++c)
1407 ;
1408 if (c > 0)
1409 {
1410 c = valid_word_prefix(c, n, flags,
1411 tword + sp->ts_splitoff, slang, FALSE);
1412 if (c == 0)
1413 break;
1414
1415 // Use the WF_RARE flag for a rare prefix.
1416 if (c & WF_RAREPFX)
1417 flags |= WF_RARE;
1418
1419 // Tricky: when checking for both prefix and compounding
1420 // we run into the prefix flag first.
1421 // Remember that it's OK, so that we accept the prefix
1422 // when arriving at a compound flag.
1423 sp->ts_flags |= TSF_PREFIXOK;
1424 }
1425 }
1426
1427 // Check NEEDCOMPOUND: can't use word without compounding. Do try
1428 // appending another compound word below.
1429 if (sp->ts_complen == sp->ts_compsplit && fword_ends
1430 && (flags & WF_NEEDCOMP))
1431 goodword_ends = FALSE;
1432 else
1433 goodword_ends = TRUE;
1434
1435 p = NULL;
1436 compound_ok = TRUE;
1437 if (sp->ts_complen > sp->ts_compsplit)
1438 {
1439 if (slang->sl_nobreak)
1440 {
1441 // There was a word before this word. When there was no
1442 // change in this word (it was correct) add the first word
1443 // as a suggestion. If this word was corrected too, we
1444 // need to check if a correct word follows.
1445 if (sp->ts_fidx - sp->ts_splitfidx
1446 == sp->ts_twordlen - sp->ts_splitoff
1447 && STRNCMP(fword + sp->ts_splitfidx,
1448 tword + sp->ts_splitoff,
1449 sp->ts_fidx - sp->ts_splitfidx) == 0)
1450 {
1451 preword[sp->ts_prewordlen] = NUL;
1452 newscore = score_wordcount_adj(slang, sp->ts_score,
1453 preword + sp->ts_prewordlen,
1454 sp->ts_prewordlen > 0);
1455 // Add the suggestion if the score isn't too bad.
1456 if (newscore <= su->su_maxscore)
1457 add_suggestion(su, &su->su_ga, preword,
1458 sp->ts_splitfidx - repextra,
1459 newscore, 0, FALSE,
1460 lp->lp_sallang, FALSE);
1461 break;
1462 }
1463 }
1464 else
1465 {
1466 // There was a compound word before this word. If this
1467 // word does not support compounding then give up
1468 // (splitting is tried for the word without compound
1469 // flag).
1470 if (((unsigned)flags >> 24) == 0
1471 || sp->ts_twordlen - sp->ts_splitoff
1472 < slang->sl_compminlen)
1473 break;
1474 // For multi-byte chars check character length against
1475 // COMPOUNDMIN.
1476 if (has_mbyte
1477 && slang->sl_compminlen > 0
1478 && mb_charlen(tword + sp->ts_splitoff)
1479 < slang->sl_compminlen)
1480 break;
1481
1482 compflags[sp->ts_complen] = ((unsigned)flags >> 24);
1483 compflags[sp->ts_complen + 1] = NUL;
1484 vim_strncpy(preword + sp->ts_prewordlen,
1485 tword + sp->ts_splitoff,
1486 sp->ts_twordlen - sp->ts_splitoff);
1487
1488 // Verify CHECKCOMPOUNDPATTERN rules.
1489 if (match_checkcompoundpattern(preword, sp->ts_prewordlen,
1490 &slang->sl_comppat))
1491 compound_ok = FALSE;
1492
1493 if (compound_ok)
1494 {
1495 p = preword;
1496 while (*skiptowhite(p) != NUL)
1497 p = skipwhite(skiptowhite(p));
1498 if (fword_ends && !can_compound(slang, p,
1499 compflags + sp->ts_compsplit))
1500 // Compound is not allowed. But it may still be
1501 // possible if we add another (short) word.
1502 compound_ok = FALSE;
1503 }
1504
1505 // Get pointer to last char of previous word.
1506 p = preword + sp->ts_prewordlen;
1507 MB_PTR_BACK(preword, p);
1508 }
1509 }
1510
1511 // Form the word with proper case in preword.
1512 // If there is a word from a previous split, append.
1513 // For the soundfold tree don't change the case, simply append.
1514 if (soundfold)
1515 STRCPY(preword + sp->ts_prewordlen, tword + sp->ts_splitoff);
1516 else if (flags & WF_KEEPCAP)
1517 // Must find the word in the keep-case tree.
1518 find_keepcap_word(slang, tword + sp->ts_splitoff,
1519 preword + sp->ts_prewordlen);
1520 else
1521 {
1522 // Include badflags: If the badword is onecap or allcap
1523 // use that for the goodword too. But if the badword is
1524 // allcap and it's only one char long use onecap.
1525 c = su->su_badflags;
1526 if ((c & WF_ALLCAP)
1527 && su->su_badlen == (*mb_ptr2len)(su->su_badptr))
1528 c = WF_ONECAP;
1529 c |= flags;
1530
1531 // When appending a compound word after a word character don't
1532 // use Onecap.
1533 if (p != NULL && spell_iswordp_nmw(p, curwin))
1534 c &= ~WF_ONECAP;
1535 make_case_word(tword + sp->ts_splitoff,
1536 preword + sp->ts_prewordlen, c);
1537 }
1538
1539 if (!soundfold)
1540 {
1541 // Don't use a banned word. It may appear again as a good
1542 // word, thus remember it.
1543 if (flags & WF_BANNED)
1544 {
1545 add_banned(su, preword + sp->ts_prewordlen);
1546 break;
1547 }
1548 if ((sp->ts_complen == sp->ts_compsplit
1549 && WAS_BANNED(su, preword + sp->ts_prewordlen))
1550 || WAS_BANNED(su, preword))
1551 {
1552 if (slang->sl_compprog == NULL)
1553 break;
1554 // the word so far was banned but we may try compounding
1555 goodword_ends = FALSE;
1556 }
1557 }
1558
1559 newscore = 0;
1560 if (!soundfold) // soundfold words don't have flags
1561 {
1562 if ((flags & WF_REGION)
1563 && (((unsigned)flags >> 16) & lp->lp_region) == 0)
1564 newscore += SCORE_REGION;
1565 if (flags & WF_RARE)
1566 newscore += SCORE_RARE;
1567
1568 if (!spell_valid_case(su->su_badflags,
1569 captype(preword + sp->ts_prewordlen, NULL)))
1570 newscore += SCORE_ICASE;
1571 }
1572
1573 // TODO: how about splitting in the soundfold tree?
1574 if (fword_ends
1575 && goodword_ends
1576 && sp->ts_fidx >= sp->ts_fidxtry
1577 && compound_ok)
1578 {
1579 // The badword also ends: add suggestions.
1580 #ifdef DEBUG_TRIEWALK
1581 if (soundfold && STRCMP(preword, "smwrd") == 0)
1582 {
1583 int j;
1584
1585 // print the stack of changes that brought us here
1586 smsg("------ %s -------", fword);
1587 for (j = 0; j < depth; ++j)
1588 smsg("%s", changename[j]);
1589 }
1590 #endif
1591 if (soundfold)
1592 {
1593 // For soundfolded words we need to find the original
1594 // words, the edit distance and then add them.
1595 add_sound_suggest(su, preword, sp->ts_score, lp);
1596 }
1597 else if (sp->ts_fidx > 0)
1598 {
1599 // Give a penalty when changing non-word char to word
1600 // char, e.g., "thes," -> "these".
1601 p = fword + sp->ts_fidx;
1602 MB_PTR_BACK(fword, p);
1603 if (!spell_iswordp(p, curwin))
1604 {
1605 p = preword + STRLEN(preword);
1606 MB_PTR_BACK(preword, p);
1607 if (spell_iswordp(p, curwin))
1608 newscore += SCORE_NONWORD;
1609 }
1610
1611 // Give a bonus to words seen before.
1612 score = score_wordcount_adj(slang,
1613 sp->ts_score + newscore,
1614 preword + sp->ts_prewordlen,
1615 sp->ts_prewordlen > 0);
1616
1617 // Add the suggestion if the score isn't too bad.
1618 if (score <= su->su_maxscore)
1619 {
1620 add_suggestion(su, &su->su_ga, preword,
1621 sp->ts_fidx - repextra,
1622 score, 0, FALSE, lp->lp_sallang, FALSE);
1623
1624 if (su->su_badflags & WF_MIXCAP)
1625 {
1626 // We really don't know if the word should be
1627 // upper or lower case, add both.
1628 c = captype(preword, NULL);
1629 if (c == 0 || c == WF_ALLCAP)
1630 {
1631 make_case_word(tword + sp->ts_splitoff,
1632 preword + sp->ts_prewordlen,
1633 c == 0 ? WF_ALLCAP : 0);
1634
1635 add_suggestion(su, &su->su_ga, preword,
1636 sp->ts_fidx - repextra,
1637 score + SCORE_ICASE, 0, FALSE,
1638 lp->lp_sallang, FALSE);
1639 }
1640 }
1641 }
1642 }
1643 }
1644
1645 // Try word split and/or compounding.
1646 if ((sp->ts_fidx >= sp->ts_fidxtry || fword_ends)
1647 // Don't split halfway a character.
1648 && (!has_mbyte || sp->ts_tcharlen == 0))
1649 {
1650 int try_compound;
1651 int try_split;
1652
1653 // If past the end of the bad word don't try a split.
1654 // Otherwise try changing the next word. E.g., find
1655 // suggestions for "the the" where the second "the" is
1656 // different. It's done like a split.
1657 // TODO: word split for soundfold words
1658 try_split = (sp->ts_fidx - repextra < su->su_badlen)
1659 && !soundfold;
1660
1661 // Get here in several situations:
1662 // 1. The word in the tree ends:
1663 // If the word allows compounding try that. Otherwise try
1664 // a split by inserting a space. For both check that a
1665 // valid words starts at fword[sp->ts_fidx].
1666 // For NOBREAK do like compounding to be able to check if
1667 // the next word is valid.
1668 // 2. The badword does end, but it was due to a change (e.g.,
1669 // a swap). No need to split, but do check that the
1670 // following word is valid.
1671 // 3. The badword and the word in the tree end. It may still
1672 // be possible to compound another (short) word.
1673 try_compound = FALSE;
1674 if (!soundfold
1675 && !slang->sl_nocompoundsugs
1676 && slang->sl_compprog != NULL
1677 && ((unsigned)flags >> 24) != 0
1678 && sp->ts_twordlen - sp->ts_splitoff
1679 >= slang->sl_compminlen
1680 && (!has_mbyte
1681 || slang->sl_compminlen == 0
1682 || mb_charlen(tword + sp->ts_splitoff)
1683 >= slang->sl_compminlen)
1684 && (slang->sl_compsylmax < MAXWLEN
1685 || sp->ts_complen + 1 - sp->ts_compsplit
1686 < slang->sl_compmax)
1687 && (can_be_compound(sp, slang,
1688 compflags, ((unsigned)flags >> 24))))
1689
1690 {
1691 try_compound = TRUE;
1692 compflags[sp->ts_complen] = ((unsigned)flags >> 24);
1693 compflags[sp->ts_complen + 1] = NUL;
1694 }
1695
1696 // For NOBREAK we never try splitting, it won't make any word
1697 // valid.
1698 if (slang->sl_nobreak && !slang->sl_nocompoundsugs)
1699 try_compound = TRUE;
1700
1701 // If we could add a compound word, and it's also possible to
1702 // split at this point, do the split first and set
1703 // TSF_DIDSPLIT to avoid doing it again.
1704 else if (!fword_ends
1705 && try_compound
1706 && (sp->ts_flags & TSF_DIDSPLIT) == 0)
1707 {
1708 try_compound = FALSE;
1709 sp->ts_flags |= TSF_DIDSPLIT;
1710 --sp->ts_curi; // do the same NUL again
1711 compflags[sp->ts_complen] = NUL;
1712 }
1713 else
1714 sp->ts_flags &= ~TSF_DIDSPLIT;
1715
1716 if (try_split || try_compound)
1717 {
1718 if (!try_compound && (!fword_ends || !goodword_ends))
1719 {
1720 // If we're going to split need to check that the
1721 // words so far are valid for compounding. If there
1722 // is only one word it must not have the NEEDCOMPOUND
1723 // flag.
1724 if (sp->ts_complen == sp->ts_compsplit
1725 && (flags & WF_NEEDCOMP))
1726 break;
1727 p = preword;
1728 while (*skiptowhite(p) != NUL)
1729 p = skipwhite(skiptowhite(p));
1730 if (sp->ts_complen > sp->ts_compsplit
1731 && !can_compound(slang, p,
1732 compflags + sp->ts_compsplit))
1733 break;
1734
1735 if (slang->sl_nosplitsugs)
1736 newscore += SCORE_SPLIT_NO;
1737 else
1738 newscore += SCORE_SPLIT;
1739
1740 // Give a bonus to words seen before.
1741 newscore = score_wordcount_adj(slang, newscore,
1742 preword + sp->ts_prewordlen, TRUE);
1743 }
1744
1745 if (TRY_DEEPER(su, stack, depth, newscore))
1746 {
1747 go_deeper(stack, depth, newscore);
1748 #ifdef DEBUG_TRIEWALK
1749 if (!try_compound && !fword_ends)
1750 sprintf(changename[depth], "%.*s-%s: split",
1751 sp->ts_twordlen, tword, fword + sp->ts_fidx);
1752 else
1753 sprintf(changename[depth], "%.*s-%s: compound",
1754 sp->ts_twordlen, tword, fword + sp->ts_fidx);
1755 #endif
1756 // Save things to be restored at STATE_SPLITUNDO.
1757 sp->ts_save_badflags = su->su_badflags;
1758 PROF_STORE(sp->ts_state)
1759 sp->ts_state = STATE_SPLITUNDO;
1760
1761 ++depth;
1762 sp = &stack[depth];
1763
1764 // Append a space to preword when splitting.
1765 if (!try_compound && !fword_ends)
1766 STRCAT(preword, " ");
1767 sp->ts_prewordlen = (char_u)STRLEN(preword);
1768 sp->ts_splitoff = sp->ts_twordlen;
1769 sp->ts_splitfidx = sp->ts_fidx;
1770
1771 // If the badword has a non-word character at this
1772 // position skip it. That means replacing the
1773 // non-word character with a space. Always skip a
1774 // character when the word ends. But only when the
1775 // good word can end.
1776 if (((!try_compound && !spell_iswordp_nmw(fword
1777 + sp->ts_fidx,
1778 curwin))
1779 || fword_ends)
1780 && fword[sp->ts_fidx] != NUL
1781 && goodword_ends)
1782 {
1783 int l;
1784
1785 l = MB_PTR2LEN(fword + sp->ts_fidx);
1786 if (fword_ends)
1787 {
1788 // Copy the skipped character to preword.
1789 mch_memmove(preword + sp->ts_prewordlen,
1790 fword + sp->ts_fidx, l);
1791 sp->ts_prewordlen += l;
1792 preword[sp->ts_prewordlen] = NUL;
1793 }
1794 else
1795 sp->ts_score -= SCORE_SPLIT - SCORE_SUBST;
1796 sp->ts_fidx += l;
1797 }
1798
1799 // When compounding include compound flag in
1800 // compflags[] (already set above). When splitting we
1801 // may start compounding over again.
1802 if (try_compound)
1803 ++sp->ts_complen;
1804 else
1805 sp->ts_compsplit = sp->ts_complen;
1806 sp->ts_prefixdepth = PFD_NOPREFIX;
1807
1808 // set su->su_badflags to the caps type at this
1809 // position
1810 if (has_mbyte)
1811 n = nofold_len(fword, sp->ts_fidx, su->su_badptr);
1812 else
1813 n = sp->ts_fidx;
1814 su->su_badflags = badword_captype(su->su_badptr + n,
1815 su->su_badptr + su->su_badlen);
1816
1817 // Restart at top of the tree.
1818 sp->ts_arridx = 0;
1819
1820 // If there are postponed prefixes, try these too.
1821 if (pbyts != NULL)
1822 {
1823 byts = pbyts;
1824 idxs = pidxs;
1825 sp->ts_prefixdepth = PFD_PREFIXTREE;
1826 PROF_STORE(sp->ts_state)
1827 sp->ts_state = STATE_NOPREFIX;
1828 }
1829 }
1830 }
1831 }
1832 break;
1833
1834 case STATE_SPLITUNDO:
1835 // Undo the changes done for word split or compound word.
1836 su->su_badflags = sp->ts_save_badflags;
1837
1838 // Continue looking for NUL bytes.
1839 PROF_STORE(sp->ts_state)
1840 sp->ts_state = STATE_START;
1841
1842 // In case we went into the prefix tree.
1843 byts = fbyts;
1844 idxs = fidxs;
1845 break;
1846
1847 case STATE_ENDNUL:
1848 // Past the NUL bytes in the node.
1849 su->su_badflags = sp->ts_save_badflags;
1850 if (fword[sp->ts_fidx] == NUL && sp->ts_tcharlen == 0)
1851 {
1852 // The badword ends, can't use STATE_PLAIN.
1853 PROF_STORE(sp->ts_state)
1854 sp->ts_state = STATE_DEL;
1855 break;
1856 }
1857 PROF_STORE(sp->ts_state)
1858 sp->ts_state = STATE_PLAIN;
1859 // FALLTHROUGH
1860
1861 case STATE_PLAIN:
1862 // Go over all possible bytes at this node, add each to tword[]
1863 // and use child node. "ts_curi" is the index.
1864 arridx = sp->ts_arridx;
1865 if (sp->ts_curi > byts[arridx])
1866 {
1867 // Done all bytes at this node, do next state. When still at
1868 // already changed bytes skip the other tricks.
1869 PROF_STORE(sp->ts_state)
1870 if (sp->ts_fidx >= sp->ts_fidxtry)
1871 sp->ts_state = STATE_DEL;
1872 else
1873 sp->ts_state = STATE_FINAL;
1874 }
1875 else
1876 {
1877 arridx += sp->ts_curi++;
1878 c = byts[arridx];
1879
1880 // Normal byte, go one level deeper. If it's not equal to the
1881 // byte in the bad word adjust the score. But don't even try
1882 // when the byte was already changed. And don't try when we
1883 // just deleted this byte, accepting it is always cheaper than
1884 // delete + substitute.
1885 if (c == fword[sp->ts_fidx]
1886 || (sp->ts_tcharlen > 0 && sp->ts_isdiff != DIFF_NONE))
1887 newscore = 0;
1888 else
1889 newscore = SCORE_SUBST;
1890 if ((newscore == 0
1891 || (sp->ts_fidx >= sp->ts_fidxtry
1892 && ((sp->ts_flags & TSF_DIDDEL) == 0
1893 || c != fword[sp->ts_delidx])))
1894 && TRY_DEEPER(su, stack, depth, newscore))
1895 {
1896 go_deeper(stack, depth, newscore);
1897 #ifdef DEBUG_TRIEWALK
1898 if (newscore > 0)
1899 sprintf(changename[depth], "%.*s-%s: subst %c to %c",
1900 sp->ts_twordlen, tword, fword + sp->ts_fidx,
1901 fword[sp->ts_fidx], c);
1902 else
1903 sprintf(changename[depth], "%.*s-%s: accept %c",
1904 sp->ts_twordlen, tword, fword + sp->ts_fidx,
1905 fword[sp->ts_fidx]);
1906 #endif
1907 ++depth;
1908 sp = &stack[depth];
1909 ++sp->ts_fidx;
1910 tword[sp->ts_twordlen++] = c;
1911 sp->ts_arridx = idxs[arridx];
1912 if (newscore == SCORE_SUBST)
1913 sp->ts_isdiff = DIFF_YES;
1914 if (has_mbyte)
1915 {
1916 // Multi-byte characters are a bit complicated to
1917 // handle: They differ when any of the bytes differ
1918 // and then their length may also differ.
1919 if (sp->ts_tcharlen == 0)
1920 {
1921 // First byte.
1922 sp->ts_tcharidx = 0;
1923 sp->ts_tcharlen = MB_BYTE2LEN(c);
1924 sp->ts_fcharstart = sp->ts_fidx - 1;
1925 sp->ts_isdiff = (newscore != 0)
1926 ? DIFF_YES : DIFF_NONE;
1927 }
1928 else if (sp->ts_isdiff == DIFF_INSERT)
1929 // When inserting trail bytes don't advance in the
1930 // bad word.
1931 --sp->ts_fidx;
1932 if (++sp->ts_tcharidx == sp->ts_tcharlen)
1933 {
1934 // Last byte of character.
1935 if (sp->ts_isdiff == DIFF_YES)
1936 {
1937 // Correct ts_fidx for the byte length of the
1938 // character (we didn't check that before).
1939 sp->ts_fidx = sp->ts_fcharstart
1940 + MB_PTR2LEN(
1941 fword + sp->ts_fcharstart);
1942 // For changing a composing character adjust
1943 // the score from SCORE_SUBST to
1944 // SCORE_SUBCOMP.
1945 if (enc_utf8
1946 && utf_iscomposing(
1947 utf_ptr2char(tword
1948 + sp->ts_twordlen
1949 - sp->ts_tcharlen))
1950 && utf_iscomposing(
1951 utf_ptr2char(fword
1952 + sp->ts_fcharstart)))
1953 sp->ts_score -=
1954 SCORE_SUBST - SCORE_SUBCOMP;
1955
1956 // For a similar character adjust score from
1957 // SCORE_SUBST to SCORE_SIMILAR.
1958 else if (!soundfold
1959 && slang->sl_has_map
1960 && similar_chars(slang,
1961 mb_ptr2char(tword
1962 + sp->ts_twordlen
1963 - sp->ts_tcharlen),
1964 mb_ptr2char(fword
1965 + sp->ts_fcharstart)))
1966 sp->ts_score -=
1967 SCORE_SUBST - SCORE_SIMILAR;
1968 }
1969 else if (sp->ts_isdiff == DIFF_INSERT
1970 && sp->ts_twordlen > sp->ts_tcharlen)
1971 {
1972 p = tword + sp->ts_twordlen - sp->ts_tcharlen;
1973 c = mb_ptr2char(p);
1974 if (enc_utf8 && utf_iscomposing(c))
1975 {
1976 // Inserting a composing char doesn't
1977 // count that much.
1978 sp->ts_score -= SCORE_INS - SCORE_INSCOMP;
1979 }
1980 else
1981 {
1982 // If the previous character was the same,
1983 // thus doubling a character, give a bonus
1984 // to the score. Also for the soundfold
1985 // tree (might seem illogical but does
1986 // give better scores).
1987 MB_PTR_BACK(tword, p);
1988 if (c == mb_ptr2char(p))
1989 sp->ts_score -= SCORE_INS
1990 - SCORE_INSDUP;
1991 }
1992 }
1993
1994 // Starting a new char, reset the length.
1995 sp->ts_tcharlen = 0;
1996 }
1997 }
1998 else
1999 {
2000 // If we found a similar char adjust the score.
2001 // We do this after calling go_deeper() because
2002 // it's slow.
2003 if (newscore != 0
2004 && !soundfold
2005 && slang->sl_has_map
2006 && similar_chars(slang,
2007 c, fword[sp->ts_fidx - 1]))
2008 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR;
2009 }
2010 }
2011 }
2012 break;
2013
2014 case STATE_DEL:
2015 // When past the first byte of a multi-byte char don't try
2016 // delete/insert/swap a character.
2017 if (has_mbyte && sp->ts_tcharlen > 0)
2018 {
2019 PROF_STORE(sp->ts_state)
2020 sp->ts_state = STATE_FINAL;
2021 break;
2022 }
2023 // Try skipping one character in the bad word (delete it).
2024 PROF_STORE(sp->ts_state)
2025 sp->ts_state = STATE_INS_PREP;
2026 sp->ts_curi = 1;
2027 if (soundfold && sp->ts_fidx == 0 && fword[sp->ts_fidx] == '*')
2028 // Deleting a vowel at the start of a word counts less, see
2029 // soundalike_score().
2030 newscore = 2 * SCORE_DEL / 3;
2031 else
2032 newscore = SCORE_DEL;
2033 if (fword[sp->ts_fidx] != NUL
2034 && TRY_DEEPER(su, stack, depth, newscore))
2035 {
2036 go_deeper(stack, depth, newscore);
2037 #ifdef DEBUG_TRIEWALK
2038 sprintf(changename[depth], "%.*s-%s: delete %c",
2039 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2040 fword[sp->ts_fidx]);
2041 #endif
2042 ++depth;
2043
2044 // Remember what character we deleted, so that we can avoid
2045 // inserting it again.
2046 stack[depth].ts_flags |= TSF_DIDDEL;
2047 stack[depth].ts_delidx = sp->ts_fidx;
2048
2049 // Advance over the character in fword[]. Give a bonus to the
2050 // score if the same character is following "nn" -> "n". It's
2051 // a bit illogical for soundfold tree but it does give better
2052 // results.
2053 if (has_mbyte)
2054 {
2055 c = mb_ptr2char(fword + sp->ts_fidx);
2056 stack[depth].ts_fidx += MB_PTR2LEN(fword + sp->ts_fidx);
2057 if (enc_utf8 && utf_iscomposing(c))
2058 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP;
2059 else if (c == mb_ptr2char(fword + stack[depth].ts_fidx))
2060 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
2061 }
2062 else
2063 {
2064 ++stack[depth].ts_fidx;
2065 if (fword[sp->ts_fidx] == fword[sp->ts_fidx + 1])
2066 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
2067 }
2068 break;
2069 }
2070 // FALLTHROUGH
2071
2072 case STATE_INS_PREP:
2073 if (sp->ts_flags & TSF_DIDDEL)
2074 {
2075 // If we just deleted a byte then inserting won't make sense,
2076 // a substitute is always cheaper.
2077 PROF_STORE(sp->ts_state)
2078 sp->ts_state = STATE_SWAP;
2079 break;
2080 }
2081
2082 // skip over NUL bytes
2083 n = sp->ts_arridx;
2084 for (;;)
2085 {
2086 if (sp->ts_curi > byts[n])
2087 {
2088 // Only NUL bytes at this node, go to next state.
2089 PROF_STORE(sp->ts_state)
2090 sp->ts_state = STATE_SWAP;
2091 break;
2092 }
2093 if (byts[n + sp->ts_curi] != NUL)
2094 {
2095 // Found a byte to insert.
2096 PROF_STORE(sp->ts_state)
2097 sp->ts_state = STATE_INS;
2098 break;
2099 }
2100 ++sp->ts_curi;
2101 }
2102 break;
2103
2104 // FALLTHROUGH
2105
2106 case STATE_INS:
2107 // Insert one byte. Repeat this for each possible byte at this
2108 // node.
2109 n = sp->ts_arridx;
2110 if (sp->ts_curi > byts[n])
2111 {
2112 // Done all bytes at this node, go to next state.
2113 PROF_STORE(sp->ts_state)
2114 sp->ts_state = STATE_SWAP;
2115 break;
2116 }
2117
2118 // Do one more byte at this node, but:
2119 // - Skip NUL bytes.
2120 // - Skip the byte if it's equal to the byte in the word,
2121 // accepting that byte is always better.
2122 n += sp->ts_curi++;
2123 c = byts[n];
2124 if (soundfold && sp->ts_twordlen == 0 && c == '*')
2125 // Inserting a vowel at the start of a word counts less,
2126 // see soundalike_score().
2127 newscore = 2 * SCORE_INS / 3;
2128 else
2129 newscore = SCORE_INS;
2130 if (c != fword[sp->ts_fidx]
2131 && TRY_DEEPER(su, stack, depth, newscore))
2132 {
2133 go_deeper(stack, depth, newscore);
2134 #ifdef DEBUG_TRIEWALK
2135 sprintf(changename[depth], "%.*s-%s: insert %c",
2136 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2137 c);
2138 #endif
2139 ++depth;
2140 sp = &stack[depth];
2141 tword[sp->ts_twordlen++] = c;
2142 sp->ts_arridx = idxs[n];
2143 if (has_mbyte)
2144 {
2145 fl = MB_BYTE2LEN(c);
2146 if (fl > 1)
2147 {
2148 // There are following bytes for the same character.
2149 // We must find all bytes before trying
2150 // delete/insert/swap/etc.
2151 sp->ts_tcharlen = fl;
2152 sp->ts_tcharidx = 1;
2153 sp->ts_isdiff = DIFF_INSERT;
2154 }
2155 }
2156 else
2157 fl = 1;
2158 if (fl == 1)
2159 {
2160 // If the previous character was the same, thus doubling a
2161 // character, give a bonus to the score. Also for
2162 // soundfold words (illogical but does give a better
2163 // score).
2164 if (sp->ts_twordlen >= 2
2165 && tword[sp->ts_twordlen - 2] == c)
2166 sp->ts_score -= SCORE_INS - SCORE_INSDUP;
2167 }
2168 }
2169 break;
2170
2171 case STATE_SWAP:
2172 // Swap two bytes in the bad word: "12" -> "21".
2173 // We change "fword" here, it's changed back afterwards at
2174 // STATE_UNSWAP.
2175 p = fword + sp->ts_fidx;
2176 c = *p;
2177 if (c == NUL)
2178 {
2179 // End of word, can't swap or replace.
2180 PROF_STORE(sp->ts_state)
2181 sp->ts_state = STATE_FINAL;
2182 break;
2183 }
2184
2185 // Don't swap if the first character is not a word character.
2186 // SWAP3 etc. also don't make sense then.
2187 if (!soundfold && !spell_iswordp(p, curwin))
2188 {
2189 PROF_STORE(sp->ts_state)
2190 sp->ts_state = STATE_REP_INI;
2191 break;
2192 }
2193
2194 if (has_mbyte)
2195 {
2196 n = MB_CPTR2LEN(p);
2197 c = mb_ptr2char(p);
2198 if (p[n] == NUL)
2199 c2 = NUL;
2200 else if (!soundfold && !spell_iswordp(p + n, curwin))
2201 c2 = c; // don't swap non-word char
2202 else
2203 c2 = mb_ptr2char(p + n);
2204 }
2205 else
2206 {
2207 if (p[1] == NUL)
2208 c2 = NUL;
2209 else if (!soundfold && !spell_iswordp(p + 1, curwin))
2210 c2 = c; // don't swap non-word char
2211 else
2212 c2 = p[1];
2213 }
2214
2215 // When the second character is NUL we can't swap.
2216 if (c2 == NUL)
2217 {
2218 PROF_STORE(sp->ts_state)
2219 sp->ts_state = STATE_REP_INI;
2220 break;
2221 }
2222
2223 // When characters are identical, swap won't do anything.
2224 // Also get here if the second char is not a word character.
2225 if (c == c2)
2226 {
2227 PROF_STORE(sp->ts_state)
2228 sp->ts_state = STATE_SWAP3;
2229 break;
2230 }
2231 if (c2 != NUL && TRY_DEEPER(su, stack, depth, SCORE_SWAP))
2232 {
2233 go_deeper(stack, depth, SCORE_SWAP);
2234 #ifdef DEBUG_TRIEWALK
2235 sprintf(changename[depth], "%.*s-%s: swap %c and %c",
2236 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2237 c, c2);
2238 #endif
2239 PROF_STORE(sp->ts_state)
2240 sp->ts_state = STATE_UNSWAP;
2241 ++depth;
2242 if (has_mbyte)
2243 {
2244 fl = mb_char2len(c2);
2245 mch_memmove(p, p + n, fl);
2246 mb_char2bytes(c, p + fl);
2247 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
2248 }
2249 else
2250 {
2251 p[0] = c2;
2252 p[1] = c;
2253 stack[depth].ts_fidxtry = sp->ts_fidx + 2;
2254 }
2255 }
2256 else
2257 {
2258 // If this swap doesn't work then SWAP3 won't either.
2259 PROF_STORE(sp->ts_state)
2260 sp->ts_state = STATE_REP_INI;
2261 }
2262 break;
2263
2264 case STATE_UNSWAP:
2265 // Undo the STATE_SWAP swap: "21" -> "12".
2266 p = fword + sp->ts_fidx;
2267 if (has_mbyte)
2268 {
2269 n = MB_PTR2LEN(p);
2270 c = mb_ptr2char(p + n);
2271 mch_memmove(p + MB_PTR2LEN(p + n), p, n);
2272 mb_char2bytes(c, p);
2273 }
2274 else
2275 {
2276 c = *p;
2277 *p = p[1];
2278 p[1] = c;
2279 }
2280 // FALLTHROUGH
2281
2282 case STATE_SWAP3:
2283 // Swap two bytes, skipping one: "123" -> "321". We change
2284 // "fword" here, it's changed back afterwards at STATE_UNSWAP3.
2285 p = fword + sp->ts_fidx;
2286 if (has_mbyte)
2287 {
2288 n = MB_CPTR2LEN(p);
2289 c = mb_ptr2char(p);
2290 fl = MB_CPTR2LEN(p + n);
2291 c2 = mb_ptr2char(p + n);
2292 if (!soundfold && !spell_iswordp(p + n + fl, curwin))
2293 c3 = c; // don't swap non-word char
2294 else
2295 c3 = mb_ptr2char(p + n + fl);
2296 }
2297 else
2298 {
2299 c = *p;
2300 c2 = p[1];
2301 if (!soundfold && !spell_iswordp(p + 2, curwin))
2302 c3 = c; // don't swap non-word char
2303 else
2304 c3 = p[2];
2305 }
2306
2307 // When characters are identical: "121" then SWAP3 result is
2308 // identical, ROT3L result is same as SWAP: "211", ROT3L result is
2309 // same as SWAP on next char: "112". Thus skip all swapping.
2310 // Also skip when c3 is NUL.
2311 // Also get here when the third character is not a word character.
2312 // Second character may any char: "a.b" -> "b.a"
2313 if (c == c3 || c3 == NUL)
2314 {
2315 PROF_STORE(sp->ts_state)
2316 sp->ts_state = STATE_REP_INI;
2317 break;
2318 }
2319 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
2320 {
2321 go_deeper(stack, depth, SCORE_SWAP3);
2322 #ifdef DEBUG_TRIEWALK
2323 sprintf(changename[depth], "%.*s-%s: swap3 %c and %c",
2324 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2325 c, c3);
2326 #endif
2327 PROF_STORE(sp->ts_state)
2328 sp->ts_state = STATE_UNSWAP3;
2329 ++depth;
2330 if (has_mbyte)
2331 {
2332 tl = mb_char2len(c3);
2333 mch_memmove(p, p + n + fl, tl);
2334 mb_char2bytes(c2, p + tl);
2335 mb_char2bytes(c, p + fl + tl);
2336 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl;
2337 }
2338 else
2339 {
2340 p[0] = p[2];
2341 p[2] = c;
2342 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
2343 }
2344 }
2345 else
2346 {
2347 PROF_STORE(sp->ts_state)
2348 sp->ts_state = STATE_REP_INI;
2349 }
2350 break;
2351
2352 case STATE_UNSWAP3:
2353 // Undo STATE_SWAP3: "321" -> "123"
2354 p = fword + sp->ts_fidx;
2355 if (has_mbyte)
2356 {
2357 n = MB_PTR2LEN(p);
2358 c2 = mb_ptr2char(p + n);
2359 fl = MB_PTR2LEN(p + n);
2360 c = mb_ptr2char(p + n + fl);
2361 tl = MB_PTR2LEN(p + n + fl);
2362 mch_memmove(p + fl + tl, p, n);
2363 mb_char2bytes(c, p);
2364 mb_char2bytes(c2, p + tl);
2365 p = p + tl;
2366 }
2367 else
2368 {
2369 c = *p;
2370 *p = p[2];
2371 p[2] = c;
2372 ++p;
2373 }
2374
2375 if (!soundfold && !spell_iswordp(p, curwin))
2376 {
2377 // Middle char is not a word char, skip the rotate. First and
2378 // third char were already checked at swap and swap3.
2379 PROF_STORE(sp->ts_state)
2380 sp->ts_state = STATE_REP_INI;
2381 break;
2382 }
2383
2384 // Rotate three characters left: "123" -> "231". We change
2385 // "fword" here, it's changed back afterwards at STATE_UNROT3L.
2386 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
2387 {
2388 go_deeper(stack, depth, SCORE_SWAP3);
2389 #ifdef DEBUG_TRIEWALK
2390 p = fword + sp->ts_fidx;
2391 sprintf(changename[depth], "%.*s-%s: rotate left %c%c%c",
2392 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2393 p[0], p[1], p[2]);
2394 #endif
2395 PROF_STORE(sp->ts_state)
2396 sp->ts_state = STATE_UNROT3L;
2397 ++depth;
2398 p = fword + sp->ts_fidx;
2399 if (has_mbyte)
2400 {
2401 n = MB_CPTR2LEN(p);
2402 c = mb_ptr2char(p);
2403 fl = MB_CPTR2LEN(p + n);
2404 fl += MB_CPTR2LEN(p + n + fl);
2405 mch_memmove(p, p + n, fl);
2406 mb_char2bytes(c, p + fl);
2407 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
2408 }
2409 else
2410 {
2411 c = *p;
2412 *p = p[1];
2413 p[1] = p[2];
2414 p[2] = c;
2415 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
2416 }
2417 }
2418 else
2419 {
2420 PROF_STORE(sp->ts_state)
2421 sp->ts_state = STATE_REP_INI;
2422 }
2423 break;
2424
2425 case STATE_UNROT3L:
2426 // Undo ROT3L: "231" -> "123"
2427 p = fword + sp->ts_fidx;
2428 if (has_mbyte)
2429 {
2430 n = MB_PTR2LEN(p);
2431 n += MB_PTR2LEN(p + n);
2432 c = mb_ptr2char(p + n);
2433 tl = MB_PTR2LEN(p + n);
2434 mch_memmove(p + tl, p, n);
2435 mb_char2bytes(c, p);
2436 }
2437 else
2438 {
2439 c = p[2];
2440 p[2] = p[1];
2441 p[1] = *p;
2442 *p = c;
2443 }
2444
2445 // Rotate three bytes right: "123" -> "312". We change "fword"
2446 // here, it's changed back afterwards at STATE_UNROT3R.
2447 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
2448 {
2449 go_deeper(stack, depth, SCORE_SWAP3);
2450 #ifdef DEBUG_TRIEWALK
2451 p = fword + sp->ts_fidx;
2452 sprintf(changename[depth], "%.*s-%s: rotate right %c%c%c",
2453 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2454 p[0], p[1], p[2]);
2455 #endif
2456 PROF_STORE(sp->ts_state)
2457 sp->ts_state = STATE_UNROT3R;
2458 ++depth;
2459 p = fword + sp->ts_fidx;
2460 if (has_mbyte)
2461 {
2462 n = MB_CPTR2LEN(p);
2463 n += MB_CPTR2LEN(p + n);
2464 c = mb_ptr2char(p + n);
2465 tl = MB_CPTR2LEN(p + n);
2466 mch_memmove(p + tl, p, n);
2467 mb_char2bytes(c, p);
2468 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl;
2469 }
2470 else
2471 {
2472 c = p[2];
2473 p[2] = p[1];
2474 p[1] = *p;
2475 *p = c;
2476 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
2477 }
2478 }
2479 else
2480 {
2481 PROF_STORE(sp->ts_state)
2482 sp->ts_state = STATE_REP_INI;
2483 }
2484 break;
2485
2486 case STATE_UNROT3R:
2487 // Undo ROT3R: "312" -> "123"
2488 p = fword + sp->ts_fidx;
2489 if (has_mbyte)
2490 {
2491 c = mb_ptr2char(p);
2492 tl = MB_PTR2LEN(p);
2493 n = MB_PTR2LEN(p + tl);
2494 n += MB_PTR2LEN(p + tl + n);
2495 mch_memmove(p, p + tl, n);
2496 mb_char2bytes(c, p + n);
2497 }
2498 else
2499 {
2500 c = *p;
2501 *p = p[1];
2502 p[1] = p[2];
2503 p[2] = c;
2504 }
2505 // FALLTHROUGH
2506
2507 case STATE_REP_INI:
2508 // Check if matching with REP items from the .aff file would work.
2509 // Quickly skip if:
2510 // - there are no REP items and we are not in the soundfold trie
2511 // - the score is going to be too high anyway
2512 // - already applied a REP item or swapped here
2513 if ((lp->lp_replang == NULL && !soundfold)
2514 || sp->ts_score + SCORE_REP >= su->su_maxscore
2515 || sp->ts_fidx < sp->ts_fidxtry)
2516 {
2517 PROF_STORE(sp->ts_state)
2518 sp->ts_state = STATE_FINAL;
2519 break;
2520 }
2521
2522 // Use the first byte to quickly find the first entry that may
2523 // match. If the index is -1 there is none.
2524 if (soundfold)
2525 sp->ts_curi = slang->sl_repsal_first[fword[sp->ts_fidx]];
2526 else
2527 sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]];
2528
2529 if (sp->ts_curi < 0)
2530 {
2531 PROF_STORE(sp->ts_state)
2532 sp->ts_state = STATE_FINAL;
2533 break;
2534 }
2535
2536 PROF_STORE(sp->ts_state)
2537 sp->ts_state = STATE_REP;
2538 // FALLTHROUGH
2539
2540 case STATE_REP:
2541 // Try matching with REP items from the .aff file. For each match
2542 // replace the characters and check if the resulting word is
2543 // valid.
2544 p = fword + sp->ts_fidx;
2545
2546 if (soundfold)
2547 gap = &slang->sl_repsal;
2548 else
2549 gap = &lp->lp_replang->sl_rep;
2550 while (sp->ts_curi < gap->ga_len)
2551 {
2552 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++;
2553 if (*ftp->ft_from != *p)
2554 {
2555 // past possible matching entries
2556 sp->ts_curi = gap->ga_len;
2557 break;
2558 }
2559 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0
2560 && TRY_DEEPER(su, stack, depth, SCORE_REP))
2561 {
2562 go_deeper(stack, depth, SCORE_REP);
2563 #ifdef DEBUG_TRIEWALK
2564 sprintf(changename[depth], "%.*s-%s: replace %s with %s",
2565 sp->ts_twordlen, tword, fword + sp->ts_fidx,
2566 ftp->ft_from, ftp->ft_to);
2567 #endif
2568 // Need to undo this afterwards.
2569 PROF_STORE(sp->ts_state)
2570 sp->ts_state = STATE_REP_UNDO;
2571
2572 // Change the "from" to the "to" string.
2573 ++depth;
2574 fl = (int)STRLEN(ftp->ft_from);
2575 tl = (int)STRLEN(ftp->ft_to);
2576 if (fl != tl)
2577 {
2578 STRMOVE(p + tl, p + fl);
2579 repextra += tl - fl;
2580 }
2581 mch_memmove(p, ftp->ft_to, tl);
2582 stack[depth].ts_fidxtry = sp->ts_fidx + tl;
2583 stack[depth].ts_tcharlen = 0;
2584 break;
2585 }
2586 }
2587
2588 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP)
2589 {
2590 // No (more) matches.
2591 PROF_STORE(sp->ts_state)
2592 sp->ts_state = STATE_FINAL;
2593 }
2594
2595 break;
2596
2597 case STATE_REP_UNDO:
2598 // Undo a REP replacement and continue with the next one.
2599 if (soundfold)
2600 gap = &slang->sl_repsal;
2601 else
2602 gap = &lp->lp_replang->sl_rep;
2603 ftp = (fromto_T *)gap->ga_data + sp->ts_curi - 1;
2604 fl = (int)STRLEN(ftp->ft_from);
2605 tl = (int)STRLEN(ftp->ft_to);
2606 p = fword + sp->ts_fidx;
2607 if (fl != tl)
2608 {
2609 STRMOVE(p + fl, p + tl);
2610 repextra -= tl - fl;
2611 }
2612 mch_memmove(p, ftp->ft_from, fl);
2613 PROF_STORE(sp->ts_state)
2614 sp->ts_state = STATE_REP;
2615 break;
2616
2617 default:
2618 // Did all possible states at this level, go up one level.
2619 --depth;
2620
2621 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE)
2622 {
2623 // Continue in or go back to the prefix tree.
2624 byts = pbyts;
2625 idxs = pidxs;
2626 }
2627
2628 // Don't check for CTRL-C too often, it takes time.
2629 if (--breakcheckcount == 0)
2630 {
2631 ui_breakcheck();
2632 breakcheckcount = 1000;
2633 }
2634 }
2635 }
2636 }
2637
2638
2639 /*
2640 * Go one level deeper in the tree.
2641 */
2642 static void
2643 go_deeper(trystate_T *stack, int depth, int score_add)
2644 {
2645 stack[depth + 1] = stack[depth];
2646 stack[depth + 1].ts_state = STATE_START;
2647 stack[depth + 1].ts_score = stack[depth].ts_score + score_add;
2648 stack[depth + 1].ts_curi = 1; // start just after length byte
2649 stack[depth + 1].ts_flags = 0;
2650 }
2651
2652 /*
2653 * "fword" is a good word with case folded. Find the matching keep-case
2654 * words and put it in "kword".
2655 * Theoretically there could be several keep-case words that result in the
2656 * same case-folded word, but we only find one...
2657 */
2658 static void
2659 find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword)
2660 {
2661 char_u uword[MAXWLEN]; // "fword" in upper-case
2662 int depth;
2663 idx_T tryidx;
2664
2665 // The following arrays are used at each depth in the tree.
2666 idx_T arridx[MAXWLEN];
2667 int round[MAXWLEN];
2668 int fwordidx[MAXWLEN];
2669 int uwordidx[MAXWLEN];
2670 int kwordlen[MAXWLEN];
2671
2672 int flen, ulen;
2673 int l;
2674 int len;
2675 int c;
2676 idx_T lo, hi, m;
2677 char_u *p;
2678 char_u *byts = slang->sl_kbyts; // array with bytes of the words
2679 idx_T *idxs = slang->sl_kidxs; // array with indexes
2680
2681 if (byts == NULL)
2682 {
2683 // array is empty: "cannot happen"
2684 *kword = NUL;
2685 return;
2686 }
2687
2688 // Make an all-cap version of "fword".
2689 allcap_copy(fword, uword);
2690
2691 // Each character needs to be tried both case-folded and upper-case.
2692 // All this gets very complicated if we keep in mind that changing case
2693 // may change the byte length of a multi-byte character...
2694 depth = 0;
2695 arridx[0] = 0;
2696 round[0] = 0;
2697 fwordidx[0] = 0;
2698 uwordidx[0] = 0;
2699 kwordlen[0] = 0;
2700 while (depth >= 0)
2701 {
2702 if (fword[fwordidx[depth]] == NUL)
2703 {
2704 // We are at the end of "fword". If the tree allows a word to end
2705 // here we have found a match.
2706 if (byts[arridx[depth] + 1] == 0)
2707 {
2708 kword[kwordlen[depth]] = NUL;
2709 return;
2710 }
2711
2712 // kword is getting too long, continue one level up
2713 --depth;
2714 }
2715 else if (++round[depth] > 2)
2716 {
2717 // tried both fold-case and upper-case character, continue one
2718 // level up
2719 --depth;
2720 }
2721 else
2722 {
2723 // round[depth] == 1: Try using the folded-case character.
2724 // round[depth] == 2: Try using the upper-case character.
2725 if (has_mbyte)
2726 {
2727 flen = MB_CPTR2LEN(fword + fwordidx[depth]);
2728 ulen = MB_CPTR2LEN(uword + uwordidx[depth]);
2729 }
2730 else
2731 ulen = flen = 1;
2732 if (round[depth] == 1)
2733 {
2734 p = fword + fwordidx[depth];
2735 l = flen;
2736 }
2737 else
2738 {
2739 p = uword + uwordidx[depth];
2740 l = ulen;
2741 }
2742
2743 for (tryidx = arridx[depth]; l > 0; --l)
2744 {
2745 // Perform a binary search in the list of accepted bytes.
2746 len = byts[tryidx++];
2747 c = *p++;
2748 lo = tryidx;
2749 hi = tryidx + len - 1;
2750 while (lo < hi)
2751 {
2752 m = (lo + hi) / 2;
2753 if (byts[m] > c)
2754 hi = m - 1;
2755 else if (byts[m] < c)
2756 lo = m + 1;
2757 else
2758 {
2759 lo = hi = m;
2760 break;
2761 }
2762 }
2763
2764 // Stop if there is no matching byte.
2765 if (hi < lo || byts[lo] != c)
2766 break;
2767
2768 // Continue at the child (if there is one).
2769 tryidx = idxs[lo];
2770 }
2771
2772 if (l == 0)
2773 {
2774 // Found the matching char. Copy it to "kword" and go a
2775 // level deeper.
2776 if (round[depth] == 1)
2777 {
2778 STRNCPY(kword + kwordlen[depth], fword + fwordidx[depth],
2779 flen);
2780 kwordlen[depth + 1] = kwordlen[depth] + flen;
2781 }
2782 else
2783 {
2784 STRNCPY(kword + kwordlen[depth], uword + uwordidx[depth],
2785 ulen);
2786 kwordlen[depth + 1] = kwordlen[depth] + ulen;
2787 }
2788 fwordidx[depth + 1] = fwordidx[depth] + flen;
2789 uwordidx[depth + 1] = uwordidx[depth] + ulen;
2790
2791 ++depth;
2792 arridx[depth] = tryidx;
2793 round[depth] = 0;
2794 }
2795 }
2796 }
2797
2798 // Didn't find it: "cannot happen".
2799 *kword = NUL;
2800 }
2801
2802 /*
2803 * Compute the sound-a-like score for suggestions in su->su_ga and add them to
2804 * su->su_sga.
2805 */
2806 static void
2807 score_comp_sal(suginfo_T *su)
2808 {
2809 langp_T *lp;
2810 char_u badsound[MAXWLEN];
2811 int i;
2812 suggest_T *stp;
2813 suggest_T *sstp;
2814 int score;
2815 int lpi;
2816
2817 if (ga_grow(&su->su_sga, su->su_ga.ga_len) == FAIL)
2818 return;
2819
2820 // Use the sound-folding of the first language that supports it.
2821 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
2822 {
2823 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
2824 if (lp->lp_slang->sl_sal.ga_len > 0)
2825 {
2826 // soundfold the bad word
2827 spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, badsound);
2828
2829 for (i = 0; i < su->su_ga.ga_len; ++i)
2830 {
2831 stp = &SUG(su->su_ga, i);
2832
2833 // Case-fold the suggested word, sound-fold it and compute the
2834 // sound-a-like score.
2835 score = stp_sal_score(stp, su, lp->lp_slang, badsound);
2836 if (score < SCORE_MAXMAX)
2837 {
2838 // Add the suggestion.
2839 sstp = &SUG(su->su_sga, su->su_sga.ga_len);
2840 sstp->st_word = vim_strsave(stp->st_word);
2841 if (sstp->st_word != NULL)
2842 {
2843 sstp->st_wordlen = stp->st_wordlen;
2844 sstp->st_score = score;
2845 sstp->st_altscore = 0;
2846 sstp->st_orglen = stp->st_orglen;
2847 ++su->su_sga.ga_len;
2848 }
2849 }
2850 }
2851 break;
2852 }
2853 }
2854 }
2855
2856 /*
2857 * Combine the list of suggestions in su->su_ga and su->su_sga.
2858 * They are entwined.
2859 */
2860 static void
2861 score_combine(suginfo_T *su)
2862 {
2863 int i;
2864 int j;
2865 garray_T ga;
2866 garray_T *gap;
2867 langp_T *lp;
2868 suggest_T *stp;
2869 char_u *p;
2870 char_u badsound[MAXWLEN];
2871 int round;
2872 int lpi;
2873 slang_T *slang = NULL;
2874
2875 // Add the alternate score to su_ga.
2876 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
2877 {
2878 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
2879 if (lp->lp_slang->sl_sal.ga_len > 0)
2880 {
2881 // soundfold the bad word
2882 slang = lp->lp_slang;
2883 spell_soundfold(slang, su->su_fbadword, TRUE, badsound);
2884
2885 for (i = 0; i < su->su_ga.ga_len; ++i)
2886 {
2887 stp = &SUG(su->su_ga, i);
2888 stp->st_altscore = stp_sal_score(stp, su, slang, badsound);
2889 if (stp->st_altscore == SCORE_MAXMAX)
2890 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4;
2891 else
2892 stp->st_score = (stp->st_score * 3
2893 + stp->st_altscore) / 4;
2894 stp->st_salscore = FALSE;
2895 }
2896 break;
2897 }
2898 }
2899
2900 if (slang == NULL) // Using "double" without sound folding.
2901 {
2902 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore,
2903 su->su_maxcount);
2904 return;
2905 }
2906
2907 // Add the alternate score to su_sga.
2908 for (i = 0; i < su->su_sga.ga_len; ++i)
2909 {
2910 stp = &SUG(su->su_sga, i);
2911 stp->st_altscore = spell_edit_score(slang,
2912 su->su_badword, stp->st_word);
2913 if (stp->st_score == SCORE_MAXMAX)
2914 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8;
2915 else
2916 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8;
2917 stp->st_salscore = TRUE;
2918 }
2919
2920 // Remove bad suggestions, sort the suggestions and truncate at "maxcount"
2921 // for both lists.
2922 check_suggestions(su, &su->su_ga);
2923 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
2924 check_suggestions(su, &su->su_sga);
2925 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount);
2926
2927 ga_init2(&ga, (int)sizeof(suginfo_T), 1);
2928 if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL)
2929 return;
2930
2931 stp = &SUG(ga, 0);
2932 for (i = 0; i < su->su_ga.ga_len || i < su->su_sga.ga_len; ++i)
2933 {
2934 // round 1: get a suggestion from su_ga
2935 // round 2: get a suggestion from su_sga
2936 for (round = 1; round <= 2; ++round)
2937 {
2938 gap = round == 1 ? &su->su_ga : &su->su_sga;
2939 if (i < gap->ga_len)
2940 {
2941 // Don't add a word if it's already there.
2942 p = SUG(*gap, i).st_word;
2943 for (j = 0; j < ga.ga_len; ++j)
2944 if (STRCMP(stp[j].st_word, p) == 0)
2945 break;
2946 if (j == ga.ga_len)
2947 stp[ga.ga_len++] = SUG(*gap, i);
2948 else
2949 vim_free(p);
2950 }
2951 }
2952 }
2953
2954 ga_clear(&su->su_ga);
2955 ga_clear(&su->su_sga);
2956
2957 // Truncate the list to the number of suggestions that will be displayed.
2958 if (ga.ga_len > su->su_maxcount)
2959 {
2960 for (i = su->su_maxcount; i < ga.ga_len; ++i)
2961 vim_free(stp[i].st_word);
2962 ga.ga_len = su->su_maxcount;
2963 }
2964
2965 su->su_ga = ga;
2966 }
2967
2968 /*
2969 * For the goodword in "stp" compute the soundalike score compared to the
2970 * badword.
2971 */
2972 static int
2973 stp_sal_score(
2974 suggest_T *stp,
2975 suginfo_T *su,
2976 slang_T *slang,
2977 char_u *badsound) // sound-folded badword
2978 {
2979 char_u *p;
2980 char_u *pbad;
2981 char_u *pgood;
2982 char_u badsound2[MAXWLEN];
2983 char_u fword[MAXWLEN];
2984 char_u goodsound[MAXWLEN];
2985 char_u goodword[MAXWLEN];
2986 int lendiff;
2987
2988 lendiff = (int)(su->su_badlen - stp->st_orglen);
2989 if (lendiff >= 0)
2990 pbad = badsound;
2991 else
2992 {
2993 // soundfold the bad word with more characters following
2994 (void)spell_casefold(su->su_badptr, stp->st_orglen, fword, MAXWLEN);
2995
2996 // When joining two words the sound often changes a lot. E.g., "t he"
2997 // sounds like "t h" while "the" sounds like "@". Avoid that by
2998 // removing the space. Don't do it when the good word also contains a
2999 // space.
3000 if (VIM_ISWHITE(su->su_badptr[su->su_badlen])
3001 && *skiptowhite(stp->st_word) == NUL)
3002 for (p = fword; *(p = skiptowhite(p)) != NUL; )
3003 STRMOVE(p, p + 1);
3004
3005 spell_soundfold(slang, fword, TRUE, badsound2);
3006 pbad = badsound2;
3007 }
3008
3009 if (lendiff > 0 && stp->st_wordlen + lendiff < MAXWLEN)
3010 {
3011 // Add part of the bad word to the good word, so that we soundfold
3012 // what replaces the bad word.
3013 STRCPY(goodword, stp->st_word);
3014 vim_strncpy(goodword + stp->st_wordlen,
3015 su->su_badptr + su->su_badlen - lendiff, lendiff);
3016 pgood = goodword;
3017 }
3018 else
3019 pgood = stp->st_word;
3020
3021 // Sound-fold the word and compute the score for the difference.
3022 spell_soundfold(slang, pgood, FALSE, goodsound);
3023
3024 return soundalike_score(goodsound, pbad);
3025 }
3026
3027 // structure used to store soundfolded words that add_sound_suggest() has
3028 // handled already.
3029 typedef struct
3030 {
3031 short sft_score; // lowest score used
3032 char_u sft_word[1]; // soundfolded word, actually longer
3033 } sftword_T;
3034
3035 static sftword_T dumsft;
3036 #define HIKEY2SFT(p) ((sftword_T *)(p - (dumsft.sft_word - (char_u *)&dumsft)))
3037 #define HI2SFT(hi) HIKEY2SFT((hi)->hi_key)
3038
3039 /*
3040 * Prepare for calling suggest_try_soundalike().
3041 */
3042 static void
3043 suggest_try_soundalike_prep(void)
3044 {
3045 langp_T *lp;
3046 int lpi;
3047 slang_T *slang;
3048
3049 // Do this for all languages that support sound folding and for which a
3050 // .sug file has been loaded.
3051 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
3052 {
3053 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
3054 slang = lp->lp_slang;
3055 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
3056 // prepare the hashtable used by add_sound_suggest()
3057 hash_init(&slang->sl_sounddone);
3058 }
3059 }
3060
3061 /*
3062 * Find suggestions by comparing the word in a sound-a-like form.
3063 * Note: This doesn't support postponed prefixes.
3064 */
3065 static void
3066 suggest_try_soundalike(suginfo_T *su)
3067 {
3068 char_u salword[MAXWLEN];
3069 langp_T *lp;
3070 int lpi;
3071 slang_T *slang;
3072
3073 // Do this for all languages that support sound folding and for which a
3074 // .sug file has been loaded.
3075 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
3076 {
3077 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
3078 slang = lp->lp_slang;
3079 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
3080 {
3081 // soundfold the bad word
3082 spell_soundfold(slang, su->su_fbadword, TRUE, salword);
3083
3084 // try all kinds of inserts/deletes/swaps/etc.
3085 // TODO: also soundfold the next words, so that we can try joining
3086 // and splitting
3087 #ifdef SUGGEST_PROFILE
3088 prof_init();
3089 #endif
3090 suggest_trie_walk(su, lp, salword, TRUE);
3091 #ifdef SUGGEST_PROFILE
3092 prof_report("soundalike");
3093 #endif
3094 }
3095 }
3096 }
3097
3098 /*
3099 * Finish up after calling suggest_try_soundalike().
3100 */
3101 static void
3102 suggest_try_soundalike_finish(void)
3103 {
3104 langp_T *lp;
3105 int lpi;
3106 slang_T *slang;
3107 int todo;
3108 hashitem_T *hi;
3109
3110 // Do this for all languages that support sound folding and for which a
3111 // .sug file has been loaded.
3112 for (lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi)
3113 {
3114 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
3115 slang = lp->lp_slang;
3116 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
3117 {
3118 // Free the info about handled words.
3119 todo = (int)slang->sl_sounddone.ht_used;
3120 for (hi = slang->sl_sounddone.ht_array; todo > 0; ++hi)
3121 if (!HASHITEM_EMPTY(hi))
3122 {
3123 vim_free(HI2SFT(hi));
3124 --todo;
3125 }
3126
3127 // Clear the hashtable, it may also be used by another region.
3128 hash_clear(&slang->sl_sounddone);
3129 hash_init(&slang->sl_sounddone);
3130 }
3131 }
3132 }
3133
3134 /*
3135 * A match with a soundfolded word is found. Add the good word(s) that
3136 * produce this soundfolded word.
3137 */
3138 static void
3139 add_sound_suggest(
3140 suginfo_T *su,
3141 char_u *goodword,
3142 int score, // soundfold score
3143 langp_T *lp)
3144 {
3145 slang_T *slang = lp->lp_slang; // language for sound folding
3146 int sfwordnr;
3147 char_u *nrline;
3148 int orgnr;
3149 char_u theword[MAXWLEN];
3150 int i;
3151 int wlen;
3152 char_u *byts;
3153 idx_T *idxs;
3154 int n;
3155 int wordcount;
3156 int wc;
3157 int goodscore;
3158 hash_T hash;
3159 hashitem_T *hi;
3160 sftword_T *sft;
3161 int bc, gc;
3162 int limit;
3163
3164 // It's very well possible that the same soundfold word is found several
3165 // times with different scores. Since the following is quite slow only do
3166 // the words that have a better score than before. Use a hashtable to
3167 // remember the words that have been done.
3168 hash = hash_hash(goodword);
3169 hi = hash_lookup(&slang->sl_sounddone, goodword, hash);
3170 if (HASHITEM_EMPTY(hi))
3171 {
3172 sft = alloc(sizeof(sftword_T) + STRLEN(goodword));
3173 if (sft != NULL)
3174 {
3175 sft->sft_score = score;
3176 STRCPY(sft->sft_word, goodword);
3177 hash_add_item(&slang->sl_sounddone, hi, sft->sft_word, hash);
3178 }
3179 }
3180 else
3181 {
3182 sft = HI2SFT(hi);
3183 if (score >= sft->sft_score)
3184 return;
3185 sft->sft_score = score;
3186 }
3187
3188 // Find the word nr in the soundfold tree.
3189 sfwordnr = soundfold_find(slang, goodword);
3190 if (sfwordnr < 0)
3191 {
3192 internal_error("add_sound_suggest()");
3193 return;
3194 }
3195
3196 // go over the list of good words that produce this soundfold word
3197 nrline = ml_get_buf(slang->sl_sugbuf, (linenr_T)(sfwordnr + 1), FALSE);
3198 orgnr = 0;
3199 while (*nrline != NUL)
3200 {
3201 // The wordnr was stored in a minimal nr of bytes as an offset to the
3202 // previous wordnr.
3203 orgnr += bytes2offset(&nrline);
3204
3205 byts = slang->sl_fbyts;
3206 idxs = slang->sl_fidxs;
3207
3208 // Lookup the word "orgnr" one of the two tries.
3209 n = 0;
3210 wordcount = 0;
3211 for (wlen = 0; wlen < MAXWLEN - 3; ++wlen)
3212 {
3213 i = 1;
3214 if (wordcount == orgnr && byts[n + 1] == NUL)
3215 break; // found end of word
3216
3217 if (byts[n + 1] == NUL)
3218 ++wordcount;
3219
3220 // skip over the NUL bytes
3221 for ( ; byts[n + i] == NUL; ++i)
3222 if (i > byts[n]) // safety check
3223 {
3224 STRCPY(theword + wlen, "BAD");
3225 wlen += 3;
3226 goto badword;
3227 }
3228
3229 // One of the siblings must have the word.
3230 for ( ; i < byts[n]; ++i)
3231 {
3232 wc = idxs[idxs[n + i]]; // nr of words under this byte
3233 if (wordcount + wc > orgnr)
3234 break;
3235 wordcount += wc;
3236 }
3237
3238 theword[wlen] = byts[n + i];
3239 n = idxs[n + i];
3240 }
3241 badword:
3242 theword[wlen] = NUL;
3243
3244 // Go over the possible flags and regions.
3245 for (; i <= byts[n] && byts[n + i] == NUL; ++i)
3246 {
3247 char_u cword[MAXWLEN];
3248 char_u *p;
3249 int flags = (int)idxs[n + i];
3250
3251 // Skip words with the NOSUGGEST flag
3252 if (flags & WF_NOSUGGEST)
3253 continue;
3254
3255 if (flags & WF_KEEPCAP)
3256 {
3257 // Must find the word in the keep-case tree.
3258 find_keepcap_word(slang, theword, cword);
3259 p = cword;
3260 }
3261 else
3262 {
3263 flags |= su->su_badflags;
3264 if ((flags & WF_CAPMASK) != 0)
3265 {
3266 // Need to fix case according to "flags".
3267 make_case_word(theword, cword, flags);
3268 p = cword;
3269 }
3270 else
3271 p = theword;
3272 }
3273
3274 // Add the suggestion.
3275 if (sps_flags & SPS_DOUBLE)
3276 {
3277 // Add the suggestion if the score isn't too bad.
3278 if (score <= su->su_maxscore)
3279 add_suggestion(su, &su->su_sga, p, su->su_badlen,
3280 score, 0, FALSE, slang, FALSE);
3281 }
3282 else
3283 {
3284 // Add a penalty for words in another region.
3285 if ((flags & WF_REGION)
3286 && (((unsigned)flags >> 16) & lp->lp_region) == 0)
3287 goodscore = SCORE_REGION;
3288 else
3289 goodscore = 0;
3290
3291 // Add a small penalty for changing the first letter from
3292 // lower to upper case. Helps for "tath" -> "Kath", which is
3293 // less common than "tath" -> "path". Don't do it when the
3294 // letter is the same, that has already been counted.
3295 gc = PTR2CHAR(p);
3296 if (SPELL_ISUPPER(gc))
3297 {
3298 bc = PTR2CHAR(su->su_badword);
3299 if (!SPELL_ISUPPER(bc)
3300 && SPELL_TOFOLD(bc) != SPELL_TOFOLD(gc))
3301 goodscore += SCORE_ICASE / 2;
3302 }
3303
3304 // Compute the score for the good word. This only does letter
3305 // insert/delete/swap/replace. REP items are not considered,
3306 // which may make the score a bit higher.
3307 // Use a limit for the score to make it work faster. Use
3308 // MAXSCORE(), because RESCORE() will change the score.
3309 // If the limit is very high then the iterative method is
3310 // inefficient, using an array is quicker.
3311 limit = MAXSCORE(su->su_sfmaxscore - goodscore, score);
3312 if (limit > SCORE_LIMITMAX)
3313 goodscore += spell_edit_score(slang, su->su_badword, p);
3314 else
3315 goodscore += spell_edit_score_limit(slang, su->su_badword,
3316 p, limit);
3317
3318 // When going over the limit don't bother to do the rest.
3319 if (goodscore < SCORE_MAXMAX)
3320 {
3321 // Give a bonus to words seen before.
3322 goodscore = score_wordcount_adj(slang, goodscore, p, FALSE);
3323
3324 // Add the suggestion if the score isn't too bad.
3325 goodscore = RESCORE(goodscore, score);
3326 if (goodscore <= su->su_sfmaxscore)
3327 add_suggestion(su, &su->su_ga, p, su->su_badlen,
3328 goodscore, score, TRUE, slang, TRUE);
3329 }
3330 }
3331 }
3332 // smsg("word %s (%d): %s (%d)", sftword, sftnr, theword, orgnr);
3333 }
3334 }
3335
3336 /*
3337 * Find word "word" in fold-case tree for "slang" and return the word number.
3338 */
3339 static int
3340 soundfold_find(slang_T *slang, char_u *word)
3341 {
3342 idx_T arridx = 0;
3343 int len;
3344 int wlen = 0;
3345 int c;
3346 char_u *ptr = word;
3347 char_u *byts;
3348 idx_T *idxs;
3349 int wordnr = 0;
3350
3351 byts = slang->sl_sbyts;
3352 idxs = slang->sl_sidxs;
3353
3354 for (;;)
3355 {
3356 // First byte is the number of possible bytes.
3357 len = byts[arridx++];
3358
3359 // If the first possible byte is a zero the word could end here.
3360 // If the word ends we found the word. If not skip the NUL bytes.
3361 c = ptr[wlen];
3362 if (byts[arridx] == NUL)
3363 {
3364 if (c == NUL)
3365 break;
3366
3367 // Skip over the zeros, there can be several.
3368 while (len > 0 && byts[arridx] == NUL)
3369 {
3370 ++arridx;
3371 --len;
3372 }
3373 if (len == 0)
3374 return -1; // no children, word should have ended here
3375 ++wordnr;
3376 }
3377
3378 // If the word ends we didn't find it.
3379 if (c == NUL)
3380 return -1;
3381
3382 // Perform a binary search in the list of accepted bytes.
3383 if (c == TAB) // <Tab> is handled like <Space>
3384 c = ' ';
3385 while (byts[arridx] < c)
3386 {
3387 // The word count is in the first idxs[] entry of the child.
3388 wordnr += idxs[idxs[arridx]];
3389 ++arridx;
3390 if (--len == 0) // end of the bytes, didn't find it
3391 return -1;
3392 }
3393 if (byts[arridx] != c) // didn't find the byte
3394 return -1;
3395
3396 // Continue at the child (if there is one).
3397 arridx = idxs[arridx];
3398 ++wlen;
3399
3400 // One space in the good word may stand for several spaces in the
3401 // checked word.
3402 if (c == ' ')
3403 while (ptr[wlen] == ' ' || ptr[wlen] == TAB)
3404 ++wlen;
3405 }
3406
3407 return wordnr;
3408 }
3409
3410 /*
3411 * Return TRUE if "c1" and "c2" are similar characters according to the MAP
3412 * lines in the .aff file.
3413 */
3414 static int
3415 similar_chars(slang_T *slang, int c1, int c2)
3416 {
3417 int m1, m2;
3418 char_u buf[MB_MAXBYTES + 1];
3419 hashitem_T *hi;
3420
3421 if (c1 >= 256)
3422 {
3423 buf[mb_char2bytes(c1, buf)] = 0;
3424 hi = hash_find(&slang->sl_map_hash, buf);
3425 if (HASHITEM_EMPTY(hi))
3426 m1 = 0;
3427 else
3428 m1 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1);
3429 }
3430 else
3431 m1 = slang->sl_map_array[c1];
3432 if (m1 == 0)
3433 return FALSE;
3434
3435
3436 if (c2 >= 256)
3437 {
3438 buf[mb_char2bytes(c2, buf)] = 0;
3439 hi = hash_find(&slang->sl_map_hash, buf);
3440 if (HASHITEM_EMPTY(hi))
3441 m2 = 0;
3442 else
3443 m2 = mb_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1);
3444 }
3445 else
3446 m2 = slang->sl_map_array[c2];
3447
3448 return m1 == m2;
3449 }
3450
3451 /*
3452 * Add a suggestion to the list of suggestions.
3453 * For a suggestion that is already in the list the lowest score is remembered.
3454 */
3455 static void
3456 add_suggestion(
3457 suginfo_T *su,
3458 garray_T *gap, // either su_ga or su_sga
3459 char_u *goodword,
3460 int badlenarg, // len of bad word replaced with "goodword"
3461 int score,
3462 int altscore,
3463 int had_bonus, // value for st_had_bonus
3464 slang_T *slang, // language for sound folding
3465 int maxsf) // su_maxscore applies to soundfold score,
3466 // su_sfmaxscore to the total score.
3467 {
3468 int goodlen; // len of goodword changed
3469 int badlen; // len of bad word changed
3470 suggest_T *stp;
3471 suggest_T new_sug;
3472 int i;
3473 char_u *pgood, *pbad;
3474
3475 // Minimize "badlen" for consistency. Avoids that changing "the the" to
3476 // "thee the" is added next to changing the first "the" the "thee".
3477 pgood = goodword + STRLEN(goodword);
3478 pbad = su->su_badptr + badlenarg;
3479 for (;;)
3480 {
3481 goodlen = (int)(pgood - goodword);
3482 badlen = (int)(pbad - su->su_badptr);
3483 if (goodlen <= 0 || badlen <= 0)
3484 break;
3485 MB_PTR_BACK(goodword, pgood);
3486 MB_PTR_BACK(su->su_badptr, pbad);
3487 if (has_mbyte)
3488 {
3489 if (mb_ptr2char(pgood) != mb_ptr2char(pbad))
3490 break;
3491 }
3492 else if (*pgood != *pbad)
3493 break;
3494 }
3495
3496 if (badlen == 0 && goodlen == 0)
3497 // goodword doesn't change anything; may happen for "the the" changing
3498 // the first "the" to itself.
3499 return;
3500
3501 if (gap->ga_len == 0)
3502 i = -1;
3503 else
3504 {
3505 // Check if the word is already there. Also check the length that is
3506 // being replaced "thes," -> "these" is a different suggestion from
3507 // "thes" -> "these".
3508 stp = &SUG(*gap, 0);
3509 for (i = gap->ga_len; --i >= 0; ++stp)
3510 if (stp->st_wordlen == goodlen
3511 && stp->st_orglen == badlen
3512 && STRNCMP(stp->st_word, goodword, goodlen) == 0)
3513 {
3514 // Found it. Remember the word with the lowest score.
3515 if (stp->st_slang == NULL)
3516 stp->st_slang = slang;
3517
3518 new_sug.st_score = score;
3519 new_sug.st_altscore = altscore;
3520 new_sug.st_had_bonus = had_bonus;
3521
3522 if (stp->st_had_bonus != had_bonus)
3523 {
3524 // Only one of the two had the soundalike score computed.
3525 // Need to do that for the other one now, otherwise the
3526 // scores can't be compared. This happens because
3527 // suggest_try_change() doesn't compute the soundalike
3528 // word to keep it fast, while some special methods set
3529 // the soundalike score to zero.
3530 if (had_bonus)
3531 rescore_one(su, stp);
3532 else
3533 {
3534 new_sug.st_word = stp->st_word;
3535 new_sug.st_wordlen = stp->st_wordlen;
3536 new_sug.st_slang = stp->st_slang;
3537 new_sug.st_orglen = badlen;
3538 rescore_one(su, &new_sug);
3539 }
3540 }
3541
3542 if (stp->st_score > new_sug.st_score)
3543 {
3544 stp->st_score = new_sug.st_score;
3545 stp->st_altscore = new_sug.st_altscore;
3546 stp->st_had_bonus = new_sug.st_had_bonus;
3547 }
3548 break;
3549 }
3550 }
3551
3552 if (i < 0 && ga_grow(gap, 1) == OK)
3553 {
3554 // Add a suggestion.
3555 stp = &SUG(*gap, gap->ga_len);
3556 stp->st_word = vim_strnsave(goodword, goodlen);
3557 if (stp->st_word != NULL)
3558 {
3559 stp->st_wordlen = goodlen;
3560 stp->st_score = score;
3561 stp->st_altscore = altscore;
3562 stp->st_had_bonus = had_bonus;
3563 stp->st_orglen = badlen;
3564 stp->st_slang = slang;
3565 ++gap->ga_len;
3566
3567 // If we have too many suggestions now, sort the list and keep
3568 // the best suggestions.
3569 if (gap->ga_len > SUG_MAX_COUNT(su))
3570 {
3571 if (maxsf)
3572 su->su_sfmaxscore = cleanup_suggestions(gap,
3573 su->su_sfmaxscore, SUG_CLEAN_COUNT(su));
3574 else
3575 su->su_maxscore = cleanup_suggestions(gap,
3576 su->su_maxscore, SUG_CLEAN_COUNT(su));
3577 }
3578 }
3579 }
3580 }
3581
3582 /*
3583 * Suggestions may in fact be flagged as errors. Esp. for banned words and
3584 * for split words, such as "the the". Remove these from the list here.
3585 */
3586 static void
3587 check_suggestions(
3588 suginfo_T *su,
3589 garray_T *gap) // either su_ga or su_sga
3590 {
3591 suggest_T *stp;
3592 int i;
3593 char_u longword[MAXWLEN + 1];
3594 int len;
3595 hlf_T attr;
3596
3597 stp = &SUG(*gap, 0);
3598 for (i = gap->ga_len - 1; i >= 0; --i)
3599 {
3600 // Need to append what follows to check for "the the".
3601 vim_strncpy(longword, stp[i].st_word, MAXWLEN);
3602 len = stp[i].st_wordlen;
3603 vim_strncpy(longword + len, su->su_badptr + stp[i].st_orglen,
3604 MAXWLEN - len);
3605 attr = HLF_COUNT;
3606 (void)spell_check(curwin, longword, &attr, NULL, FALSE);
3607 if (attr != HLF_COUNT)
3608 {
3609 // Remove this entry.
3610 vim_free(stp[i].st_word);
3611 --gap->ga_len;
3612 if (i < gap->ga_len)
3613 mch_memmove(stp + i, stp + i + 1,
3614 sizeof(suggest_T) * (gap->ga_len - i));
3615 }
3616 }
3617 }
3618
3619
3620 /*
3621 * Add a word to be banned.
3622 */
3623 static void
3624 add_banned(
3625 suginfo_T *su,
3626 char_u *word)
3627 {
3628 char_u *s;
3629 hash_T hash;
3630 hashitem_T *hi;
3631
3632 hash = hash_hash(word);
3633 hi = hash_lookup(&su->su_banned, word, hash);
3634 if (HASHITEM_EMPTY(hi))
3635 {
3636 s = vim_strsave(word);
3637 if (s != NULL)
3638 hash_add_item(&su->su_banned, hi, s, hash);
3639 }
3640 }
3641
3642 /*
3643 * Recompute the score for all suggestions if sound-folding is possible. This
3644 * is slow, thus only done for the final results.
3645 */
3646 static void
3647 rescore_suggestions(suginfo_T *su)
3648 {
3649 int i;
3650
3651 if (su->su_sallang != NULL)
3652 for (i = 0; i < su->su_ga.ga_len; ++i)
3653 rescore_one(su, &SUG(su->su_ga, i));
3654 }
3655
3656 /*
3657 * Recompute the score for one suggestion if sound-folding is possible.
3658 */
3659 static void
3660 rescore_one(suginfo_T *su, suggest_T *stp)
3661 {
3662 slang_T *slang = stp->st_slang;
3663 char_u sal_badword[MAXWLEN];
3664 char_u *p;
3665
3666 // Only rescore suggestions that have no sal score yet and do have a
3667 // language.
3668 if (slang != NULL && slang->sl_sal.ga_len > 0 && !stp->st_had_bonus)
3669 {
3670 if (slang == su->su_sallang)
3671 p = su->su_sal_badword;
3672 else
3673 {
3674 spell_soundfold(slang, su->su_fbadword, TRUE, sal_badword);
3675 p = sal_badword;
3676 }
3677
3678 stp->st_altscore = stp_sal_score(stp, su, slang, p);
3679 if (stp->st_altscore == SCORE_MAXMAX)
3680 stp->st_altscore = SCORE_BIG;
3681 stp->st_score = RESCORE(stp->st_score, stp->st_altscore);
3682 stp->st_had_bonus = TRUE;
3683 }
3684 }
3685
3686 static int sug_compare(const void *s1, const void *s2);
3687
3688 /*
3689 * Function given to qsort() to sort the suggestions on st_score.
3690 * First on "st_score", then "st_altscore" then alphabetically.
3691 */
3692 static int
3693 sug_compare(const void *s1, const void *s2)
3694 {
3695 suggest_T *p1 = (suggest_T *)s1;
3696 suggest_T *p2 = (suggest_T *)s2;
3697 int n = p1->st_score - p2->st_score;
3698
3699 if (n == 0)
3700 {
3701 n = p1->st_altscore - p2->st_altscore;
3702 if (n == 0)
3703 n = STRICMP(p1->st_word, p2->st_word);
3704 }
3705 return n;
3706 }
3707
3708 /*
3709 * Cleanup the suggestions:
3710 * - Sort on score.
3711 * - Remove words that won't be displayed.
3712 * Returns the maximum score in the list or "maxscore" unmodified.
3713 */
3714 static int
3715 cleanup_suggestions(
3716 garray_T *gap,
3717 int maxscore,
3718 int keep) // nr of suggestions to keep
3719 {
3720 suggest_T *stp = &SUG(*gap, 0);
3721 int i;
3722
3723 // Sort the list.
3724 qsort(gap->ga_data, (size_t)gap->ga_len, sizeof(suggest_T), sug_compare);
3725
3726 // Truncate the list to the number of suggestions that will be displayed.
3727 if (gap->ga_len > keep)
3728 {
3729 for (i = keep; i < gap->ga_len; ++i)
3730 vim_free(stp[i].st_word);
3731 gap->ga_len = keep;
3732 return stp[keep - 1].st_score;
3733 }
3734 return maxscore;
3735 }
3736
3737 /*
3738 * Compute a score for two sound-a-like words.
3739 * This permits up to two inserts/deletes/swaps/etc. to keep things fast.
3740 * Instead of a generic loop we write out the code. That keeps it fast by
3741 * avoiding checks that will not be possible.
3742 */
3743 static int
3744 soundalike_score(
3745 char_u *goodstart, // sound-folded good word
3746 char_u *badstart) // sound-folded bad word
3747 {
3748 char_u *goodsound = goodstart;
3749 char_u *badsound = badstart;
3750 int goodlen;
3751 int badlen;
3752 int n;
3753 char_u *pl, *ps;
3754 char_u *pl2, *ps2;
3755 int score = 0;
3756
3757 // Adding/inserting "*" at the start (word starts with vowel) shouldn't be
3758 // counted so much, vowels halfway the word aren't counted at all.
3759 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound)
3760 {
3761 if ((badsound[0] == NUL && goodsound[1] == NUL)
3762 || (goodsound[0] == NUL && badsound[1] == NUL))
3763 // changing word with vowel to word without a sound
3764 return SCORE_DEL;
3765 if (badsound[0] == NUL || goodsound[0] == NUL)
3766 // more than two changes
3767 return SCORE_MAXMAX;
3768
3769 if (badsound[1] == goodsound[1]
3770 || (badsound[1] != NUL
3771 && goodsound[1] != NUL
3772 && badsound[2] == goodsound[2]))
3773 {
3774 // handle like a substitute
3775 }
3776 else
3777 {
3778 score = 2 * SCORE_DEL / 3;
3779 if (*badsound == '*')
3780 ++badsound;
3781 else
3782 ++goodsound;
3783 }
3784 }
3785
3786 goodlen = (int)STRLEN(goodsound);
3787 badlen = (int)STRLEN(badsound);
3788
3789 // Return quickly if the lengths are too different to be fixed by two
3790 // changes.
3791 n = goodlen - badlen;
3792 if (n < -2 || n > 2)
3793 return SCORE_MAXMAX;
3794
3795 if (n > 0)
3796 {
3797 pl = goodsound; // goodsound is longest
3798 ps = badsound;
3799 }
3800 else
3801 {
3802 pl = badsound; // badsound is longest
3803 ps = goodsound;
3804 }
3805
3806 // Skip over the identical part.
3807 while (*pl == *ps && *pl != NUL)
3808 {
3809 ++pl;
3810 ++ps;
3811 }
3812
3813 switch (n)
3814 {
3815 case -2:
3816 case 2:
3817 // Must delete two characters from "pl".
3818 ++pl; // first delete
3819 while (*pl == *ps)
3820 {
3821 ++pl;
3822 ++ps;
3823 }
3824 // strings must be equal after second delete
3825 if (STRCMP(pl + 1, ps) == 0)
3826 return score + SCORE_DEL * 2;
3827
3828 // Failed to compare.
3829 break;
3830
3831 case -1:
3832 case 1:
3833 // Minimal one delete from "pl" required.
3834
3835 // 1: delete
3836 pl2 = pl + 1;
3837 ps2 = ps;
3838 while (*pl2 == *ps2)
3839 {
3840 if (*pl2 == NUL) // reached the end
3841 return score + SCORE_DEL;
3842 ++pl2;
3843 ++ps2;
3844 }
3845
3846 // 2: delete then swap, then rest must be equal
3847 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
3848 && STRCMP(pl2 + 2, ps2 + 2) == 0)
3849 return score + SCORE_DEL + SCORE_SWAP;
3850
3851 // 3: delete then substitute, then the rest must be equal
3852 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
3853 return score + SCORE_DEL + SCORE_SUBST;
3854
3855 // 4: first swap then delete
3856 if (pl[0] == ps[1] && pl[1] == ps[0])
3857 {
3858 pl2 = pl + 2; // swap, skip two chars
3859 ps2 = ps + 2;
3860 while (*pl2 == *ps2)
3861 {
3862 ++pl2;
3863 ++ps2;
3864 }
3865 // delete a char and then strings must be equal
3866 if (STRCMP(pl2 + 1, ps2) == 0)
3867 return score + SCORE_SWAP + SCORE_DEL;
3868 }
3869
3870 // 5: first substitute then delete
3871 pl2 = pl + 1; // substitute, skip one char
3872 ps2 = ps + 1;
3873 while (*pl2 == *ps2)
3874 {
3875 ++pl2;
3876 ++ps2;
3877 }
3878 // delete a char and then strings must be equal
3879 if (STRCMP(pl2 + 1, ps2) == 0)
3880 return score + SCORE_SUBST + SCORE_DEL;
3881
3882 // Failed to compare.
3883 break;
3884
3885 case 0:
3886 // Lengths are equal, thus changes must result in same length: An
3887 // insert is only possible in combination with a delete.
3888 // 1: check if for identical strings
3889 if (*pl == NUL)
3890 return score;
3891
3892 // 2: swap
3893 if (pl[0] == ps[1] && pl[1] == ps[0])
3894 {
3895 pl2 = pl + 2; // swap, skip two chars
3896 ps2 = ps + 2;
3897 while (*pl2 == *ps2)
3898 {
3899 if (*pl2 == NUL) // reached the end
3900 return score + SCORE_SWAP;
3901 ++pl2;
3902 ++ps2;
3903 }
3904 // 3: swap and swap again
3905 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
3906 && STRCMP(pl2 + 2, ps2 + 2) == 0)
3907 return score + SCORE_SWAP + SCORE_SWAP;
3908
3909 // 4: swap and substitute
3910 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
3911 return score + SCORE_SWAP + SCORE_SUBST;
3912 }
3913
3914 // 5: substitute
3915 pl2 = pl + 1;
3916 ps2 = ps + 1;
3917 while (*pl2 == *ps2)
3918 {
3919 if (*pl2 == NUL) // reached the end
3920 return score + SCORE_SUBST;
3921 ++pl2;
3922 ++ps2;
3923 }
3924
3925 // 6: substitute and swap
3926 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
3927 && STRCMP(pl2 + 2, ps2 + 2) == 0)
3928 return score + SCORE_SUBST + SCORE_SWAP;
3929
3930 // 7: substitute and substitute
3931 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
3932 return score + SCORE_SUBST + SCORE_SUBST;
3933
3934 // 8: insert then delete
3935 pl2 = pl;
3936 ps2 = ps + 1;
3937 while (*pl2 == *ps2)
3938 {
3939 ++pl2;
3940 ++ps2;
3941 }
3942 if (STRCMP(pl2 + 1, ps2) == 0)
3943 return score + SCORE_INS + SCORE_DEL;
3944
3945 // 9: delete then insert
3946 pl2 = pl + 1;
3947 ps2 = ps;
3948 while (*pl2 == *ps2)
3949 {
3950 ++pl2;
3951 ++ps2;
3952 }
3953 if (STRCMP(pl2, ps2 + 1) == 0)
3954 return score + SCORE_INS + SCORE_DEL;
3955
3956 // Failed to compare.
3957 break;
3958 }
3959
3960 return SCORE_MAXMAX;
3961 }
3962
3963 /*
3964 * Compute the "edit distance" to turn "badword" into "goodword". The less
3965 * deletes/inserts/substitutes/swaps are required the lower the score.
3966 *
3967 * The algorithm is described by Du and Chang, 1992.
3968 * The implementation of the algorithm comes from Aspell editdist.cpp,
3969 * edit_distance(). It has been converted from C++ to C and modified to
3970 * support multi-byte characters.
3971 */
3972 static int
3973 spell_edit_score(
3974 slang_T *slang,
3975 char_u *badword,
3976 char_u *goodword)
3977 {
3978 int *cnt;
3979 int badlen, goodlen; // lengths including NUL
3980 int j, i;
3981 int t;
3982 int bc, gc;
3983 int pbc, pgc;
3984 char_u *p;
3985 int wbadword[MAXWLEN];
3986 int wgoodword[MAXWLEN];
3987
3988 if (has_mbyte)
3989 {
3990 // Get the characters from the multi-byte strings and put them in an
3991 // int array for easy access.
3992 for (p = badword, badlen = 0; *p != NUL; )
3993 wbadword[badlen++] = mb_cptr2char_adv(&p);
3994 wbadword[badlen++] = 0;
3995 for (p = goodword, goodlen = 0; *p != NUL; )
3996 wgoodword[goodlen++] = mb_cptr2char_adv(&p);
3997 wgoodword[goodlen++] = 0;
3998 }
3999 else
4000 {
4001 badlen = (int)STRLEN(badword) + 1;
4002 goodlen = (int)STRLEN(goodword) + 1;
4003 }
4004
4005 // We use "cnt" as an array: CNT(badword_idx, goodword_idx).
4006 #define CNT(a, b) cnt[(a) + (b) * (badlen + 1)]
4007 cnt = ALLOC_MULT(int, (badlen + 1) * (goodlen + 1));
4008 if (cnt == NULL)
4009 return 0; // out of memory
4010
4011 CNT(0, 0) = 0;
4012 for (j = 1; j <= goodlen; ++j)
4013 CNT(0, j) = CNT(0, j - 1) + SCORE_INS;
4014
4015 for (i = 1; i <= badlen; ++i)
4016 {
4017 CNT(i, 0) = CNT(i - 1, 0) + SCORE_DEL;
4018 for (j = 1; j <= goodlen; ++j)
4019 {
4020 if (has_mbyte)
4021 {
4022 bc = wbadword[i - 1];
4023 gc = wgoodword[j - 1];
4024 }
4025 else
4026 {
4027 bc = badword[i - 1];
4028 gc = goodword[j - 1];
4029 }
4030 if (bc == gc)
4031 CNT(i, j) = CNT(i - 1, j - 1);
4032 else
4033 {
4034 // Use a better score when there is only a case difference.
4035 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
4036 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1);
4037 else
4038 {
4039 // For a similar character use SCORE_SIMILAR.
4040 if (slang != NULL
4041 && slang->sl_has_map
4042 && similar_chars(slang, gc, bc))
4043 CNT(i, j) = SCORE_SIMILAR + CNT(i - 1, j - 1);
4044 else
4045 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1);
4046 }
4047
4048 if (i > 1 && j > 1)
4049 {
4050 if (has_mbyte)
4051 {
4052 pbc = wbadword[i - 2];
4053 pgc = wgoodword[j - 2];
4054 }
4055 else
4056 {
4057 pbc = badword[i - 2];
4058 pgc = goodword[j - 2];
4059 }
4060 if (bc == pgc && pbc == gc)
4061 {
4062 t = SCORE_SWAP + CNT(i - 2, j - 2);
4063 if (t < CNT(i, j))
4064 CNT(i, j) = t;
4065 }
4066 }
4067 t = SCORE_DEL + CNT(i - 1, j);
4068 if (t < CNT(i, j))
4069 CNT(i, j) = t;
4070 t = SCORE_INS + CNT(i, j - 1);
4071 if (t < CNT(i, j))
4072 CNT(i, j) = t;
4073 }
4074 }
4075 }
4076
4077 i = CNT(badlen - 1, goodlen - 1);
4078 vim_free(cnt);
4079 return i;
4080 }
4081
4082 typedef struct
4083 {
4084 int badi;
4085 int goodi;
4086 int score;
4087 } limitscore_T;
4088
4089 /*
4090 * Like spell_edit_score(), but with a limit on the score to make it faster.
4091 * May return SCORE_MAXMAX when the score is higher than "limit".
4092 *
4093 * This uses a stack for the edits still to be tried.
4094 * The idea comes from Aspell leditdist.cpp. Rewritten in C and added support
4095 * for multi-byte characters.
4096 */
4097 static int
4098 spell_edit_score_limit(
4099 slang_T *slang,
4100 char_u *badword,
4101 char_u *goodword,
4102 int limit)
4103 {
4104 limitscore_T stack[10]; // allow for over 3 * 2 edits
4105 int stackidx;
4106 int bi, gi;
4107 int bi2, gi2;
4108 int bc, gc;
4109 int score;
4110 int score_off;
4111 int minscore;
4112 int round;
4113
4114 // Multi-byte characters require a bit more work, use a different function
4115 // to avoid testing "has_mbyte" quite often.
4116 if (has_mbyte)
4117 return spell_edit_score_limit_w(slang, badword, goodword, limit);
4118
4119 // The idea is to go from start to end over the words. So long as
4120 // characters are equal just continue, this always gives the lowest score.
4121 // When there is a difference try several alternatives. Each alternative
4122 // increases "score" for the edit distance. Some of the alternatives are
4123 // pushed unto a stack and tried later, some are tried right away. At the
4124 // end of the word the score for one alternative is known. The lowest
4125 // possible score is stored in "minscore".
4126 stackidx = 0;
4127 bi = 0;
4128 gi = 0;
4129 score = 0;
4130 minscore = limit + 1;
4131
4132 for (;;)
4133 {
4134 // Skip over an equal part, score remains the same.
4135 for (;;)
4136 {
4137 bc = badword[bi];
4138 gc = goodword[gi];
4139 if (bc != gc) // stop at a char that's different
4140 break;
4141 if (bc == NUL) // both words end
4142 {
4143 if (score < minscore)
4144 minscore = score;
4145 goto pop; // do next alternative
4146 }
4147 ++bi;
4148 ++gi;
4149 }
4150
4151 if (gc == NUL) // goodword ends, delete badword chars
4152 {
4153 do
4154 {
4155 if ((score += SCORE_DEL) >= minscore)
4156 goto pop; // do next alternative
4157 } while (badword[++bi] != NUL);
4158 minscore = score;
4159 }
4160 else if (bc == NUL) // badword ends, insert badword chars
4161 {
4162 do
4163 {
4164 if ((score += SCORE_INS) >= minscore)
4165 goto pop; // do next alternative
4166 } while (goodword[++gi] != NUL);
4167 minscore = score;
4168 }
4169 else // both words continue
4170 {
4171 // If not close to the limit, perform a change. Only try changes
4172 // that may lead to a lower score than "minscore".
4173 // round 0: try deleting a char from badword
4174 // round 1: try inserting a char in badword
4175 for (round = 0; round <= 1; ++round)
4176 {
4177 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
4178 if (score_off < minscore)
4179 {
4180 if (score_off + SCORE_EDIT_MIN >= minscore)
4181 {
4182 // Near the limit, rest of the words must match. We
4183 // can check that right now, no need to push an item
4184 // onto the stack.
4185 bi2 = bi + 1 - round;
4186 gi2 = gi + round;
4187 while (goodword[gi2] == badword[bi2])
4188 {
4189 if (goodword[gi2] == NUL)
4190 {
4191 minscore = score_off;
4192 break;
4193 }
4194 ++bi2;
4195 ++gi2;
4196 }
4197 }
4198 else
4199 {
4200 // try deleting/inserting a character later
4201 stack[stackidx].badi = bi + 1 - round;
4202 stack[stackidx].goodi = gi + round;
4203 stack[stackidx].score = score_off;
4204 ++stackidx;
4205 }
4206 }
4207 }
4208
4209 if (score + SCORE_SWAP < minscore)
4210 {
4211 // If swapping two characters makes a match then the
4212 // substitution is more expensive, thus there is no need to
4213 // try both.
4214 if (gc == badword[bi + 1] && bc == goodword[gi + 1])
4215 {
4216 // Swap two characters, that is: skip them.
4217 gi += 2;
4218 bi += 2;
4219 score += SCORE_SWAP;
4220 continue;
4221 }
4222 }
4223
4224 // Substitute one character for another which is the same
4225 // thing as deleting a character from both goodword and badword.
4226 // Use a better score when there is only a case difference.
4227 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
4228 score += SCORE_ICASE;
4229 else
4230 {
4231 // For a similar character use SCORE_SIMILAR.
4232 if (slang != NULL
4233 && slang->sl_has_map
4234 && similar_chars(slang, gc, bc))
4235 score += SCORE_SIMILAR;
4236 else
4237 score += SCORE_SUBST;
4238 }
4239
4240 if (score < minscore)
4241 {
4242 // Do the substitution.
4243 ++gi;
4244 ++bi;
4245 continue;
4246 }
4247 }
4248 pop:
4249 // Get here to try the next alternative, pop it from the stack.
4250 if (stackidx == 0) // stack is empty, finished
4251 break;
4252
4253 // pop an item from the stack
4254 --stackidx;
4255 gi = stack[stackidx].goodi;
4256 bi = stack[stackidx].badi;
4257 score = stack[stackidx].score;
4258 }
4259
4260 // When the score goes over "limit" it may actually be much higher.
4261 // Return a very large number to avoid going below the limit when giving a
4262 // bonus.
4263 if (minscore > limit)
4264 return SCORE_MAXMAX;
4265 return minscore;
4266 }
4267
4268 /*
4269 * Multi-byte version of spell_edit_score_limit().
4270 * Keep it in sync with the above!
4271 */
4272 static int
4273 spell_edit_score_limit_w(
4274 slang_T *slang,
4275 char_u *badword,
4276 char_u *goodword,
4277 int limit)
4278 {
4279 limitscore_T stack[10]; // allow for over 3 * 2 edits
4280 int stackidx;
4281 int bi, gi;
4282 int bi2, gi2;
4283 int bc, gc;
4284 int score;
4285 int score_off;
4286 int minscore;
4287 int round;
4288 char_u *p;
4289 int wbadword[MAXWLEN];
4290 int wgoodword[MAXWLEN];
4291
4292 // Get the characters from the multi-byte strings and put them in an
4293 // int array for easy access.
4294 bi = 0;
4295 for (p = badword; *p != NUL; )
4296 wbadword[bi++] = mb_cptr2char_adv(&p);
4297 wbadword[bi++] = 0;
4298 gi = 0;
4299 for (p = goodword; *p != NUL; )
4300 wgoodword[gi++] = mb_cptr2char_adv(&p);
4301 wgoodword[gi++] = 0;
4302
4303 // The idea is to go from start to end over the words. So long as
4304 // characters are equal just continue, this always gives the lowest score.
4305 // When there is a difference try several alternatives. Each alternative
4306 // increases "score" for the edit distance. Some of the alternatives are
4307 // pushed unto a stack and tried later, some are tried right away. At the
4308 // end of the word the score for one alternative is known. The lowest
4309 // possible score is stored in "minscore".
4310 stackidx = 0;
4311 bi = 0;
4312 gi = 0;
4313 score = 0;
4314 minscore = limit + 1;
4315
4316 for (;;)
4317 {
4318 // Skip over an equal part, score remains the same.
4319 for (;;)
4320 {
4321 bc = wbadword[bi];
4322 gc = wgoodword[gi];
4323
4324 if (bc != gc) // stop at a char that's different
4325 break;
4326 if (bc == NUL) // both words end
4327 {
4328 if (score < minscore)
4329 minscore = score;
4330 goto pop; // do next alternative
4331 }
4332 ++bi;
4333 ++gi;
4334 }
4335
4336 if (gc == NUL) // goodword ends, delete badword chars
4337 {
4338 do
4339 {
4340 if ((score += SCORE_DEL) >= minscore)
4341 goto pop; // do next alternative
4342 } while (wbadword[++bi] != NUL);
4343 minscore = score;
4344 }
4345 else if (bc == NUL) // badword ends, insert badword chars
4346 {
4347 do
4348 {
4349 if ((score += SCORE_INS) >= minscore)
4350 goto pop; // do next alternative
4351 } while (wgoodword[++gi] != NUL);
4352 minscore = score;
4353 }
4354 else // both words continue
4355 {
4356 // If not close to the limit, perform a change. Only try changes
4357 // that may lead to a lower score than "minscore".
4358 // round 0: try deleting a char from badword
4359 // round 1: try inserting a char in badword
4360 for (round = 0; round <= 1; ++round)
4361 {
4362 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
4363 if (score_off < minscore)
4364 {
4365 if (score_off + SCORE_EDIT_MIN >= minscore)
4366 {
4367 // Near the limit, rest of the words must match. We
4368 // can check that right now, no need to push an item
4369 // onto the stack.
4370 bi2 = bi + 1 - round;
4371 gi2 = gi + round;
4372 while (wgoodword[gi2] == wbadword[bi2])
4373 {
4374 if (wgoodword[gi2] == NUL)
4375 {
4376 minscore = score_off;
4377 break;
4378 }
4379 ++bi2;
4380 ++gi2;
4381 }
4382 }
4383 else
4384 {
4385 // try deleting a character from badword later
4386 stack[stackidx].badi = bi + 1 - round;
4387 stack[stackidx].goodi = gi + round;
4388 stack[stackidx].score = score_off;
4389 ++stackidx;
4390 }
4391 }
4392 }
4393
4394 if (score + SCORE_SWAP < minscore)
4395 {
4396 // If swapping two characters makes a match then the
4397 // substitution is more expensive, thus there is no need to
4398 // try both.
4399 if (gc == wbadword[bi + 1] && bc == wgoodword[gi + 1])
4400 {
4401 // Swap two characters, that is: skip them.
4402 gi += 2;
4403 bi += 2;
4404 score += SCORE_SWAP;
4405 continue;
4406 }
4407 }
4408
4409 // Substitute one character for another which is the same
4410 // thing as deleting a character from both goodword and badword.
4411 // Use a better score when there is only a case difference.
4412 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
4413 score += SCORE_ICASE;
4414 else
4415 {
4416 // For a similar character use SCORE_SIMILAR.
4417 if (slang != NULL
4418 && slang->sl_has_map
4419 && similar_chars(slang, gc, bc))
4420 score += SCORE_SIMILAR;
4421 else
4422 score += SCORE_SUBST;
4423 }
4424
4425 if (score < minscore)
4426 {
4427 // Do the substitution.
4428 ++gi;
4429 ++bi;
4430 continue;
4431 }
4432 }
4433 pop:
4434 // Get here to try the next alternative, pop it from the stack.
4435 if (stackidx == 0) // stack is empty, finished
4436 break;
4437
4438 // pop an item from the stack
4439 --stackidx;
4440 gi = stack[stackidx].goodi;
4441 bi = stack[stackidx].badi;
4442 score = stack[stackidx].score;
4443 }
4444
4445 // When the score goes over "limit" it may actually be much higher.
4446 // Return a very large number to avoid going below the limit when giving a
4447 // bonus.
4448 if (minscore > limit)
4449 return SCORE_MAXMAX;
4450 return minscore;
4451 }
4452 #endif // FEAT_SPELL