comparison src/spell.c @ 324:548525d9da24

updated for version 7.0085
author vimboss
date Tue, 14 Jun 2005 22:01:04 +0000
parents 03b3684919e3
children f76b0d38b6bd
comparison
equal deleted inserted replaced
323:03b3684919e3 324:548525d9da24
11 * spell.c: code for spell checking 11 * spell.c: code for spell checking
12 * 12 *
13 * The spell checking mechanism uses a tree (aka trie). Each node in the tree 13 * The spell checking mechanism uses a tree (aka trie). Each node in the tree
14 * has a list of bytes that can appear (siblings). For each byte there is a 14 * has a list of bytes that can appear (siblings). For each byte there is a
15 * pointer to the node with the byte that follows in the word (child). 15 * pointer to the node with the byte that follows in the word (child).
16 * A NUL byte is used where the word may end. 16 *
17 * A NUL byte is used where the word may end. The bytes are sorted, so that
18 * binary searching can be used and the NUL bytes are at the start. The
19 * number of possible bytes is stored before the list of bytes.
20 *
21 * The tree uses two arrays: "byts" stores the characters, "idxs" stores
22 * either the next index or flags. The tree starts at index 0. For example,
23 * to lookup "vi" this sequence is followed:
24 * i = 0
25 * len = byts[i]
26 * n = where "v" appears in byts[i + 1] to byts[i + len]
27 * i = idxs[n]
28 * len = byts[i]
29 * n = where "i" appears in byts[i + 1] to byts[i + len]
30 * i = idxs[n]
31 * len = byts[i]
32 * find that byts[i + 1] is 0, idxs[i + 1] has flags for "vi".
17 * 33 *
18 * There are two trees: one with case-folded words and one with words in 34 * There are two trees: one with case-folded words and one with words in
19 * original case. The second one is only used for keep-case words and is 35 * original case. The second one is only used for keep-case words and is
20 * usually small. 36 * usually small.
21 * 37 *
28 * See ":help develop-spell". 44 * See ":help develop-spell".
29 */ 45 */
30 46
31 /* 47 /*
32 * Use this to let the score depend on how much a suggestion sounds like the 48 * Use this to let the score depend on how much a suggestion sounds like the
33 * bad word. It's quite slow and doesn't make the sorting much better.... 49 * bad word. It's quite slow and only occasionally makes the sorting better.
34 * #define SOUNDFOLD_SCORE 50 #define SOUNDFOLD_SCORE
51 */
52
53 /*
54 * Use this to adjust the score after finding suggestions, based on the
55 * suggested word sounding like the bad word. This is much faster than doing
56 * it for every possible suggestion.
57 * Disadvantage: When "the" is typed as "hte" it sounds different and goes
58 * down in the list.
59 #define RESCORE(word_score, sound_score) ((2 * word_score + sound_score) / 3)
35 */ 60 */
36 61
37 /* 62 /*
38 * Vim spell file format: <HEADER> <SUGGEST> <LWORDTREE> <KWORDTREE> 63 * Vim spell file format: <HEADER> <SUGGEST> <LWORDTREE> <KWORDTREE>
39 * 64 *
45 * <regionname> 2 bytes Region name: ca, au, etc. Lower case. 70 * <regionname> 2 bytes Region name: ca, au, etc. Lower case.
46 * First <regionname> is region 1. 71 * First <regionname> is region 1.
47 * 72 *
48 * <charflagslen> 1 byte Number of bytes in <charflags> (should be 128). 73 * <charflagslen> 1 byte Number of bytes in <charflags> (should be 128).
49 * <charflags> N bytes List of flags (first one is for character 128): 74 * <charflags> N bytes List of flags (first one is for character 128):
50 * 0x01 word character 75 * 0x01 word character CF_WORD
51 * 0x02 upper-case character 76 * 0x02 upper-case character CF_UPPER
52 * <fcharslen> 2 bytes Number of bytes in <fchars>. 77 * <fcharslen> 2 bytes Number of bytes in <fchars>.
53 * <fchars> N bytes Folded characters, first one is for character 128. 78 * <fchars> N bytes Folded characters, first one is for character 128.
54 * 79 *
55 * 80 *
56 * <SUGGEST> : <repcount> <rep> ... 81 * <SUGGEST> : <repcount> <rep> ...
143 168
144 #define MAXWLEN 250 /* Assume max. word len is this many bytes. 169 #define MAXWLEN 250 /* Assume max. word len is this many bytes.
145 Some places assume a word length fits in a 170 Some places assume a word length fits in a
146 byte, thus it can't be above 255. */ 171 byte, thus it can't be above 255. */
147 172
148 /* Flags used for a word. */ 173 /* Type used for indexes in the word tree needs to be at least 3 bytes. If int
174 * is 8 bytes we could use something smaller, but what? */
175 #if SIZEOF_INT > 2
176 typedef int idx_T;
177 #else
178 typedef long idx_T;
179 #endif
180
181 /* Flags used for a word. Only the lowest byte can be used, the region byte
182 * comes above it. */
149 #define WF_REGION 0x01 /* region byte follows */ 183 #define WF_REGION 0x01 /* region byte follows */
150 #define WF_ONECAP 0x02 /* word with one capital (or all capitals) */ 184 #define WF_ONECAP 0x02 /* word with one capital (or all capitals) */
151 #define WF_ALLCAP 0x04 /* word must be all capitals */ 185 #define WF_ALLCAP 0x04 /* word must be all capitals */
152 #define WF_RARE 0x08 /* rare word */ 186 #define WF_RARE 0x08 /* rare word */
153 #define WF_BANNED 0x10 /* bad word */ 187 #define WF_BANNED 0x10 /* bad word */
154 #define WF_KEEPCAP 0x80 /* keep-case word */ 188 #define WF_KEEPCAP 0x80 /* keep-case word */
155 189
156 #define WF_CAPMASK (WF_ONECAP | WF_ALLCAP | WF_KEEPCAP) 190 #define WF_CAPMASK (WF_ONECAP | WF_ALLCAP | WF_KEEPCAP)
191
192 #define WF_USED 0x10000 /* Word was found in text. Must be in separate
193 byte before region and flags. */
157 194
158 #define BY_NOFLAGS 0 /* end of word without flags or region */ 195 #define BY_NOFLAGS 0 /* end of word without flags or region */
159 #define BY_FLAGS 1 /* end of word, flag byte follows */ 196 #define BY_FLAGS 1 /* end of word, flag byte follows */
160 #define BY_INDEX 2 /* child is shared, index follows */ 197 #define BY_INDEX 2 /* child is shared, index follows */
161 #define BY_SPECIAL BY_INDEX /* highest special byte value */ 198 #define BY_SPECIAL BY_INDEX /* highest special byte value */
190 slang_T *sl_next; /* next language */ 227 slang_T *sl_next; /* next language */
191 char_u *sl_name; /* language name "en", "en.rare", "nl", etc. */ 228 char_u *sl_name; /* language name "en", "en.rare", "nl", etc. */
192 char_u *sl_fname; /* name of .spl file */ 229 char_u *sl_fname; /* name of .spl file */
193 int sl_add; /* TRUE if it's a .add file. */ 230 int sl_add; /* TRUE if it's a .add file. */
194 char_u *sl_fbyts; /* case-folded word bytes */ 231 char_u *sl_fbyts; /* case-folded word bytes */
195 int *sl_fidxs; /* case-folded word indexes */ 232 idx_T *sl_fidxs; /* case-folded word indexes */
196 char_u *sl_kbyts; /* keep-case word bytes */ 233 char_u *sl_kbyts; /* keep-case word bytes */
197 int *sl_kidxs; /* keep-case word indexes */ 234 idx_T *sl_kidxs; /* keep-case word indexes */
198 char_u sl_regions[17]; /* table with up to 8 region names plus NUL */ 235 char_u sl_regions[17]; /* table with up to 8 region names plus NUL */
199 236
200 garray_T sl_rep; /* list of fromto_T entries from REP lines */ 237 garray_T sl_rep; /* list of fromto_T entries from REP lines */
201 short sl_rep_first[256]; /* indexes where byte first appears, -1 if 238 short sl_rep_first[256]; /* indexes where byte first appears, -1 if
202 there is none */ 239 there is none */
265 typedef struct suggest_S 302 typedef struct suggest_S
266 { 303 {
267 char_u *st_word; /* suggested word, allocated string */ 304 char_u *st_word; /* suggested word, allocated string */
268 int st_orglen; /* length of replaced text */ 305 int st_orglen; /* length of replaced text */
269 int st_score; /* lower is better */ 306 int st_score; /* lower is better */
307 #ifdef RESCORE
308 int st_had_bonus; /* bonus already included in score */
309 #endif
270 } suggest_T; 310 } suggest_T;
271 311
272 #define SUG(sup, i) (((suggest_T *)(sup)->su_ga.ga_data)[i]) 312 #define SUG(sup, i) (((suggest_T *)(sup)->su_ga.ga_data)[i])
273 313
274 /* Number of suggestions displayed. */ 314 /* Number of suggestions displayed. */
275 #define SUG_PROMPT_COUNT ((int)Rows - 2) 315 #define SUG_PROMPT_COUNT ((int)Rows - 2)
276 316
277 /* Threshold for sorting and cleaning up suggestions. */ 317 /* Number of suggestions kept when cleaning up. When rescore_suggestions() is
278 #define SUG_CLEANUP_COUNT (SUG_PROMPT_COUNT + 50) 318 * called the score may change, thus we need to keep more than what is
319 * displayed. */
320 #define SUG_CLEAN_COUNT (SUG_PROMPT_COUNT < 25 ? 25 : SUG_PROMPT_COUNT)
321
322 /* Threshold for sorting and cleaning up suggestions. Don't want to keep lots
323 * of suggestions that are not going to be displayed. */
324 #define SUG_MAX_COUNT (SUG_PROMPT_COUNT + 50)
279 325
280 /* score for various changes */ 326 /* score for various changes */
281 #define SCORE_SPLIT 99 /* split bad word */ 327 #define SCORE_SPLIT 99 /* split bad word */
282 #define SCORE_ICASE 52 /* slightly different case */ 328 #define SCORE_ICASE 52 /* slightly different case */
283 #define SCORE_ALLCAP 120 /* need all-cap case */ 329 #define SCORE_ALLCAP 120 /* need all-cap case */
284 #define SCORE_REGION 70 /* word is for different region */ 330 #define SCORE_REGION 70 /* word is for different region */
285 #define SCORE_RARE 180 /* rare word */ 331 #define SCORE_RARE 180 /* rare word */
332 #define SCORE_NOTUSED 11 /* word not found in text yet */
286 333
287 /* score for edit distance */ 334 /* score for edit distance */
288 #define SCORE_SWAP 90 /* swap two characters */ 335 #define SCORE_SWAP 90 /* swap two characters */
289 #define SCORE_SWAP3 110 /* swap two characters in three */ 336 #define SCORE_SWAP3 110 /* swap two characters in three */
290 #define SCORE_REP 87 /* REP replacement */ 337 #define SCORE_REP 87 /* REP replacement */
291 #define SCORE_SUBST 93 /* substitute a character */ 338 #define SCORE_SUBST 93 /* substitute a character */
292 #define SCORE_SIMILAR 33 /* substitute a similar character */ 339 #define SCORE_SIMILAR 33 /* substitute a similar character */
293 #define SCORE_DEL 96 /* delete a character */ 340 #define SCORE_DEL 94 /* delete a character */
294 #define SCORE_INS 94 /* insert a character */ 341 #define SCORE_INS 96 /* insert a character */
295 342
296 #define SCORE_MAXINIT 350 /* Initial maximum score: higher == slower. 343 #define SCORE_MAXINIT 350 /* Initial maximum score: higher == slower.
297 * 350 allows for about three changes. */ 344 * 350 allows for about three changes. */
298 #define SCORE_MAXMAX 999999 /* accept any score */ 345 #define SCORE_MAXMAX 999999 /* accept any score */
299 346
327 typedef struct spelltab_S 374 typedef struct spelltab_S
328 { 375 {
329 char_u st_isw[256]; /* flags: is word char */ 376 char_u st_isw[256]; /* flags: is word char */
330 char_u st_isu[256]; /* flags: is uppercase char */ 377 char_u st_isu[256]; /* flags: is uppercase char */
331 char_u st_fold[256]; /* chars: folded case */ 378 char_u st_fold[256]; /* chars: folded case */
379 char_u st_upper[256]; /* chars: upper case */
332 } spelltab_T; 380 } spelltab_T;
333 381
334 static spelltab_T spelltab; 382 static spelltab_T spelltab;
335 static int did_set_spelltab; 383 static int did_set_spelltab;
336 384
337 #define SPELL_ISWORD 1 385 #define CF_WORD 0x01
338 #define SPELL_ISUPPER 2 386 #define CF_UPPER 0x02
339 387
340 static void clear_spell_chartab __ARGS((spelltab_T *sp)); 388 static void clear_spell_chartab __ARGS((spelltab_T *sp));
341 static int set_spell_finish __ARGS((spelltab_T *new_st)); 389 static int set_spell_finish __ARGS((spelltab_T *new_st));
342 390
343 /* 391 /*
362 int ts_score; /* score */ 410 int ts_score; /* score */
363 int ts_curi; /* index in list of child nodes */ 411 int ts_curi; /* index in list of child nodes */
364 int ts_fidx; /* index in fword[], case-folded bad word */ 412 int ts_fidx; /* index in fword[], case-folded bad word */
365 int ts_fidxtry; /* ts_fidx at which bytes may be changed */ 413 int ts_fidxtry; /* ts_fidx at which bytes may be changed */
366 int ts_twordlen; /* valid length of tword[] */ 414 int ts_twordlen; /* valid length of tword[] */
367 int ts_arridx; /* index in tree array, start of node */ 415 idx_T ts_arridx; /* index in tree array, start of node */
368 char_u ts_save_prewordlen; /* saved "prewordlen" */ 416 char_u ts_save_prewordlen; /* saved "prewordlen" */
369 int ts_save_splitoff; /* su_splitoff saved here */ 417 int ts_save_splitoff; /* su_splitoff saved here */
370 int ts_save_badflags; /* badflags saved here */ 418 int ts_save_badflags; /* badflags saved here */
371 } trystate_T; 419 } trystate_T;
372 420
377 static int spell_valid_case __ARGS((int origflags, int treeflags)); 425 static int spell_valid_case __ARGS((int origflags, int treeflags));
378 static void spell_load_lang __ARGS((char_u *lang)); 426 static void spell_load_lang __ARGS((char_u *lang));
379 static char_u *spell_enc __ARGS((void)); 427 static char_u *spell_enc __ARGS((void));
380 static void spell_load_cb __ARGS((char_u *fname, void *cookie)); 428 static void spell_load_cb __ARGS((char_u *fname, void *cookie));
381 static slang_T *spell_load_file __ARGS((char_u *fname, char_u *lang, slang_T *old_lp, int silent)); 429 static slang_T *spell_load_file __ARGS((char_u *fname, char_u *lang, slang_T *old_lp, int silent));
382 static int read_tree __ARGS((FILE *fd, char_u *byts, int *idxs, int maxidx, int startidx)); 430 static idx_T read_tree __ARGS((FILE *fd, char_u *byts, idx_T *idxs, int maxidx, int startidx));
383 static int find_region __ARGS((char_u *rp, char_u *region)); 431 static int find_region __ARGS((char_u *rp, char_u *region));
384 static int captype __ARGS((char_u *word, char_u *end)); 432 static int captype __ARGS((char_u *word, char_u *end));
385 static void spell_reload_one __ARGS((char_u *fname, int added_word)); 433 static void spell_reload_one __ARGS((char_u *fname, int added_word));
386 static int set_spell_charflags __ARGS((char_u *flags, int cnt, char_u *upp)); 434 static int set_spell_charflags __ARGS((char_u *flags, int cnt, char_u *upp));
387 static int set_spell_chartab __ARGS((char_u *fol, char_u *low, char_u *upp)); 435 static int set_spell_chartab __ARGS((char_u *fol, char_u *low, char_u *upp));
388 static void write_spell_chartab __ARGS((FILE *fd)); 436 static void write_spell_chartab __ARGS((FILE *fd));
389 static int spell_isupper __ARGS((int c));
390 static int spell_casefold __ARGS((char_u *p, int len, char_u *buf, int buflen)); 437 static int spell_casefold __ARGS((char_u *p, int len, char_u *buf, int buflen));
391 static void onecap_copy __ARGS((char_u *word, int len, char_u *wcopy, int upper)); 438 static void onecap_copy __ARGS((char_u *word, char_u *wcopy, int upper));
392 static void spell_try_change __ARGS((suginfo_T *su)); 439 static void spell_try_change __ARGS((suginfo_T *su));
393 static int try_deeper __ARGS((suginfo_T *su, trystate_T *stack, int depth, int score_add)); 440 static int try_deeper __ARGS((suginfo_T *su, trystate_T *stack, int depth, int score_add));
394 static void find_keepcap_word __ARGS((slang_T *slang, char_u *fword, char_u *kword)); 441 static void find_keepcap_word __ARGS((slang_T *slang, char_u *fword, char_u *kword));
395 static void spell_try_soundalike __ARGS((suginfo_T *su)); 442 static void spell_try_soundalike __ARGS((suginfo_T *su));
396 static void make_case_word __ARGS((char_u *fword, char_u *cword, int flags)); 443 static void make_case_word __ARGS((char_u *fword, char_u *cword, int flags));
444 #if 0
397 static int similar_chars __ARGS((slang_T *slang, int c1, int c2)); 445 static int similar_chars __ARGS((slang_T *slang, int c1, int c2));
446 #endif
447 #ifdef RESCORE
448 static void add_suggestion __ARGS((suginfo_T *su, char_u *goodword, int use_score, int had_bonus));
449 #else
398 static void add_suggestion __ARGS((suginfo_T *su, char_u *goodword, int use_score)); 450 static void add_suggestion __ARGS((suginfo_T *su, char_u *goodword, int use_score));
451 #endif
399 static void add_banned __ARGS((suginfo_T *su, char_u *word)); 452 static void add_banned __ARGS((suginfo_T *su, char_u *word));
400 static int was_banned __ARGS((suginfo_T *su, char_u *word)); 453 static int was_banned __ARGS((suginfo_T *su, char_u *word));
401 static void free_banned __ARGS((suginfo_T *su)); 454 static void free_banned __ARGS((suginfo_T *su));
402 static void cleanup_suggestions __ARGS((suginfo_T *su)); 455 #ifdef RESCORE
456 static void rescore_suggestions __ARGS((suginfo_T *su));
457 #endif
458 static void cleanup_suggestions __ARGS((suginfo_T *su, int keep));
403 static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, char_u *res)); 459 static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, char_u *res));
460 #if defined(RESCORE) || defined(SOUNDFOLD_SCORE)
461 static int spell_sound_score __ARGS((slang_T *slang, char_u *goodword, char_u *badsound));
462 #endif
404 static int spell_edit_score __ARGS((char_u *badword, char_u *goodword)); 463 static int spell_edit_score __ARGS((char_u *badword, char_u *goodword));
464
465 /*
466 * Use our own character-case definitions, because the current locale may
467 * differ from what the .spl file uses.
468 * These must not be called with negative number!
469 */
470 #ifndef FEAT_MBYTE
471 /* Non-multi-byte implementation. */
472 # define SPELL_TOFOLD(c) ((c) < 256 ? spelltab.st_fold[c] : (c))
473 # define SPELL_TOUPPER(c) ((c) < 256 ? spelltab.st_upper[c] : (c))
474 # define SPELL_ISUPPER(c) ((c) < 256 ? spelltab.st_isu[c] : FALSE)
475 #else
476 /* Multi-byte implementation. For Unicode we can call utf_*(), but don't do
477 * that for ASCII, because we don't want to use 'casemap' here. Otherwise use
478 * the "w" library function for characters above 255 if available. */
479 # ifdef HAVE_TOWLOWER
480 # define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
481 : (c) < 256 ? spelltab.st_fold[c] : towlower(c))
482 # else
483 # define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
484 : (c) < 256 ? spelltab.st_fold[c] : (c))
485 # endif
486
487 # ifdef HAVE_TOWUPPER
488 # define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
489 : (c) < 256 ? spelltab.st_upper[c] : towupper(c))
490 # else
491 # define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
492 : (c) < 256 ? spelltab.st_upper[c] : (c))
493 # endif
494
495 # ifdef HAVE_ISWUPPER
496 # define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
497 : (c) < 256 ? spelltab.st_isu[c] : iswupper(c))
498 # else
499 # define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
500 : (c) < 256 ? spelltab.st_isu[c] : (c))
501 # endif
502 #endif
405 503
406 504
407 static char *e_format = N_("E759: Format error in spell file"); 505 static char *e_format = N_("E759: Format error in spell file");
408 506
409 /* 507 /*
487 mi.mi_lp->lp_slang != NULL; ++mi.mi_lp) 585 mi.mi_lp->lp_slang != NULL; ++mi.mi_lp)
488 { 586 {
489 /* Check for a matching word in case-folded words. */ 587 /* Check for a matching word in case-folded words. */
490 find_word(&mi, FALSE); 588 find_word(&mi, FALSE);
491 589
590 /* Check for a matching word in keep-case words. */
492 find_word(&mi, TRUE); 591 find_word(&mi, TRUE);
493 } 592 }
494 593
495 if (mi.mi_result != SP_OK) 594 if (mi.mi_result != SP_OK)
496 { 595 {
526 static void 625 static void
527 find_word(mip, keepcap) 626 find_word(mip, keepcap)
528 matchinf_T *mip; 627 matchinf_T *mip;
529 int keepcap; 628 int keepcap;
530 { 629 {
531 int arridx = 0; 630 idx_T arridx = 0;
532 int endlen[MAXWLEN]; /* length at possible word endings */ 631 int endlen[MAXWLEN]; /* length at possible word endings */
533 int endidx[MAXWLEN]; /* possible word endings */ 632 idx_T endidx[MAXWLEN]; /* possible word endings */
534 int endidxcnt = 0; 633 int endidxcnt = 0;
535 int len; 634 int len;
536 int wlen = 0; 635 int wlen = 0;
537 int flen; 636 int flen;
538 int c; 637 int c;
539 char_u *ptr; 638 char_u *ptr;
540 unsigned lo, hi, m; 639 idx_T lo, hi, m;
541 #ifdef FEAT_MBYTE 640 #ifdef FEAT_MBYTE
542 char_u *s; 641 char_u *s;
543 #endif 642 #endif
544 char_u *p; 643 char_u *p;
545 int res = SP_BAD; 644 int res = SP_BAD;
546 int valid; 645 int valid;
547 slang_T *slang = mip->mi_lp->lp_slang; 646 slang_T *slang = mip->mi_lp->lp_slang;
548 unsigned flags; 647 unsigned flags;
549 char_u *byts; 648 char_u *byts;
550 int *idxs; 649 idx_T *idxs;
551 650
552 if (keepcap) 651 if (keepcap)
553 { 652 {
554 /* Check for word with matching case in keep-case tree. */ 653 /* Check for word with matching case in keep-case tree. */
555 ptr = mip->mi_word; 654 ptr = mip->mi_word;
690 /* Check flags and region. Repeat this if there are more 789 /* Check flags and region. Repeat this if there are more
691 * flags/region alternatives until there is a match. */ 790 * flags/region alternatives until there is a match. */
692 for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0; --len) 791 for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0; --len)
693 { 792 {
694 flags = idxs[arridx]; 793 flags = idxs[arridx];
794
795 /* Set a flag for words that were used. The region and case
796 * don't matter here, it's only used to rate the suggestions. */
797 idxs[arridx] = flags | WF_USED;
798
695 if (keepcap) 799 if (keepcap)
696 { 800 {
697 /* For "keepcap" tree the case is always right. */ 801 /* For "keepcap" tree the case is always right. */
698 valid = TRUE; 802 valid = TRUE;
699 } 803 }
821 len = spell_check(curwin, p, &attr); 925 len = spell_check(curwin, p, &attr);
822 926
823 if (attr != 0) 927 if (attr != 0)
824 { 928 {
825 /* We found a bad word. Check the attribute. */ 929 /* We found a bad word. Check the attribute. */
826 /* TODO: check for syntax @Spell cluster. */
827 if (allwords || attr == highlight_attr[HLF_SPB]) 930 if (allwords || attr == highlight_attr[HLF_SPB])
828 { 931 {
829 /* When searching forward only accept a bad word after 932 /* When searching forward only accept a bad word after
830 * the cursor. */ 933 * the cursor. */
831 if (dir == BACKWARD 934 if (dir == BACKWARD
1071 slang_T *lp = NULL; 1174 slang_T *lp = NULL;
1072 garray_T *gap; 1175 garray_T *gap;
1073 fromto_T *ftp; 1176 fromto_T *ftp;
1074 int rr; 1177 int rr;
1075 short *first; 1178 short *first;
1179 idx_T idx;
1076 1180
1077 fd = mch_fopen((char *)fname, "r"); 1181 fd = mch_fopen((char *)fname, "r");
1078 if (fd == NULL) 1182 if (fd == NULL)
1079 { 1183 {
1080 if (!silent) 1184 if (!silent)
1168 } 1272 }
1169 for (i = 0; i < ccnt; ++i) 1273 for (i = 0; i < ccnt; ++i)
1170 fol[i] = getc(fd); /* <fchars> */ 1274 fol[i] = getc(fd); /* <fchars> */
1171 fol[i] = NUL; 1275 fol[i] = NUL;
1172 1276
1173 /* Set the word-char flags and fill spell_isupper() table. */ 1277 /* Set the word-char flags and fill SPELL_ISUPPER() table. */
1174 i = set_spell_charflags(p, cnt, fol); 1278 i = set_spell_charflags(p, cnt, fol);
1175 vim_free(p); 1279 vim_free(p);
1176 vim_free(fol); 1280 vim_free(fol);
1177 if (i == FAIL) 1281 if (i == FAIL)
1178 goto formerr; 1282 goto formerr;
1291 /* Allocate the index array. */ 1395 /* Allocate the index array. */
1292 p = lalloc_clear((long_u)(len * sizeof(int)), TRUE); 1396 p = lalloc_clear((long_u)(len * sizeof(int)), TRUE);
1293 if (p == NULL) 1397 if (p == NULL)
1294 goto endFAIL; 1398 goto endFAIL;
1295 if (round == 1) 1399 if (round == 1)
1296 lp->sl_fidxs = (int *)p; 1400 lp->sl_fidxs = (idx_T *)p;
1297 else 1401 else
1298 lp->sl_kidxs = (int *)p; 1402 lp->sl_kidxs = (idx_T *)p;
1299 1403
1300 1404
1301 /* Read the tree and store it in the array. */ 1405 /* Read the tree and store it in the array. */
1302 i = read_tree(fd, 1406 idx = read_tree(fd,
1303 round == 1 ? lp->sl_fbyts : lp->sl_kbyts, 1407 round == 1 ? lp->sl_fbyts : lp->sl_kbyts,
1304 round == 1 ? lp->sl_fidxs : lp->sl_kidxs, 1408 round == 1 ? lp->sl_fidxs : lp->sl_kidxs,
1305 len, 0); 1409 len, 0);
1306 if (i == -1) 1410 if (idx == -1)
1307 goto truncerr; 1411 goto truncerr;
1308 if (i < 0) 1412 if (idx < 0)
1309 goto formerr; 1413 goto formerr;
1310 } 1414 }
1311 } 1415 }
1312 1416
1313 /* For a new file link it in the list of spell files. */ 1417 /* For a new file link it in the list of spell files. */
1346 * 1450 *
1347 * Returns the index following the siblings. 1451 * Returns the index following the siblings.
1348 * Returns -1 if the file is shorter than expected. 1452 * Returns -1 if the file is shorter than expected.
1349 * Returns -2 if there is a format error. 1453 * Returns -2 if there is a format error.
1350 */ 1454 */
1351 static int 1455 static idx_T
1352 read_tree(fd, byts, idxs, maxidx, startidx) 1456 read_tree(fd, byts, idxs, maxidx, startidx)
1353 FILE *fd; 1457 FILE *fd;
1354 char_u *byts; 1458 char_u *byts;
1355 int *idxs; 1459 idx_T *idxs;
1356 int maxidx; /* size of arrays */ 1460 int maxidx; /* size of arrays */
1357 int startidx; /* current index in "byts" and "idxs" */ 1461 idx_T startidx; /* current index in "byts" and "idxs" */
1358 { 1462 {
1359 int len; 1463 int len;
1360 int i; 1464 int i;
1361 int n; 1465 int n;
1362 int idx = startidx; 1466 idx_T idx = startidx;
1363 int c; 1467 int c;
1364 #define SHARED_MASK 0x8000000 1468 #define SHARED_MASK 0x8000000
1365 1469
1366 len = getc(fd); /* <siblingcount> */ 1470 len = getc(fd); /* <siblingcount> */
1367 if (len <= 0) 1471 if (len <= 0)
1617 if (has_mbyte) 1721 if (has_mbyte)
1618 c = mb_ptr2char_adv(&p); 1722 c = mb_ptr2char_adv(&p);
1619 else 1723 else
1620 #endif 1724 #endif
1621 c = *p++; 1725 c = *p++;
1622 firstcap = allcap = spell_isupper(c); 1726 firstcap = allcap = SPELL_ISUPPER(c);
1623 1727
1624 /* 1728 /*
1625 * Need to check all letters to find a word with mixed upper/lower. 1729 * Need to check all letters to find a word with mixed upper/lower.
1626 * But a word with an upper char only at start is a ONECAP. 1730 * But a word with an upper char only at start is a ONECAP.
1627 */ 1731 */
1631 #ifdef FEAT_MBYTE 1735 #ifdef FEAT_MBYTE
1632 c = mb_ptr2char(p); 1736 c = mb_ptr2char(p);
1633 #else 1737 #else
1634 c = *p; 1738 c = *p;
1635 #endif 1739 #endif
1636 if (!spell_isupper(c)) 1740 if (!SPELL_ISUPPER(c))
1637 { 1741 {
1638 /* UUl -> KEEPCAP */ 1742 /* UUl -> KEEPCAP */
1639 if (past_second && allcap) 1743 if (past_second && allcap)
1640 return WF_KEEPCAP; 1744 return WF_KEEPCAP;
1641 allcap = FALSE; 1745 allcap = FALSE;
1874 static char *e_affname = N_("Affix name too long in %s line %d: %s"); 1978 static char *e_affname = N_("Affix name too long in %s line %d: %s");
1875 int do_rep; 1979 int do_rep;
1876 int do_sal; 1980 int do_sal;
1877 int do_map; 1981 int do_map;
1878 int found_map = FALSE; 1982 int found_map = FALSE;
1983 hashitem_T *hi;
1879 1984
1880 /* 1985 /*
1881 * Open the file. 1986 * Open the file.
1882 */ 1987 */
1883 fd = mch_fopen((char *)fname, "r"); 1988 fd = mch_fopen((char *)fname, "r");
2029 if (*items[0] == 'P') 2134 if (*items[0] == 'P')
2030 tp = &aff->af_pref; 2135 tp = &aff->af_pref;
2031 else 2136 else
2032 tp = &aff->af_suff; 2137 tp = &aff->af_suff;
2033 aff_todo = atoi((char *)items[3]); 2138 aff_todo = atoi((char *)items[3]);
2034 if (!HASHITEM_EMPTY(hash_find(tp, cur_aff->ah_key))) 2139 hi = hash_find(tp, cur_aff->ah_key);
2140 if (!HASHITEM_EMPTY(hi))
2035 { 2141 {
2036 smsg((char_u *)_("Duplicate affix in %s line %d: %s"), 2142 smsg((char_u *)_("Duplicate affix in %s line %d: %s"),
2037 fname, lnum, items[1]); 2143 fname, lnum, items[1]);
2038 aff_todo = 0; 2144 aff_todo = 0;
2039 } 2145 }
2169 if (fol != NULL || low != NULL || upp != NULL) 2275 if (fol != NULL || low != NULL || upp != NULL)
2170 { 2276 {
2171 /* 2277 /*
2172 * Don't write a word table for an ASCII file, so that we don't check 2278 * Don't write a word table for an ASCII file, so that we don't check
2173 * for conflicts with a word table that matches 'encoding'. 2279 * for conflicts with a word table that matches 'encoding'.
2174 * Don't write one for utf-8 either, we use utf_isupper() and 2280 * Don't write one for utf-8 either, we use utf_*() and
2175 * mb_get_class(), the list of chars in the file will be incomplete. 2281 * mb_get_class(), the list of chars in the file will be incomplete.
2176 */ 2282 */
2177 if (!spin->si_ascii 2283 if (!spin->si_ascii
2178 #ifdef FEAT_MBYTE 2284 #ifdef FEAT_MBYTE
2179 && !enc_utf8 2285 && !enc_utf8
2334 verbose_leave(); 2440 verbose_leave();
2335 } 2441 }
2336 2442
2337 /* Read and ignore the first line: word count. */ 2443 /* Read and ignore the first line: word count. */
2338 (void)vim_fgets(line, MAXLINELEN, fd); 2444 (void)vim_fgets(line, MAXLINELEN, fd);
2339 if (!isdigit(*skipwhite(line))) 2445 if (!vim_isdigit(*skipwhite(line)))
2340 EMSG2(_("E760: No word count in %s"), fname); 2446 EMSG2(_("E760: No word count in %s"), fname);
2341 2447
2342 /* 2448 /*
2343 * Read all the lines in the file one by one. 2449 * Read all the lines in the file one by one.
2344 * The words are converted to 'encoding' here, before being added to 2450 * The words are converted to 'encoding' here, before being added to
2526 if (ae->ae_chop != NULL) 2632 if (ae->ae_chop != NULL)
2527 { 2633 {
2528 /* Skip chop string. */ 2634 /* Skip chop string. */
2529 #ifdef FEAT_MBYTE 2635 #ifdef FEAT_MBYTE
2530 if (has_mbyte) 2636 if (has_mbyte)
2637 {
2531 i = mb_charlen(ae->ae_chop); 2638 i = mb_charlen(ae->ae_chop);
2639 for ( ; i > 0; --i)
2640 mb_ptr_adv(p);
2641 }
2532 else 2642 else
2533 #endif 2643 #endif
2534 i = STRLEN(ae->ae_chop); 2644 p += STRLEN(ae->ae_chop);
2535 for ( ; i > 0; --i)
2536 mb_ptr_adv(p);
2537 } 2645 }
2538 STRCAT(newword, p); 2646 STRCAT(newword, p);
2539 } 2647 }
2540 else 2648 else
2541 { 2649 {
3752 */ 3860 */
3753 static void 3861 static void
3754 clear_spell_chartab(sp) 3862 clear_spell_chartab(sp)
3755 spelltab_T *sp; 3863 spelltab_T *sp;
3756 { 3864 {
3757 int i; 3865 int i;
3758 3866
3759 /* Init everything to FALSE. */ 3867 /* Init everything to FALSE. */
3760 vim_memset(sp->st_isw, FALSE, sizeof(sp->st_isw)); 3868 vim_memset(sp->st_isw, FALSE, sizeof(sp->st_isw));
3761 vim_memset(sp->st_isu, FALSE, sizeof(sp->st_isu)); 3869 vim_memset(sp->st_isu, FALSE, sizeof(sp->st_isu));
3762 for (i = 0; i < 256; ++i) 3870 for (i = 0; i < 256; ++i)
3871 {
3763 sp->st_fold[i] = i; 3872 sp->st_fold[i] = i;
3873 sp->st_upper[i] = i;
3874 }
3764 3875
3765 /* We include digits. A word shouldn't start with a digit, but handling 3876 /* We include digits. A word shouldn't start with a digit, but handling
3766 * that is done separately. */ 3877 * that is done separately. */
3767 for (i = '0'; i <= '9'; ++i) 3878 for (i = '0'; i <= '9'; ++i)
3768 sp->st_isw[i] = TRUE; 3879 sp->st_isw[i] = TRUE;
3771 sp->st_isw[i] = TRUE; 3882 sp->st_isw[i] = TRUE;
3772 sp->st_isu[i] = TRUE; 3883 sp->st_isu[i] = TRUE;
3773 sp->st_fold[i] = i + 0x20; 3884 sp->st_fold[i] = i + 0x20;
3774 } 3885 }
3775 for (i = 'a'; i <= 'z'; ++i) 3886 for (i = 'a'; i <= 'z'; ++i)
3887 {
3776 sp->st_isw[i] = TRUE; 3888 sp->st_isw[i] = TRUE;
3889 sp->st_upper[i] = i - 0x20;
3890 }
3777 } 3891 }
3778 3892
3779 /* 3893 /*
3780 * Init the chartab used for spelling. Only depends on 'encoding'. 3894 * Init the chartab used for spelling. Only depends on 'encoding'.
3781 * Called once while starting up and when 'encoding' changes. 3895 * Called once while starting up and when 'encoding' changes.
3797 /* DBCS: assume double-wide characters are word characters. */ 3911 /* DBCS: assume double-wide characters are word characters. */
3798 for (i = 128; i <= 255; ++i) 3912 for (i = 128; i <= 255; ++i)
3799 if (MB_BYTE2LEN(i) == 2) 3913 if (MB_BYTE2LEN(i) == 2)
3800 spelltab.st_isw[i] = TRUE; 3914 spelltab.st_isw[i] = TRUE;
3801 } 3915 }
3916 else if (enc_utf8)
3917 {
3918 for (i = 128; i < 256; ++i)
3919 {
3920 spelltab.st_isu[i] = utf_isupper(i);
3921 spelltab.st_isw[i] = spelltab.st_isu[i] || utf_islower(i);
3922 spelltab.st_fold[i] = utf_fold(i);
3923 spelltab.st_upper[i] = utf_toupper(i);
3924 }
3925 }
3802 else 3926 else
3803 #endif 3927 #endif
3804 { 3928 {
3805 /* Rough guess: use isalpha() and isupper() for characters above 128. */ 3929 /* Rough guess: use locale-dependent library functions. */
3806 for (i = 128; i < 256; ++i) 3930 for (i = 128; i < 256; ++i)
3807 { 3931 {
3808 spelltab.st_isw[i] = MB_ISUPPER(i) || MB_ISLOWER(i);
3809 if (MB_ISUPPER(i)) 3932 if (MB_ISUPPER(i))
3810 { 3933 {
3934 spelltab.st_isw[i] = TRUE;
3811 spelltab.st_isu[i] = TRUE; 3935 spelltab.st_isu[i] = TRUE;
3812 spelltab.st_fold[i] = MB_TOLOWER(i); 3936 spelltab.st_fold[i] = MB_TOLOWER(i);
3937 }
3938 else if (MB_ISLOWER(i))
3939 {
3940 spelltab.st_isw[i] = TRUE;
3941 spelltab.st_upper[i] = MB_TOUPPER(i);
3813 } 3942 }
3814 } 3943 }
3815 } 3944 }
3816 } 3945 }
3817 3946
3870 } 3999 }
3871 new_st.st_fold[l] = f; 4000 new_st.st_fold[l] = f;
3872 } 4001 }
3873 4002
3874 /* if "UPP" and "FOL" are not the same the "UPP" char needs 4003 /* if "UPP" and "FOL" are not the same the "UPP" char needs
3875 * case-folding and it's upper case. */ 4004 * case-folding, it's upper case and the "UPP" is the upper case of
4005 * "FOL" . */
3876 if (u < 256 && u != f) 4006 if (u < 256 && u != f)
3877 { 4007 {
3878 if (f >= 256) 4008 if (f >= 256)
3879 { 4009 {
3880 EMSG(_(e_affrange)); 4010 EMSG(_(e_affrange));
3881 return FAIL; 4011 return FAIL;
3882 } 4012 }
3883 new_st.st_fold[u] = f; 4013 new_st.st_fold[u] = f;
3884 new_st.st_isu[u] = TRUE; 4014 new_st.st_isu[u] = TRUE;
4015 new_st.st_upper[f] = u;
3885 } 4016 }
3886 } 4017 }
3887 4018
3888 if (*pl != NUL || *pu != NUL) 4019 if (*pl != NUL || *pu != NUL)
3889 { 4020 {
3906 /* We build the new tables here first, so that we can compare with the 4037 /* We build the new tables here first, so that we can compare with the
3907 * previous one. */ 4038 * previous one. */
3908 spelltab_T new_st; 4039 spelltab_T new_st;
3909 int i; 4040 int i;
3910 char_u *p = upp; 4041 char_u *p = upp;
4042 int c;
3911 4043
3912 clear_spell_chartab(&new_st); 4044 clear_spell_chartab(&new_st);
3913 4045
3914 for (i = 0; i < cnt; ++i) 4046 for (i = 0; i < cnt; ++i)
3915 { 4047 {
3916 new_st.st_isw[i + 128] = (flags[i] & SPELL_ISWORD) != 0; 4048 new_st.st_isw[i + 128] = (flags[i] & CF_WORD) != 0;
3917 new_st.st_isu[i + 128] = (flags[i] & SPELL_ISUPPER) != 0; 4049 new_st.st_isu[i + 128] = (flags[i] & CF_UPPER) != 0;
3918 4050
3919 if (*p == NUL) 4051 if (*p == NUL)
3920 return FAIL; 4052 return FAIL;
3921 #ifdef FEAT_MBYTE 4053 #ifdef FEAT_MBYTE
3922 new_st.st_fold[i + 128] = mb_ptr2char_adv(&p); 4054 c = mb_ptr2char_adv(&p);
3923 #else 4055 #else
3924 new_st.st_fold[i + 128] = *p++; 4056 c = *p++;
3925 #endif 4057 #endif
4058 new_st.st_fold[i + 128] = c;
4059 if (i + 128 != c && new_st.st_isu[i + 128] && c < 256)
4060 new_st.st_upper[c] = i + 128;
3926 } 4061 }
3927 4062
3928 return set_spell_finish(&new_st); 4063 return set_spell_finish(&new_st);
3929 } 4064 }
3930 4065
3939 /* check that it's the same table */ 4074 /* check that it's the same table */
3940 for (i = 0; i < 256; ++i) 4075 for (i = 0; i < 256; ++i)
3941 { 4076 {
3942 if (spelltab.st_isw[i] != new_st->st_isw[i] 4077 if (spelltab.st_isw[i] != new_st->st_isw[i]
3943 || spelltab.st_isu[i] != new_st->st_isu[i] 4078 || spelltab.st_isu[i] != new_st->st_isu[i]
3944 || spelltab.st_fold[i] != new_st->st_fold[i]) 4079 || spelltab.st_fold[i] != new_st->st_fold[i]
4080 || spelltab.st_upper[i] != new_st->st_upper[i])
3945 { 4081 {
3946 EMSG(_("E763: Word characters differ between spell files")); 4082 EMSG(_("E763: Word characters differ between spell files"));
3947 return FAIL; 4083 return FAIL;
3948 } 4084 }
3949 } 4085 }
3975 fputc(128, fd); /* <charflagslen> */ 4111 fputc(128, fd); /* <charflagslen> */
3976 for (i = 128; i < 256; ++i) 4112 for (i = 128; i < 256; ++i)
3977 { 4113 {
3978 flags = 0; 4114 flags = 0;
3979 if (spelltab.st_isw[i]) 4115 if (spelltab.st_isw[i])
3980 flags |= SPELL_ISWORD; 4116 flags |= CF_WORD;
3981 if (spelltab.st_isu[i]) 4117 if (spelltab.st_isu[i])
3982 flags |= SPELL_ISUPPER; 4118 flags |= CF_UPPER;
3983 fputc(flags, fd); /* <charflags> */ 4119 fputc(flags, fd); /* <charflags> */
3984 4120
3985 #ifdef FEAT_MBYTE 4121 #ifdef FEAT_MBYTE
3986 if (has_mbyte) 4122 if (has_mbyte)
3987 len += mb_char2bytes(spelltab.st_fold[i], charbuf + len); 4123 len += mb_char2bytes(spelltab.st_fold[i], charbuf + len);
3993 put_bytes(fd, (long_u)len, 2); /* <fcharlen> */ 4129 put_bytes(fd, (long_u)len, 2); /* <fcharlen> */
3994 fwrite(charbuf, (size_t)len, (size_t)1, fd); /* <fchars> */ 4130 fwrite(charbuf, (size_t)len, (size_t)1, fd); /* <fchars> */
3995 } 4131 }
3996 4132
3997 /* 4133 /*
3998 * Return TRUE if "c" is an upper-case character for spelling. 4134 * Case-fold "str[len]" into "buf[buflen]". The result is NUL terminated.
3999 */ 4135 * Uses the character definitions from the .spl file.
4000 static int
4001 spell_isupper(c)
4002 int c;
4003 {
4004 # ifdef FEAT_MBYTE
4005 if (enc_utf8)
4006 {
4007 /* For Unicode we can call utf_isupper(), but don't do that for ASCII,
4008 * because we don't want to use 'casemap' here. */
4009 if (c >= 128)
4010 return utf_isupper(c);
4011 }
4012 else if (has_mbyte && c > 256)
4013 {
4014 /* For characters above 255 we don't have something specfied.
4015 * Fall back to locale-dependent iswupper(). If not available
4016 * simply return FALSE. */
4017 # ifdef HAVE_ISWUPPER
4018 return iswupper(c);
4019 # else
4020 return FALSE;
4021 # endif
4022 }
4023 # endif
4024 return spelltab.st_isu[c];
4025 }
4026
4027 /*
4028 * Case-fold "p[len]" into "buf[buflen]". Used for spell checking.
4029 * When using a multi-byte 'encoding' the length may change! 4136 * When using a multi-byte 'encoding' the length may change!
4030 * Returns FAIL when something wrong. 4137 * Returns FAIL when something wrong.
4031 */ 4138 */
4032 static int 4139 static int
4033 spell_casefold(p, len, buf, buflen) 4140 spell_casefold(str, len, buf, buflen)
4034 char_u *p; 4141 char_u *str;
4035 int len; 4142 int len;
4036 char_u *buf; 4143 char_u *buf;
4037 int buflen; 4144 int buflen;
4038 { 4145 {
4039 int i; 4146 int i;
4045 } 4152 }
4046 4153
4047 #ifdef FEAT_MBYTE 4154 #ifdef FEAT_MBYTE
4048 if (has_mbyte) 4155 if (has_mbyte)
4049 { 4156 {
4157 int outi = 0;
4158 char_u *p;
4050 int c; 4159 int c;
4051 int outi = 0;
4052 4160
4053 /* Fold one character at a time. */ 4161 /* Fold one character at a time. */
4054 for (i = 0; i < len; i += mb_ptr2len_check(p + i)) 4162 for (p = str; p < str + len; )
4055 { 4163 {
4056 c = mb_ptr2char(p + i);
4057 if (enc_utf8)
4058 /* For Unicode case folding is always the same, no need to use
4059 * the table from the spell file. */
4060 c = utf_fold(c);
4061 else if (c < 256)
4062 /* Use the table from the spell file. */
4063 c = spelltab.st_fold[c];
4064 # ifdef HAVE_TOWLOWER
4065 else
4066 /* We don't know what to do, fall back to towlower(), it
4067 * depends on the current locale. */
4068 c = towlower(c);
4069 # endif
4070 if (outi + MB_MAXBYTES > buflen) 4164 if (outi + MB_MAXBYTES > buflen)
4071 { 4165 {
4072 buf[outi] = NUL; 4166 buf[outi] = NUL;
4073 return FAIL; 4167 return FAIL;
4074 } 4168 }
4075 outi += mb_char2bytes(c, buf + outi); 4169 c = mb_ptr2char_adv(&p);
4170 outi += mb_char2bytes(SPELL_TOFOLD(c), buf + outi);
4076 } 4171 }
4077 buf[outi] = NUL; 4172 buf[outi] = NUL;
4078 } 4173 }
4079 else 4174 else
4080 #endif 4175 #endif
4081 { 4176 {
4082 /* Be quick for non-multibyte encodings. */ 4177 /* Be quick for non-multibyte encodings. */
4083 for (i = 0; i < len; ++i) 4178 for (i = 0; i < len; ++i)
4084 buf[i] = spelltab.st_fold[p[i]]; 4179 buf[i] = spelltab.st_fold[str[i]];
4085 buf[i] = NUL; 4180 buf[i] = NUL;
4086 } 4181 }
4087 4182
4088 return OK; 4183 return OK;
4089 } 4184 }
4134 add_banned(&sug, sug.su_badword); 4229 add_banned(&sug, sug.su_badword);
4135 4230
4136 /* 4231 /*
4137 * 1. Try inserting/deleting/swapping/changing a letter, use REP entries 4232 * 1. Try inserting/deleting/swapping/changing a letter, use REP entries
4138 * from the .aff file and inserting a space (split the word). 4233 * from the .aff file and inserting a space (split the word).
4234 *
4235 * Set a maximum score to limit the combination of operations that is
4236 * tried.
4139 */ 4237 */
4140 /* Set a maximum score to limit the combination of operations that is
4141 * tried. */
4142 sug.su_maxscore = SCORE_MAXINIT; 4238 sug.su_maxscore = SCORE_MAXINIT;
4143 spell_try_change(&sug); 4239 spell_try_change(&sug);
4144 cleanup_suggestions(&sug);
4145 4240
4146 /* 4241 /*
4147 * 2. Try finding sound-a-like words. 4242 * 2. Try finding sound-a-like words.
4243 *
4244 * Only do this when we don't have a lot of suggestions yet, because it's
4245 * very slow and often doesn't find new suggestions.
4148 */ 4246 */
4149 /* Allow a higher score if we don't have many suggestions yet. */ 4247 if (sug.su_ga.ga_len < SUG_CLEAN_COUNT)
4150 if (sug.su_maxscore == SCORE_MAXINIT) 4248 {
4249 /* Allow a higher score now. */
4151 sug.su_maxscore = SCORE_MAXMAX; 4250 sug.su_maxscore = SCORE_MAXMAX;
4152 spell_try_soundalike(&sug); 4251 spell_try_soundalike(&sug);
4252 }
4153 4253
4154 /* When CTRL-C was hit while searching do show the results. */ 4254 /* When CTRL-C was hit while searching do show the results. */
4255 ui_breakcheck();
4155 if (got_int) 4256 if (got_int)
4156 { 4257 {
4157 (void)vgetc(); 4258 (void)vgetc();
4158 got_int = FALSE; 4259 got_int = FALSE;
4159 } 4260 }
4160 4261
4161 if (sug.su_ga.ga_len == 0) 4262 if (sug.su_ga.ga_len == 0)
4162 MSG(_("Sorry, no suggestions")); 4263 MSG(_("Sorry, no suggestions"));
4163 else 4264 else
4164 { 4265 {
4165 /* Cleanup, sort the suggestions and truncate at SUG_PROMPT_COUNT. */ 4266 #ifdef RESCORE
4166 cleanup_suggestions(&sug); 4267 /* Do slow but more accurate computation of the word score. */
4268 rescore_suggestions(&sug);
4269 #endif
4270
4271 /* Sort the suggestions and truncate at SUG_PROMPT_COUNT. */
4272 cleanup_suggestions(&sug, SUG_PROMPT_COUNT);
4167 4273
4168 /* List the suggestions. */ 4274 /* List the suggestions. */
4169 msg_start(); 4275 msg_start();
4170 vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"), 4276 vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"),
4171 sug.su_badlen, sug.su_badptr); 4277 sug.su_badlen, sug.su_badptr);
4182 STRCPY(wcopy, stp->st_word); 4288 STRCPY(wcopy, stp->st_word);
4183 if (sug.su_badlen > stp->st_orglen) 4289 if (sug.su_badlen > stp->st_orglen)
4184 vim_strncpy(wcopy + STRLEN(wcopy), 4290 vim_strncpy(wcopy + STRLEN(wcopy),
4185 sug.su_badptr + stp->st_orglen, 4291 sug.su_badptr + stp->st_orglen,
4186 sug.su_badlen - stp->st_orglen); 4292 sug.su_badlen - stp->st_orglen);
4187 /* TODO: remove score */ 4293 if (p_verbose > 0)
4188 vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\" (%d)"), 4294 vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\" (%d)"),
4189 i + 1, wcopy, stp->st_score); 4295 i + 1, wcopy, stp->st_score);
4296 else
4297 vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\""),
4298 i + 1, wcopy);
4190 msg_puts(IObuff); 4299 msg_puts(IObuff);
4191 lines_left = 3; /* avoid more prompt */ 4300 lines_left = 3; /* avoid more prompt */
4192 msg_putchar('\n'); 4301 msg_putchar('\n');
4193 } 4302 }
4194 4303
4222 /* Free the banned words. */ 4331 /* Free the banned words. */
4223 free_banned(&sug); 4332 free_banned(&sug);
4224 } 4333 }
4225 4334
4226 /* 4335 /*
4227 * Make a copy of "word[len]", with the first letter upper or lower cased, 4336 * Make a copy of "word", with the first letter upper or lower cased, to
4228 * to "wcopy[MAXWLEN]". 4337 * "wcopy[MAXWLEN]". "word" must not be empty.
4338 * The result is NUL terminated.
4229 */ 4339 */
4230 static void 4340 static void
4231 onecap_copy(word, len, wcopy, upper) 4341 onecap_copy(word, wcopy, upper)
4232 char_u *word; 4342 char_u *word;
4233 int len;
4234 char_u *wcopy; 4343 char_u *wcopy;
4235 int upper; /* TRUE: first letter made upper case */ 4344 int upper; /* TRUE: first letter made upper case */
4236 { 4345 {
4237 char_u *p; 4346 char_u *p;
4238 int c; 4347 int c;
4244 c = mb_ptr2char_adv(&p); 4353 c = mb_ptr2char_adv(&p);
4245 else 4354 else
4246 #endif 4355 #endif
4247 c = *p++; 4356 c = *p++;
4248 if (upper) 4357 if (upper)
4249 c = MB_TOUPPER(c); 4358 c = SPELL_TOUPPER(c);
4250 else 4359 else
4251 c = MB_TOLOWER(c); 4360 c = SPELL_TOFOLD(c);
4252 #ifdef FEAT_MBYTE 4361 #ifdef FEAT_MBYTE
4253 if (has_mbyte) 4362 if (has_mbyte)
4254 l = mb_char2bytes(c, wcopy); 4363 l = mb_char2bytes(c, wcopy);
4255 else 4364 else
4256 #endif 4365 #endif
4257 { 4366 {
4258 l = 1; 4367 l = 1;
4259 wcopy[0] = c; 4368 wcopy[0] = c;
4260 } 4369 }
4261 vim_strncpy(wcopy + l, p, len - (p - word)); 4370 vim_strncpy(wcopy + l, p, MAXWLEN - l);
4262 } 4371 }
4263 4372
4264 /* 4373 /*
4265 * Make a copy of "word[len]" with all the letters upper cased into 4374 * Make a copy of "word" with all the letters upper cased into
4266 * "wcopy[MAXWLEN]". 4375 * "wcopy[MAXWLEN]". The result is NUL terminated.
4267 */ 4376 */
4268 static void 4377 static void
4269 allcap_copy(word, wcopy) 4378 allcap_copy(word, wcopy)
4270 char_u *word; 4379 char_u *word;
4271 char_u *wcopy; 4380 char_u *wcopy;
4281 if (has_mbyte) 4390 if (has_mbyte)
4282 c = mb_ptr2char_adv(&s); 4391 c = mb_ptr2char_adv(&s);
4283 else 4392 else
4284 #endif 4393 #endif
4285 c = *s++; 4394 c = *s++;
4286 4395 c = SPELL_TOUPPER(c);
4287 c = MB_TOUPPER(c); /* TODO: use spell toupper */
4288 4396
4289 #ifdef FEAT_MBYTE 4397 #ifdef FEAT_MBYTE
4290 if (has_mbyte) 4398 if (has_mbyte)
4291 { 4399 {
4292 if (d - wcopy >= MAXWLEN - MB_MAXBYTES) 4400 if (d - wcopy >= MAXWLEN - MB_MAXBYTES)
4320 int splitoff = 0; /* index in tword after last split */ 4428 int splitoff = 0; /* index in tword after last split */
4321 trystate_T *sp; 4429 trystate_T *sp;
4322 int newscore; 4430 int newscore;
4323 langp_T *lp; 4431 langp_T *lp;
4324 char_u *byts; 4432 char_u *byts;
4325 int *idxs; 4433 idx_T *idxs;
4326 int depth; 4434 int depth;
4327 int c; 4435 int c;
4328 int n; 4436 int n;
4329 int flags; 4437 int flags;
4330 int badflags; 4438 int badflags;
4331 garray_T *gap; 4439 garray_T *gap;
4332 int arridx; 4440 idx_T arridx;
4333 int len; 4441 int len;
4334 char_u *p; 4442 char_u *p;
4335 fromto_T *ftp; 4443 fromto_T *ftp;
4336 int fl, tl; 4444 int fl, tl;
4337 4445
4415 /* 4523 /*
4416 * End of word in tree. 4524 * End of word in tree.
4417 */ 4525 */
4418 ++sp->ts_curi; /* eat one NUL byte */ 4526 ++sp->ts_curi; /* eat one NUL byte */
4419 4527
4420 flags = idxs[arridx]; 4528 flags = (int)idxs[arridx];
4421 4529
4422 /* 4530 /*
4423 * Form the word with proper case in preword. 4531 * Form the word with proper case in preword.
4424 * If there is a word from a previous split, append. 4532 * If there is a word from a previous split, append.
4425 */ 4533 */
4449 && (((unsigned)flags >> 8) & lp->lp_region) == 0) 4557 && (((unsigned)flags >> 8) & lp->lp_region) == 0)
4450 newscore += SCORE_REGION; 4558 newscore += SCORE_REGION;
4451 if (flags & WF_RARE) 4559 if (flags & WF_RARE)
4452 newscore += SCORE_RARE; 4560 newscore += SCORE_RARE;
4453 4561
4562 /* Words that were not found in the text get a penalty. */
4563 if ((flags & WF_USED) == 0)
4564 newscore += SCORE_NOTUSED;
4565
4454 if (!spell_valid_case(badflags, 4566 if (!spell_valid_case(badflags,
4455 captype(preword + prewordlen, NULL))) 4567 captype(preword + prewordlen, NULL)))
4456 newscore += SCORE_ICASE; 4568 newscore += SCORE_ICASE;
4457 4569
4458 if (fword[sp->ts_fidx] == 0) 4570 if (fword[sp->ts_fidx] == 0)
4459 { 4571 {
4460 /* The badword also ends: add suggestions, */ 4572 /* The badword also ends: add suggestions, */
4461 add_suggestion(su, preword, sp->ts_score + newscore); 4573 add_suggestion(su, preword, sp->ts_score + newscore
4574 #ifdef RESCORE
4575 , FALSE
4576 #endif
4577 );
4462 } 4578 }
4463 else if (sp->ts_fidx >= sp->ts_fidxtry) 4579 else if (sp->ts_fidx >= sp->ts_fidxtry)
4464 { 4580 {
4465 /* The word in the tree ends but the badword 4581 /* The word in the tree ends but the badword
4466 * continues: try inserting a space and check that a valid 4582 * continues: try inserting a space and check that a valid
4474 4590
4475 /* Append a space to preword. */ 4591 /* Append a space to preword. */
4476 STRCAT(preword, " "); 4592 STRCAT(preword, " ");
4477 prewordlen = STRLEN(preword); 4593 prewordlen = STRLEN(preword);
4478 splitoff = sp->ts_twordlen; 4594 splitoff = sp->ts_twordlen;
4479 /* TODO: when case-folding changed the number of bytes 4595 #ifdef FEAT_MBYTE
4480 * this doesn't work... */ 4596 if (has_mbyte)
4481 badflags = captype(su->su_badptr + sp->ts_fidx, 4597 {
4482 su->su_badptr + su->su_badlen); 4598 int i = 0;
4599
4600 /* Case-folding may change the number of bytes:
4601 * Count nr of chars in fword[sp->ts_fidx] and
4602 * advance that many chars in su->su_badptr. */
4603 for (p = fword; p < fword + sp->ts_fidx;
4604 mb_ptr_adv(p))
4605 ++i;
4606 for (p = su->su_badptr; i > 0; mb_ptr_adv(p))
4607 --i;
4608 }
4609 else
4610 #endif
4611 p = su->su_badptr + sp->ts_fidx;
4612 badflags = captype(p, su->su_badptr + su->su_badlen);
4483 4613
4484 sp->ts_state = STATE_SPLITUNDO; 4614 sp->ts_state = STATE_SPLITUNDO;
4485 ++depth; 4615 ++depth;
4486 /* Restart at top of the tree. */ 4616 /* Restart at top of the tree. */
4487 stack[depth].ts_arridx = 0; 4617 stack[depth].ts_arridx = 0;
4533 /* Normal byte, go one level deeper. If it's not equal to 4663 /* Normal byte, go one level deeper. If it's not equal to
4534 * the byte in the bad word adjust the score. But don't 4664 * the byte in the bad word adjust the score. But don't
4535 * even try when the byte was already changed. */ 4665 * even try when the byte was already changed. */
4536 if (c == fword[sp->ts_fidx]) 4666 if (c == fword[sp->ts_fidx])
4537 newscore = 0; 4667 newscore = 0;
4538 /* TODO: multi-byte characters */ 4668
4669 /* TODO: this is too slow and comparing bytes isn't right
4670 * for multi-byte characters. */
4671 #if 0
4539 else if (lp->lp_slang->sl_map != NULL 4672 else if (lp->lp_slang->sl_map != NULL
4540 && similar_chars(lp->lp_slang, 4673 && similar_chars(lp->lp_slang,
4541 c, fword[sp->ts_fidx])) 4674 c, fword[sp->ts_fidx]))
4542 newscore = SCORE_SIMILAR; 4675 newscore = SCORE_SIMILAR;
4676 #endif
4543 else 4677 else
4544 newscore = SCORE_SUBST; 4678 newscore = SCORE_SUBST;
4545 if ((newscore == 0 || sp->ts_fidx >= sp->ts_fidxtry) 4679 if ((newscore == 0 || sp->ts_fidx >= sp->ts_fidxtry)
4546 && try_deeper(su, stack, depth, newscore)) 4680 && try_deeper(su, stack, depth, newscore))
4547 { 4681 {
4816 char_u *fword; 4950 char_u *fword;
4817 char_u *kword; 4951 char_u *kword;
4818 { 4952 {
4819 char_u uword[MAXWLEN]; /* "fword" in upper-case */ 4953 char_u uword[MAXWLEN]; /* "fword" in upper-case */
4820 int depth; 4954 int depth;
4821 int tryidx; 4955 idx_T tryidx;
4822 4956
4823 /* The following arrays are used at each depth in the tree. */ 4957 /* The following arrays are used at each depth in the tree. */
4824 int arridx[MAXWLEN]; 4958 idx_T arridx[MAXWLEN];
4825 int round[MAXWLEN]; 4959 int round[MAXWLEN];
4826 int fwordidx[MAXWLEN]; 4960 int fwordidx[MAXWLEN];
4827 int uwordidx[MAXWLEN]; 4961 int uwordidx[MAXWLEN];
4828 int kwordlen[MAXWLEN]; 4962 int kwordlen[MAXWLEN];
4829 4963
4830 int flen, ulen; 4964 int flen, ulen;
4831 int l; 4965 int l;
4832 int len; 4966 int len;
4833 int c; 4967 int c;
4834 unsigned lo, hi, m; 4968 idx_T lo, hi, m;
4835 char_u *p; 4969 char_u *p;
4836 char_u *byts = slang->sl_kbyts; /* array with bytes of the words */ 4970 char_u *byts = slang->sl_kbyts; /* array with bytes of the words */
4837 int *idxs = slang->sl_kidxs; /* array with indexes */ 4971 idx_T *idxs = slang->sl_kidxs; /* array with indexes */
4838 4972
4839 if (byts == NULL) 4973 if (byts == NULL)
4840 { 4974 {
4841 /* array is empty: "cannot happen" */ 4975 /* array is empty: "cannot happen" */
4842 *kword = NUL; 4976 *kword = NUL;
4974 { 5108 {
4975 char_u salword[MAXWLEN]; 5109 char_u salword[MAXWLEN];
4976 char_u tword[MAXWLEN]; 5110 char_u tword[MAXWLEN];
4977 char_u tfword[MAXWLEN]; 5111 char_u tfword[MAXWLEN];
4978 char_u tsalword[MAXWLEN]; 5112 char_u tsalword[MAXWLEN];
4979 int arridx[MAXWLEN]; 5113 idx_T arridx[MAXWLEN];
4980 int curi[MAXWLEN]; 5114 int curi[MAXWLEN];
4981 langp_T *lp; 5115 langp_T *lp;
4982 char_u *byts; 5116 char_u *byts;
4983 int *idxs; 5117 idx_T *idxs;
4984 int depth; 5118 int depth;
4985 int c; 5119 int c;
4986 int n; 5120 idx_T n;
4987 int round; 5121 int round;
4988 int flags; 5122 int flags;
5123 int score, sound_score;
5124 char_u *bp, *sp;
4989 5125
4990 for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0); 5126 for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0);
4991 lp->lp_slang != NULL; ++lp) 5127 lp->lp_slang != NULL; ++lp)
4992 { 5128 {
4993 if (lp->lp_slang->sl_sal.ga_len > 0) 5129 if (lp->lp_slang->sl_sal.ga_len > 0)
5028 ++curi[depth]; 5164 ++curi[depth];
5029 c = byts[n]; 5165 c = byts[n];
5030 if (c == 0) 5166 if (c == 0)
5031 { 5167 {
5032 /* End of word, deal with the word. */ 5168 /* End of word, deal with the word. */
5033 flags = idxs[n]; 5169 flags = (int)idxs[n];
5034 if (round == 2 || (flags & WF_KEEPCAP) == 0) 5170 if (round == 2 || (flags & WF_KEEPCAP) == 0)
5035 { 5171 {
5036 tword[depth] = NUL; 5172 tword[depth] = NUL;
5037 if (round == 1) 5173 if (round == 1)
5038 spell_soundfold(lp->lp_slang, 5174 spell_soundfold(lp->lp_slang,
5045 tfword, MAXWLEN); 5181 tfword, MAXWLEN);
5046 spell_soundfold(lp->lp_slang, 5182 spell_soundfold(lp->lp_slang,
5047 tfword, tsalword); 5183 tfword, tsalword);
5048 } 5184 }
5049 5185
5050 /* TODO: also compare with small changes 5186 /*
5051 * (insert char, swap char, etc.) */ 5187 * Accept the word if the sound-folded words
5052 if (STRCMP(salword, tsalword) == 0) 5188 * are (almost) equal.
5189 */
5190 for (bp = salword, sp = tsalword; *bp == *sp;
5191 ++bp, ++sp)
5192 if (*bp == NUL)
5193 break;
5194
5195 if (*bp == *sp)
5196 /* equal */
5197 sound_score = 0;
5198 else if (*bp != NUL && bp[1] != NUL
5199 && *bp == sp[1] && bp[1] == *sp
5200 && STRCMP(bp + 2, sp + 2) == 0)
5201 /* swap two bytes */
5202 sound_score = SCORE_SWAP;
5203 else if (STRCMP(bp + 1, sp) == 0)
5204 /* delete byte */
5205 sound_score = SCORE_DEL;
5206 else if (STRCMP(bp, sp + 1) == 0)
5207 /* insert byte */
5208 sound_score = SCORE_INS;
5209 else if (STRCMP(bp + 1, sp + 1) == 0)
5210 /* skip one byte */
5211 sound_score = SCORE_SUBST;
5212 else
5213 /* not equal or similar */
5214 sound_score = SCORE_MAXMAX;
5215
5216 if (sound_score < SCORE_MAXMAX)
5053 { 5217 {
5218 char_u cword[MAXWLEN];
5219 char_u *p;
5220
5054 if (round == 1 && flags != 0) 5221 if (round == 1 && flags != 0)
5055 { 5222 {
5056 char_u cword[MAXWLEN]; 5223 /* Need to fix case according to
5057 5224 * "flags". */
5058 make_case_word(tword, cword, flags); 5225 make_case_word(tword, cword, flags);
5059 add_suggestion(su, cword, 0); 5226 p = cword;
5060 } 5227 }
5061 else 5228 else
5062 add_suggestion(su, tword, 0); 5229 p = tword;
5230
5231 /* Compute the score. */
5232 score = spell_edit_score(su->su_badword, p);
5233 #ifdef RESCORE
5234 /* give a bonus for the good word sounding
5235 * the same as the bad word */
5236 add_suggestion(su, tword,
5237 RESCORE(score, sound_score),
5238 TRUE);
5239 #else
5240 add_suggestion(su, tword,
5241 score + sound_score);
5242 #endif
5063 } 5243 }
5064 } 5244 }
5065 5245
5066 /* Skip over other NUL bytes. */ 5246 /* Skip over other NUL bytes. */
5067 while (byts[n + 1] == 0) 5247 while (byts[n + 1] == 0)
5076 tword[depth++] = c; 5256 tword[depth++] = c;
5077 arridx[depth] = idxs[n]; 5257 arridx[depth] = idxs[n];
5078 curi[depth] = 1; 5258 curi[depth] = 1;
5079 } 5259 }
5080 } 5260 }
5261
5262 line_breakcheck();
5081 } 5263 }
5082 line_breakcheck(); 5264 }
5083 } 5265 }
5084 } 5266 }
5085 } 5267 }
5086 } 5268
5087 5269 /*
5088 /* 5270 * Copy "fword" to "cword", fixing case according to "flags".
5089 * Copy "fword" to "cword", fixing according to "flags".
5090 */ 5271 */
5091 static void 5272 static void
5092 make_case_word(fword, cword, flags) 5273 make_case_word(fword, cword, flags)
5093 char_u *fword; 5274 char_u *fword;
5094 char_u *cword; 5275 char_u *cword;
5097 if (flags & WF_ALLCAP) 5278 if (flags & WF_ALLCAP)
5098 /* Make it all upper-case */ 5279 /* Make it all upper-case */
5099 allcap_copy(fword, cword); 5280 allcap_copy(fword, cword);
5100 else if (flags & WF_ONECAP) 5281 else if (flags & WF_ONECAP)
5101 /* Make the first letter upper-case */ 5282 /* Make the first letter upper-case */
5102 onecap_copy(fword, STRLEN(fword), cword, TRUE); 5283 onecap_copy(fword, cword, TRUE);
5103 else 5284 else
5104 /* Use goodword as-is. */ 5285 /* Use goodword as-is. */
5105 STRCPY(cword, fword); 5286 STRCPY(cword, fword);
5106 } 5287 }
5107 5288
5289 #if 0
5108 /* 5290 /*
5109 * Return TRUE if "c1" and "c2" are similar characters according to the MAP 5291 * Return TRUE if "c1" and "c2" are similar characters according to the MAP
5110 * lines in the .aff file. 5292 * lines in the .aff file.
5111 */ 5293 */
5112 static int 5294 static int
5127 p2 = vim_strchr(slang->sl_map, c2); 5309 p2 = vim_strchr(slang->sl_map, c2);
5128 if (p2 == NULL) 5310 if (p2 == NULL)
5129 return FALSE; 5311 return FALSE;
5130 return vim_strchr(p1, '/') == vim_strchr(p2, '/'); 5312 return vim_strchr(p1, '/') == vim_strchr(p2, '/');
5131 } 5313 }
5314 #endif
5132 5315
5133 /* 5316 /*
5134 * Add a suggestion to the list of suggestions. 5317 * Add a suggestion to the list of suggestions.
5135 * Do not add a duplicate suggestion or suggestions with a bad score. 5318 * Do not add a duplicate suggestion or suggestions with a bad score.
5136 * When "use_score" is not zero it's used, otherwise the score is computed 5319 * When "use_score" is not zero it's used, otherwise the score is computed
5137 * with spell_edit_score(). 5320 * with spell_edit_score().
5138 */ 5321 */
5139 static void 5322 static void
5140 add_suggestion(su, goodword, use_score) 5323 add_suggestion(su, goodword, score
5324 #ifdef RESCORE
5325 , had_bonus
5326 #endif
5327 )
5141 suginfo_T *su; 5328 suginfo_T *su;
5142 char_u *goodword; 5329 char_u *goodword;
5143 int use_score; 5330 int score;
5331 #ifdef RESCORE
5332 int had_bonus; /* set st_had_bonus */
5333 #endif
5144 { 5334 {
5145 suggest_T *stp; 5335 suggest_T *stp;
5146 int score;
5147 int i; 5336 int i;
5148 #ifdef SOUNDFOLD_SCORE 5337 #ifdef SOUNDFOLD_SCORE
5149 char_u fword[MAXWLEN]; 5338 char_u fword[MAXWLEN];
5150 char_u salword[MAXWLEN]; 5339 char_u salword[MAXWLEN];
5151 #endif 5340 #endif
5152 5341
5153 /* Check that the word wasn't banned. */ 5342 /* Check that the word wasn't banned. */
5154 if (was_banned(su, goodword)) 5343 if (was_banned(su, goodword))
5155 return; 5344 return;
5156 5345
5157 /* Compute the score and add the suggestion if it's good enough. */
5158 if (use_score != 0)
5159 score = use_score;
5160 else
5161 score = spell_edit_score(su->su_badword, goodword);
5162
5163 if (score <= su->su_maxscore) 5346 if (score <= su->su_maxscore)
5164 { 5347 {
5165 #ifdef SOUNDFOLD_SCORE 5348 #ifdef SOUNDFOLD_SCORE
5166 /* Add to the score when the word sounds differently. 5349 /* Add to the score when the word sounds differently.
5167 * This is slow... */ 5350 * This is slow... */
5168 if (su->su_slang->sl_sal.ga_len > 0) 5351 if (su->su_slang->sl_sal.ga_len > 0)
5169 { 5352 score += spell_sound_score(su->su_slang, fword, su->su_salword);
5170 (void)spell_casefold(goodword, STRLEN(goodword), fword, MAXWLEN);
5171 spell_soundfold(su->su_slang, fword, salword);
5172 score += spell_edit_score(su->su_salword, salword);
5173 }
5174 #endif 5353 #endif
5175 5354
5176 /* Check if the word is already there. */ 5355 /* Check if the word is already there. */
5177 stp = &SUG(su, 0); 5356 stp = &SUG(su, 0);
5178 for (i = su->su_ga.ga_len - 1; i >= 0; --i) 5357 for (i = su->su_ga.ga_len - 1; i >= 0; --i)
5179 if (STRCMP(stp[i].st_word, goodword) == 0) 5358 if (STRCMP(stp[i].st_word, goodword) == 0)
5180 { 5359 {
5181 /* Found it. Remember the lowest score. */ 5360 /* Found it. Remember the lowest score. */
5182 if (stp[i].st_score > score) 5361 if (stp[i].st_score > score)
5362 {
5183 stp[i].st_score = score; 5363 stp[i].st_score = score;
5364 #ifdef RESCORE
5365 stp[i].st_had_bonus = had_bonus;
5366 #endif
5367 }
5184 break; 5368 break;
5185 } 5369 }
5186 5370
5187 if (i < 0 && ga_grow(&su->su_ga, 1) == OK) 5371 if (i < 0 && ga_grow(&su->su_ga, 1) == OK)
5188 { 5372 {
5190 stp = &SUG(su, su->su_ga.ga_len); 5374 stp = &SUG(su, su->su_ga.ga_len);
5191 stp->st_word = vim_strsave(goodword); 5375 stp->st_word = vim_strsave(goodword);
5192 if (stp->st_word != NULL) 5376 if (stp->st_word != NULL)
5193 { 5377 {
5194 stp->st_score = score; 5378 stp->st_score = score;
5379 #ifdef RESCORE
5380 stp->st_had_bonus = had_bonus;
5381 #endif
5195 stp->st_orglen = su->su_badlen; 5382 stp->st_orglen = su->su_badlen;
5196 ++su->su_ga.ga_len; 5383 ++su->su_ga.ga_len;
5197 5384
5198 /* If we have too many suggestions now, sort the list and keep 5385 /* If we have too many suggestions now, sort the list and keep
5199 * the best suggestions. */ 5386 * the best suggestions. */
5200 if (su->su_ga.ga_len > SUG_CLEANUP_COUNT) 5387 if (su->su_ga.ga_len > SUG_MAX_COUNT)
5201 cleanup_suggestions(su); 5388 cleanup_suggestions(su, SUG_CLEAN_COUNT);
5202 } 5389 }
5203 } 5390 }
5204 } 5391 }
5205 } 5392 }
5206 5393
5231 static int 5418 static int
5232 was_banned(su, word) 5419 was_banned(su, word)
5233 suginfo_T *su; 5420 suginfo_T *su;
5234 char_u *word; 5421 char_u *word;
5235 { 5422 {
5236 return !HASHITEM_EMPTY(hash_find(&su->su_banned, word)); 5423 hashitem_T *hi = hash_find(&su->su_banned, word);
5424
5425 return !HASHITEM_EMPTY(hi);
5237 } 5426 }
5238 5427
5239 /* 5428 /*
5240 * Free the banned words in "su". 5429 * Free the banned words in "su".
5241 */ 5430 */
5256 } 5445 }
5257 } 5446 }
5258 hash_clear(&su->su_banned); 5447 hash_clear(&su->su_banned);
5259 } 5448 }
5260 5449
5450 #ifdef RESCORE
5451 /*
5452 * Recompute the score if sound-folding is possible. This is slow,
5453 * thus only done for the final results.
5454 */
5455 static void
5456 rescore_suggestions(su)
5457 suginfo_T *su;
5458 {
5459 langp_T *lp;
5460 suggest_T *stp;
5461 char_u sal_badword[MAXWLEN];
5462 int score;
5463 int i;
5464
5465 for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0);
5466 lp->lp_slang != NULL; ++lp)
5467 {
5468 if (lp->lp_slang->sl_sal.ga_len > 0)
5469 {
5470 /* soundfold the bad word */
5471 spell_soundfold(lp->lp_slang, su->su_fbadword, sal_badword);
5472
5473 for (i = 0; i < su->su_ga.ga_len; ++i)
5474 {
5475 stp = &SUG(su, i);
5476 if (!stp->st_had_bonus)
5477 {
5478 score = spell_sound_score(lp->lp_slang, stp->st_word,
5479 sal_badword);
5480 stp->st_score = RESCORE(stp->st_score, score);
5481 }
5482 }
5483 break;
5484 }
5485 }
5486 }
5487 #endif
5488
5261 static int 5489 static int
5262 #ifdef __BORLANDC__ 5490 #ifdef __BORLANDC__
5263 _RTLENTRYF 5491 _RTLENTRYF
5264 #endif 5492 #endif
5265 sug_compare __ARGS((const void *s1, const void *s2)); 5493 sug_compare __ARGS((const void *s1, const void *s2));
5285 * Cleanup the suggestions: 5513 * Cleanup the suggestions:
5286 * - Sort on score. 5514 * - Sort on score.
5287 * - Remove words that won't be displayed. 5515 * - Remove words that won't be displayed.
5288 */ 5516 */
5289 static void 5517 static void
5290 cleanup_suggestions(su) 5518 cleanup_suggestions(su, keep)
5291 suginfo_T *su; 5519 suginfo_T *su;
5520 int keep; /* nr of suggestions to keep */
5292 { 5521 {
5293 suggest_T *stp = &SUG(su, 0); 5522 suggest_T *stp = &SUG(su, 0);
5294 int i; 5523 int i;
5295 5524
5296 /* Sort the list. */ 5525 /* Sort the list. */
5297 qsort(su->su_ga.ga_data, (size_t)su->su_ga.ga_len, 5526 qsort(su->su_ga.ga_data, (size_t)su->su_ga.ga_len,
5298 sizeof(suggest_T), sug_compare); 5527 sizeof(suggest_T), sug_compare);
5299 5528
5300 /* Truncate the list to the number of suggestions that will be displayed. */ 5529 /* Truncate the list to the number of suggestions that will be displayed. */
5301 if (su->su_ga.ga_len > SUG_PROMPT_COUNT) 5530 if (su->su_ga.ga_len > keep)
5302 { 5531 {
5303 for (i = SUG_PROMPT_COUNT; i < su->su_ga.ga_len; ++i) 5532 for (i = keep; i < su->su_ga.ga_len; ++i)
5304 vim_free(stp[i].st_word); 5533 vim_free(stp[i].st_word);
5305 su->su_ga.ga_len = SUG_PROMPT_COUNT; 5534 su->su_ga.ga_len = keep;
5306 su->su_maxscore = stp[SUG_PROMPT_COUNT - 1].st_score; 5535 su->su_maxscore = stp[keep - 1].st_score;
5307 } 5536 }
5308 } 5537 }
5309 5538
5310 /* 5539 /*
5311 * Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]". 5540 * Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]".
5318 { 5547 {
5319 fromto_T *ftp; 5548 fromto_T *ftp;
5320 char_u word[MAXWLEN]; 5549 char_u word[MAXWLEN];
5321 #ifdef FEAT_MBYTE 5550 #ifdef FEAT_MBYTE
5322 int l; 5551 int l;
5552 int found_mbyte = FALSE;
5323 #endif 5553 #endif
5324 char_u *s; 5554 char_u *s;
5325 char_u *t; 5555 char_u *t;
5326 int i, j, z; 5556 int i, j, z;
5327 int n, k = 0; 5557 int n, k = 0;
5331 int c; 5561 int c;
5332 int pri; 5562 int pri;
5333 int p0 = -333; 5563 int p0 = -333;
5334 int c0; 5564 int c0;
5335 5565
5336 /* Remove accents, if wanted. 5566 /* Remove accents, if wanted. We actually remove all non-word characters.
5337 * We actually remove all non-word characters. */ 5567 * But keep white space. */
5338 if (slang->sl_rem_accents) 5568 if (slang->sl_rem_accents)
5339 { 5569 {
5340 t = word; 5570 t = word;
5341 for (s = inword; *s != NUL; ) 5571 for (s = inword; *s != NUL; )
5342 { 5572 {
5573 if (vim_iswhite(*s))
5574 *t++ = *s++;
5343 #ifdef FEAT_MBYTE 5575 #ifdef FEAT_MBYTE
5344 if (has_mbyte) 5576 else if (has_mbyte)
5345 { 5577 {
5346 l = mb_ptr2len_check(s); 5578 l = mb_ptr2len_check(s);
5347 if (SPELL_ISWORDP(s)) 5579 if (SPELL_ISWORDP(s))
5348 { 5580 {
5349 mch_memmove(t, s, l); 5581 mch_memmove(t, s, l);
5350 t += l; 5582 t += l;
5583 if (l > 1)
5584 found_mbyte = TRUE;
5351 } 5585 }
5352 s += l; 5586 s += l;
5353 } 5587 }
5588 #endif
5354 else 5589 else
5355 #endif
5356 { 5590 {
5357 if (SPELL_ISWORDP(s)) 5591 if (SPELL_ISWORDP(s))
5358 *t++ = *s; 5592 *t++ = *s;
5359 ++s; 5593 ++s;
5360 } 5594 }
5361 } 5595 }
5362 *t = NUL; 5596 *t = NUL;
5363 } 5597 }
5364 else 5598 else
5599 {
5600 #ifdef FEAT_MBYTE
5601 if (has_mbyte)
5602 for (s = inword; *s != NUL; s += l)
5603 if ((l = mb_ptr2len_check(s)) > 1)
5604 {
5605 found_mbyte = TRUE;
5606 break;
5607 }
5608 #endif
5365 STRCPY(word, inword); 5609 STRCPY(word, inword);
5610 }
5611
5612 #ifdef FEAT_MBYTE
5613 /* If there are multi-byte characters in the word return it as-is, because
5614 * the following won't work. */
5615 if (found_mbyte)
5616 {
5617 STRCPY(res, word);
5618 return;
5619 }
5620 #endif
5366 5621
5367 ftp = (fromto_T *)slang->sl_sal.ga_data; 5622 ftp = (fromto_T *)slang->sl_sal.ga_data;
5368 5623
5369 /* 5624 /*
5370 * This comes from Aspell phonet.cpp. Converted from C++ to C. 5625 * This comes from Aspell phonet.cpp. Converted from C++ to C.
5626 * Changed to keep spaces.
5371 * TODO: support for multi-byte chars. 5627 * TODO: support for multi-byte chars.
5372 */ 5628 */
5373 i = j = z = 0; 5629 i = j = z = 0;
5374 while ((c = word[i]) != NUL) 5630 while ((c = word[i]) != NUL)
5375 { 5631 {
5431 if (*s == '^' && *(s + 1) == '^') 5687 if (*s == '^' && *(s + 1) == '^')
5432 s++; 5688 s++;
5433 5689
5434 if (*s == NUL 5690 if (*s == NUL
5435 || (*s == '^' 5691 || (*s == '^'
5436 && (i == 0 || !SPELL_ISWORDP(word + i - 1)) 5692 && (i == 0 || !(word[i - 1] == ' '
5693 || SPELL_ISWORDP(word + i - 1)))
5437 && (*(s + 1) != '$' 5694 && (*(s + 1) != '$'
5438 || (!SPELL_ISWORDP(word + i + k0)))) 5695 || (!SPELL_ISWORDP(word + i + k0))))
5439 || (*s == '$' && i > 0 5696 || (*s == '$' && i > 0
5440 && SPELL_ISWORDP(word + i - 1) 5697 && SPELL_ISWORDP(word + i - 1)
5441 && (!SPELL_ISWORDP(word + i + k0)))) 5698 && (!SPELL_ISWORDP(word + i + k0))))
5587 break; 5844 break;
5588 } 5845 }
5589 ++n; 5846 ++n;
5590 } 5847 }
5591 } 5848 }
5849 else if (vim_iswhite(c))
5850 {
5851 c = ' ';
5852 k = 1;
5853 }
5592 5854
5593 if (z0 == 0) 5855 if (z0 == 0)
5594 { 5856 {
5595 if (k && !p0 && j < MAXWLEN && c != NUL 5857 if (k && !p0 && j < MAXWLEN && c != NUL
5596 && (!slang->sl_collapse || j == 0 || res[j - 1] != c)) 5858 && (!slang->sl_collapse || j == 0 || res[j - 1] != c))
5607 } 5869 }
5608 5870
5609 res[j] = NUL; 5871 res[j] = NUL;
5610 } 5872 }
5611 5873
5874 #if defined(RESCORE) || defined(SOUNDFOLD_SCORE)
5875 /*
5876 * Return the score for how much words sound different.
5877 */
5878 static int
5879 spell_sound_score(slang, goodword, badsound)
5880 slang_T *slang;
5881 char_u *goodword; /* good word */
5882 char_u *badsound; /* sound-folded bad word */
5883 {
5884 char_u fword[MAXWLEN];
5885 char_u goodsound[MAXWLEN];
5886 int score;
5887
5888 /* Case-fold the word, needed for sound folding. */
5889 (void)spell_casefold(goodword, STRLEN(goodword), fword, MAXWLEN);
5890
5891 /* sound-fold the good word */
5892 spell_soundfold(slang, fword, goodsound);
5893
5894 /* compute the edit distance-score of the sounds */
5895 score = spell_edit_score(badsound, goodsound);
5896
5897 /* Correction: adding/inserting "*" at the start (word starts with vowel)
5898 * shouldn't be counted so much, vowels halfway the word aren't counted at
5899 * all. */
5900 if (*badsound != *goodsound && (*badsound == '*' || *goodsound == '*'))
5901 score -= SCORE_DEL / 2;
5902
5903 return score;
5904 }
5905 #endif
5906
5612 /* 5907 /*
5613 * Compute the "edit distance" to turn "badword" into "goodword". The fewer 5908 * Compute the "edit distance" to turn "badword" into "goodword". The fewer
5614 * deletes/inserts/swaps are required the lower the score. 5909 * deletes/inserts/swaps are required the lower the score.
5910 *
5615 * The algorithm comes from Aspell editdist.cpp, edit_distance(). 5911 * The algorithm comes from Aspell editdist.cpp, edit_distance().
5616 * TODO: make this work with multi-byte chars. 5912 * It has been converted from C++ to C and modified to support multi-byte
5913 * characters.
5617 */ 5914 */
5618 static int 5915 static int
5619 spell_edit_score(badword, goodword) 5916 spell_edit_score(badword, goodword)
5620 char_u *badword; 5917 char_u *badword;
5621 char_u *goodword; 5918 char_u *goodword;
5623 int *cnt; 5920 int *cnt;
5624 int badlen, goodlen; 5921 int badlen, goodlen;
5625 int j, i; 5922 int j, i;
5626 int t; 5923 int t;
5627 int bc, gc; 5924 int bc, gc;
5925 int pbc, pgc;
5926 #ifdef FEAT_MBYTE
5927 char_u *p;
5928 int wbadword[MAXWLEN];
5929 int wgoodword[MAXWLEN];
5930
5931 if (has_mbyte)
5932 {
5933 /* Get the characters from the multi-byte strings and put them in an
5934 * int array for easy access. */
5935 for (p = badword, badlen = 0; *p != NUL; )
5936 wbadword[badlen++] = mb_ptr2char_adv(&p);
5937 ++badlen;
5938 for (p = goodword, goodlen = 0; *p != NUL; )
5939 wgoodword[goodlen++] = mb_ptr2char_adv(&p);
5940 ++goodlen;
5941 }
5942 else
5943 #endif
5944 {
5945 badlen = STRLEN(badword) + 1;
5946 goodlen = STRLEN(goodword) + 1;
5947 }
5628 5948
5629 /* We use "cnt" as an array: CNT(badword_idx, goodword_idx). */ 5949 /* We use "cnt" as an array: CNT(badword_idx, goodword_idx). */
5630 #define CNT(a, b) cnt[(a) + (b) * (badlen + 1)] 5950 #define CNT(a, b) cnt[(a) + (b) * (badlen + 1)]
5631 badlen = STRLEN(badword) + 1;
5632 goodlen = STRLEN(goodword) + 1;
5633 cnt = (int *)lalloc((long_u)(sizeof(int) * (badlen + 1) * (goodlen + 1)), 5951 cnt = (int *)lalloc((long_u)(sizeof(int) * (badlen + 1) * (goodlen + 1)),
5634 TRUE); 5952 TRUE);
5635 if (cnt == 0) 5953 if (cnt == NULL)
5636 return 0; 5954 return 0; /* out of memory */
5637 5955
5638 CNT(0, 0) = 0; 5956 CNT(0, 0) = 0;
5639 for (j = 1; j <= goodlen; ++j) 5957 for (j = 1; j <= goodlen; ++j)
5640 CNT(0, j) = CNT(0, j - 1) + SCORE_DEL; 5958 CNT(0, j) = CNT(0, j - 1) + SCORE_DEL;
5641 5959
5642 for (i = 1; i <= badlen; ++i) 5960 for (i = 1; i <= badlen; ++i)
5643 { 5961 {
5644 CNT(i, 0) = CNT(i - 1, 0) + SCORE_INS; 5962 CNT(i, 0) = CNT(i - 1, 0) + SCORE_INS;
5645 for (j = 1; j <= goodlen; ++j) 5963 for (j = 1; j <= goodlen; ++j)
5646 { 5964 {
5647 bc = badword[i - 1]; 5965 #ifdef FEAT_MBYTE
5648 gc = goodword[j - 1]; 5966 if (has_mbyte)
5967 {
5968 bc = wbadword[i - 1];
5969 gc = wgoodword[j - 1];
5970 }
5971 else
5972 #endif
5973 {
5974 bc = badword[i - 1];
5975 gc = goodword[j - 1];
5976 }
5649 if (bc == gc) 5977 if (bc == gc)
5650 CNT(i, j) = CNT(i - 1, j - 1); 5978 CNT(i, j) = CNT(i - 1, j - 1);
5651 else 5979 else
5652 { 5980 {
5653 /* Use a better score when there is only a case difference. */ 5981 /* Use a better score when there is only a case difference. */
5654 if (spelltab.st_fold[bc] == spelltab.st_fold[gc]) 5982 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
5655 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1); 5983 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1);
5656 else 5984 else
5657 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1); 5985 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1);
5658 5986
5659 if (i > 1 && j > 1 && bc == goodword[j - 2] 5987 if (i > 1 && j > 1)
5660 && badword[i - 2] == gc)
5661 { 5988 {
5662 t = SCORE_SWAP + CNT(i - 2, j - 2); 5989 #ifdef FEAT_MBYTE
5663 if (t < CNT(i, j)) 5990 if (has_mbyte)
5664 CNT(i, j) = t; 5991 {
5992 pbc = wbadword[i - 2];
5993 pgc = wgoodword[j - 2];
5994 }
5995 else
5996 #endif
5997 {
5998 pbc = badword[i - 2];
5999 pgc = goodword[j - 2];
6000 }
6001 if (bc == pgc && pbc == gc)
6002 {
6003 t = SCORE_SWAP + CNT(i - 2, j - 2);
6004 if (t < CNT(i, j))
6005 CNT(i, j) = t;
6006 }
5665 } 6007 }
5666 t = SCORE_DEL + CNT(i - 1, j); 6008 t = SCORE_DEL + CNT(i - 1, j);
5667 if (t < CNT(i, j)) 6009 if (t < CNT(i, j))
5668 CNT(i, j) = t; 6010 CNT(i, j) = t;
5669 t = SCORE_INS + CNT(i, j - 1); 6011 t = SCORE_INS + CNT(i, j - 1);