comparison src/spell.c @ 625:81fe2ccc1207 v7.0179

updated for version 7.0179
author vimboss
date Thu, 12 Jan 2006 23:22:24 +0000
parents c5688885c414
children 732c7ae5743e
comparison
equal deleted inserted replaced
624:91e7d4a7b3b0 625:81fe2ccc1207
41 * following word must support this prefix nr. And the condition nr is 41 * following word must support this prefix nr. And the condition nr is
42 * stored, used to lookup the condition that the word must match with. 42 * stored, used to lookup the condition that the word must match with.
43 * 43 *
44 * Thanks to Olaf Seibert for providing an example implementation of this tree 44 * Thanks to Olaf Seibert for providing an example implementation of this tree
45 * and the compression mechanism. 45 * and the compression mechanism.
46 * LZ trie ideas:
47 * http://www.irb.hr/hr/home/ristov/papers/RistovLZtrieRevision1.pdf
48 * More papers: http://www-igm.univ-mlv.fr/~laporte/publi_en.html
46 * 49 *
47 * Matching involves checking the caps type: Onecap ALLCAP KeepCap. 50 * Matching involves checking the caps type: Onecap ALLCAP KeepCap.
48 * 51 *
49 * Why doesn't Vim use aspell/ispell/myspell/etc.? 52 * Why doesn't Vim use aspell/ispell/myspell/etc.?
50 * See ":help develop-spell". 53 * See ":help develop-spell".
54 * Only use it for small word lists! */ 57 * Only use it for small word lists! */
55 #if 0 58 #if 0
56 # define SPELL_PRINTTREE 59 # define SPELL_PRINTTREE
57 #endif 60 #endif
58 61
62 /* Use DEBUG_TRIEWALK to print the changes made in suggest_trie_walk(). */
63 #if 0
64 # define DEBUG_TRIEWALK
65 #endif
66
59 /* 67 /*
60 * Use this to adjust the score after finding suggestions, based on the 68 * Use this to adjust the score after finding suggestions, based on the
61 * suggested word sounding like the bad word. This is much faster than doing 69 * suggested word sounding like the bad word. This is much faster than doing
62 * it for every possible suggestion. 70 * it for every possible suggestion.
63 * Disadvantage: When "the" is typed as "hte" it sounds different and goes 71 * Disadvantage: When "the" is typed as "hte" it sounds quite different ("@"
64 * down in the list. 72 * vs "ht") and goes down in the list.
65 * Used when 'spellsuggest' is set to "best". 73 * Used when 'spellsuggest' is set to "best".
66 */ 74 */
67 #define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4) 75 #define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4)
76
77 /*
78 * Do the opposite: based on a maximum end score and a known sound score,
79 * compute the maximum word score that can be used.
80 */
81 #define MAXSCORE(word_score, sound_score) ((4 * word_score - sound_score) / 3)
68 82
69 /* 83 /*
70 * Vim spell file format: <HEADER> 84 * Vim spell file format: <HEADER>
71 * <SECTIONS> 85 * <SECTIONS>
72 * <LWORDTREE> 86 * <LWORDTREE>
131 * <repfromlen> 1 byte length of <repfrom> 145 * <repfromlen> 1 byte length of <repfrom>
132 * <repfrom> N bytes "from" part of replacement 146 * <repfrom> N bytes "from" part of replacement
133 * <reptolen> 1 byte length of <repto> 147 * <reptolen> 1 byte length of <repto>
134 * <repto> N bytes "to" part of replacement 148 * <repto> N bytes "to" part of replacement
135 * 149 *
150 * sectionID == SN_REPSAL: <repcount> <rep> ...
151 * just like SN_REP but for soundfolded words
152 *
136 * sectionID == SN_SAL: <salflags> <salcount> <sal> ... 153 * sectionID == SN_SAL: <salflags> <salcount> <sal> ...
137 * <salflags> 1 byte flags for soundsalike conversion: 154 * <salflags> 1 byte flags for soundsalike conversion:
138 * SAL_F0LLOWUP 155 * SAL_F0LLOWUP
139 * SAL_COLLAPSE 156 * SAL_COLLAPSE
140 * SAL_REM_ACCENTS 157 * SAL_REM_ACCENTS
148 * sectionID == SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto> 165 * sectionID == SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto>
149 * <sofofromlen> 2 bytes length of <sofofrom> 166 * <sofofromlen> 2 bytes length of <sofofrom>
150 * <sofofrom> N bytes "from" part of soundfold 167 * <sofofrom> N bytes "from" part of soundfold
151 * <sofotolen> 2 bytes length of <sofoto> 168 * <sofotolen> 2 bytes length of <sofoto>
152 * <sofoto> N bytes "to" part of soundfold 169 * <sofoto> N bytes "to" part of soundfold
170 *
171 * sectionID == SN_SUGFILE: <timestamp>
172 * <timestamp> 8 bytes time in seconds that must match with .sug file
173 *
174 * sectionID == SN_WORDS: <word> ...
175 * <word> N bytes NUL terminated common word
153 * 176 *
154 * sectionID == SN_MAP: <mapstr> 177 * sectionID == SN_MAP: <mapstr>
155 * <mapstr> N bytes String with sequences of similar characters, 178 * <mapstr> N bytes String with sequences of similar characters,
156 * separated by slashes. 179 * separated by slashes.
157 * 180 *
234 * from HEADER. 257 * from HEADER.
235 * 258 *
236 * All text characters are in 'encoding', but stored as single bytes. 259 * All text characters are in 'encoding', but stored as single bytes.
237 */ 260 */
238 261
262 /*
263 * Vim .sug file format: <SUGHEADER>
264 * <SUGWORDTREE>
265 * <SUGTABLE>
266 *
267 * <SUGHEADER>: <fileID> <versionnr> <timestamp>
268 *
269 * <fileID> 6 bytes "VIMsug"
270 * <versionnr> 1 byte VIMSUGVERSION
271 * <timestamp> 8 bytes timestamp that must match with .spl file
272 *
273 *
274 * <SUGWORDTREE>: <wordtree> (see above, no flags or region used)
275 *
276 *
277 * <SUGTABLE>: <sugwcount> <sugline> ...
278 *
279 * <sugwcount> 4 bytes number of <sugline> following
280 *
281 * <sugline>: <sugnr> ... NUL
282 *
283 * <sugnr>: X bytes word number that results in this soundfolded word,
284 * stored as an offset to the previous number in as
285 * few bytes as possible (see offset2bytes())
286 */
287
239 #if defined(MSDOS) || defined(WIN16) || defined(WIN32) || defined(_WIN64) 288 #if defined(MSDOS) || defined(WIN16) || defined(WIN32) || defined(_WIN64)
240 # include <io.h> /* for lseek(), must be before vim.h */ 289 # include <io.h> /* for lseek(), must be before vim.h */
241 #endif 290 #endif
242 291
243 #include "vim.h" 292 #include "vim.h"
244 293
245 #if defined(FEAT_SYN_HL) || defined(PROTO) 294 #if defined(FEAT_SYN_HL) || defined(PROTO)
246 295
247 #ifdef HAVE_FCNTL_H 296 #ifdef HAVE_FCNTL_H
248 # include <fcntl.h> 297 # include <fcntl.h>
298 #endif
299
300 #ifndef UNIX /* it's in os_unix.h for Unix */
301 # include <time.h> /* for time_t */
249 #endif 302 #endif
250 303
251 #define MAXWLEN 250 /* Assume max. word len is this many bytes. 304 #define MAXWLEN 250 /* Assume max. word len is this many bytes.
252 Some places assume a word length fits in a 305 Some places assume a word length fits in a
253 byte, thus it can't be above 255. */ 306 byte, thus it can't be above 255. */
300 * postponed prefix: <pflags> follows */ 353 * postponed prefix: <pflags> follows */
301 #define BY_FLAGS2 3 /* end of word, <flags> and <flags2> bytes 354 #define BY_FLAGS2 3 /* end of word, <flags> and <flags2> bytes
302 * follow; never used in prefix tree */ 355 * follow; never used in prefix tree */
303 #define BY_SPECIAL BY_FLAGS2 /* highest special byte value */ 356 #define BY_SPECIAL BY_FLAGS2 /* highest special byte value */
304 357
305 /* Info from "REP" and "SAL" entries in ".aff" file used in si_rep, sl_rep, 358 /* Info from "REP", "REPSAL" and "SAL" entries in ".aff" file used in si_rep,
306 * and si_sal. Not for sl_sal! 359 * si_repsal, sl_rep, and si_sal. Not for sl_sal!
307 * One replacement: from "ft_from" to "ft_to". */ 360 * One replacement: from "ft_from" to "ft_to". */
308 typedef struct fromto_S 361 typedef struct fromto_S
309 { 362 {
310 char_u *ft_from; 363 char_u *ft_from;
311 char_u *ft_to; 364 char_u *ft_to;
372 425
373 char_u sl_regions[17]; /* table with up to 8 region names plus NUL */ 426 char_u sl_regions[17]; /* table with up to 8 region names plus NUL */
374 427
375 char_u *sl_midword; /* MIDWORD string or NULL */ 428 char_u *sl_midword; /* MIDWORD string or NULL */
376 429
430 hashtab_T sl_wordcount; /* hashtable with word count, wordcount_T */
431
377 int sl_compmax; /* COMPOUNDMAX (default: MAXWLEN) */ 432 int sl_compmax; /* COMPOUNDMAX (default: MAXWLEN) */
378 int sl_compminlen; /* COMPOUNDMIN (default: 0) */ 433 int sl_compminlen; /* COMPOUNDMIN (default: 0) */
379 int sl_compsylmax; /* COMPOUNDSYLMAX (default: MAXWLEN) */ 434 int sl_compsylmax; /* COMPOUNDSYLMAX (default: MAXWLEN) */
380 regprog_T *sl_compprog; /* COMPOUNDFLAGS turned into a regexp program 435 regprog_T *sl_compprog; /* COMPOUNDFLAGS turned into a regexp program
381 * (NULL when no compounding) */ 436 * (NULL when no compounding) */
392 short sl_rep_first[256]; /* indexes where byte first appears, -1 if 447 short sl_rep_first[256]; /* indexes where byte first appears, -1 if
393 there is none */ 448 there is none */
394 garray_T sl_sal; /* list of salitem_T entries from SAL lines */ 449 garray_T sl_sal; /* list of salitem_T entries from SAL lines */
395 salfirst_T sl_sal_first[256]; /* indexes where byte first appears, -1 if 450 salfirst_T sl_sal_first[256]; /* indexes where byte first appears, -1 if
396 there is none */ 451 there is none */
452 int sl_followup; /* SAL followup */
453 int sl_collapse; /* SAL collapse_result */
454 int sl_rem_accents; /* SAL remove_accents */
397 int sl_sofo; /* SOFOFROM and SOFOTO instead of SAL items: 455 int sl_sofo; /* SOFOFROM and SOFOTO instead of SAL items:
398 * "sl_sal_first" maps chars, when has_mbyte 456 * "sl_sal_first" maps chars, when has_mbyte
399 * "sl_sal" is a list of wide char lists. */ 457 * "sl_sal" is a list of wide char lists. */
400 int sl_followup; /* SAL followup */ 458 garray_T sl_repsal; /* list of fromto_T entries from REPSAL lines */
401 int sl_collapse; /* SAL collapse_result */ 459 short sl_repsal_first[256]; /* sl_rep_first for REPSAL lines */
402 int sl_rem_accents; /* SAL remove_accents */ 460
461 /* Info from the .sug file. Loaded on demand. */
462 time_t sl_sugtime; /* timestamp for .sug file */
463 char_u *sl_sbyts; /* soundfolded word bytes */
464 idx_T *sl_sidxs; /* soundfolded word indexes */
465 buf_T *sl_sugbuf; /* buffer with word number table */
466 int sl_sugloaded; /* TRUE when .sug file was loaded or failed to
467 load */
468
403 int sl_has_map; /* TRUE if there is a MAP line */ 469 int sl_has_map; /* TRUE if there is a MAP line */
404 #ifdef FEAT_MBYTE 470 #ifdef FEAT_MBYTE
405 hashtab_T sl_map_hash; /* MAP for multi-byte chars */ 471 hashtab_T sl_map_hash; /* MAP for multi-byte chars */
406 int sl_map_array[256]; /* MAP for first 256 chars */ 472 int sl_map_array[256]; /* MAP for first 256 chars */
407 #else 473 #else
408 char_u sl_map_array[256]; /* MAP for first 256 chars */ 474 char_u sl_map_array[256]; /* MAP for first 256 chars */
409 #endif 475 #endif
476 hashtab_T sl_sounddone; /* table with soundfolded words that have
477 been handled, see add_sound_suggest() */
410 }; 478 };
411 479
412 /* First language that is loaded, start of the linked list of loaded 480 /* First language that is loaded, start of the linked list of loaded
413 * languages. */ 481 * languages. */
414 static slang_T *first_lang = NULL; 482 static slang_T *first_lang = NULL;
434 #define REGION_ALL 0xff /* word valid in all regions */ 502 #define REGION_ALL 0xff /* word valid in all regions */
435 503
436 #define VIMSPELLMAGIC "VIMspell" /* string at start of Vim spell file */ 504 #define VIMSPELLMAGIC "VIMspell" /* string at start of Vim spell file */
437 #define VIMSPELLMAGICL 8 505 #define VIMSPELLMAGICL 8
438 #define VIMSPELLVERSION 50 506 #define VIMSPELLVERSION 50
507
508 #define VIMSUGMAGIC "VIMsug" /* string at start of Vim .sug file */
509 #define VIMSUGMAGICL 6
510 #define VIMSUGVERSION 1
439 511
440 /* Section IDs. Only renumber them when VIMSPELLVERSION changes! */ 512 /* Section IDs. Only renumber them when VIMSPELLVERSION changes! */
441 #define SN_REGION 0 /* <regionname> section */ 513 #define SN_REGION 0 /* <regionname> section */
442 #define SN_CHARFLAGS 1 /* charflags section */ 514 #define SN_CHARFLAGS 1 /* charflags section */
443 #define SN_MIDWORD 2 /* <midword> section */ 515 #define SN_MIDWORD 2 /* <midword> section */
447 #define SN_SOFO 6 /* soundfolding section */ 519 #define SN_SOFO 6 /* soundfolding section */
448 #define SN_MAP 7 /* MAP items section */ 520 #define SN_MAP 7 /* MAP items section */
449 #define SN_COMPOUND 8 /* compound words section */ 521 #define SN_COMPOUND 8 /* compound words section */
450 #define SN_SYLLABLE 9 /* syllable section */ 522 #define SN_SYLLABLE 9 /* syllable section */
451 #define SN_NOBREAK 10 /* NOBREAK section */ 523 #define SN_NOBREAK 10 /* NOBREAK section */
524 #define SN_SUGFILE 11 /* timestamp for .sug file */
525 #define SN_REPSAL 12 /* REPSAL items section */
526 #define SN_WORDS 13 /* common words */
452 #define SN_END 255 /* end of sections */ 527 #define SN_END 255 /* end of sections */
453 528
454 #define SNF_REQUIRED 1 /* <sectionflags>: required section */ 529 #define SNF_REQUIRED 1 /* <sectionflags>: required section */
455 530
456 /* Result values. Lower number is accepted over higher one. */ 531 /* Result values. Lower number is accepted over higher one. */
461 #define SP_BAD 3 536 #define SP_BAD 3
462 537
463 /* file used for "zG" and "zW" */ 538 /* file used for "zG" and "zW" */
464 static char_u *int_wordlist = NULL; 539 static char_u *int_wordlist = NULL;
465 540
541 typedef struct wordcount_S
542 {
543 short_u wc_count; /* nr of times word was seen */
544 char_u wc_word[1]; /* word, actually longer */
545 } wordcount_T;
546
547 static wordcount_T dumwc;
548 #define WC_KEY_OFF (dumwc.wc_word - (char_u *)&dumwc)
549 #define HI2WC(hi) ((wordcount_T *)((hi)->hi_key - WC_KEY_OFF))
550 #define MAXWORDCOUNT 0xffff
551
466 /* 552 /*
467 * Information used when looking for suggestions. 553 * Information used when looking for suggestions.
468 */ 554 */
469 typedef struct suginfo_S 555 typedef struct suginfo_S
470 { 556 {
471 garray_T su_ga; /* suggestions, contains "suggest_T" */ 557 garray_T su_ga; /* suggestions, contains "suggest_T" */
472 int su_maxcount; /* max. number of suggestions displayed */ 558 int su_maxcount; /* max. number of suggestions displayed */
473 int su_maxscore; /* maximum score for adding to su_ga */ 559 int su_maxscore; /* maximum score for adding to su_ga */
560 int su_sfmaxscore; /* idem, for when doing soundfold words */
474 garray_T su_sga; /* like su_ga, sound-folded scoring */ 561 garray_T su_sga; /* like su_ga, sound-folded scoring */
475 char_u *su_badptr; /* start of bad word in line */ 562 char_u *su_badptr; /* start of bad word in line */
476 int su_badlen; /* length of detected bad word in line */ 563 int su_badlen; /* length of detected bad word in line */
477 int su_badflags; /* caps flags for bad word */ 564 int su_badflags; /* caps flags for bad word */
478 char_u su_badword[MAXWLEN]; /* bad word truncated at su_badlen */ 565 char_u su_badword[MAXWLEN]; /* bad word truncated at su_badlen */
479 char_u su_fbadword[MAXWLEN]; /* su_badword case-folded */ 566 char_u su_fbadword[MAXWLEN]; /* su_badword case-folded */
480 char_u su_sal_badword[MAXWLEN]; /* su_badword soundfolded */ 567 char_u su_sal_badword[MAXWLEN]; /* su_badword soundfolded */
481 slang_T *su_slang_first; /* slang_T used for su_sal_badword */
482 hashtab_T su_banned; /* table with banned words */ 568 hashtab_T su_banned; /* table with banned words */
483 slang_T *su_sallang; /* default language for sound folding */ 569 slang_T *su_sallang; /* default language for sound folding */
484 } suginfo_T; 570 } suginfo_T;
485 571
486 /* One word suggestion. Used in "su_ga". */ 572 /* One word suggestion. Used in "su_ga". */
487 typedef struct suggest_S 573 typedef struct suggest_S
488 { 574 {
489 char_u *st_word; /* suggested word, allocated string */ 575 char_u *st_word; /* suggested word, allocated string */
576 int st_wordlen; /* STRLEN(st_word) */
490 int st_orglen; /* length of replaced text */ 577 int st_orglen; /* length of replaced text */
491 int st_score; /* lower is better */ 578 int st_score; /* lower is better */
492 int st_altscore; /* used when st_score compares equal */ 579 int st_altscore; /* used when st_score compares equal */
493 int st_salscore; /* st_score is for soundalike */ 580 int st_salscore; /* st_score is for soundalike */
494 int st_had_bonus; /* bonus already included in score */ 581 int st_had_bonus; /* bonus already included in score */
495 slang_T *st_slang; /* language used for sound folding */ 582 slang_T *st_slang; /* language used for sound folding */
496 } suggest_T; 583 } suggest_T;
497 584
498 #define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i]) 585 #define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i])
499 586
500 /* Number of suggestions kept when cleaning up. When rescore_suggestions() is 587 /* TRUE if a word appears in the list of banned words. */
501 * called the score may change, thus we need to keep more than what is 588 #define WAS_BANNED(su, word) (!HASHITEM_EMPTY(hash_find(&su->su_banned, word)))
502 * displayed. */ 589
503 #define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 50 ? 50 : (su)->su_maxcount) 590 /* Number of suggestions kept when cleaning up. We need to keep more than
591 * what is displayed, because when rescore_suggestions() is called the score
592 * may change and wrong suggestions may be removed later. */
593 #define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 130 ? 150 : (su)->su_maxcount + 20)
504 594
505 /* Threshold for sorting and cleaning up suggestions. Don't want to keep lots 595 /* Threshold for sorting and cleaning up suggestions. Don't want to keep lots
506 * of suggestions that are not going to be displayed. */ 596 * of suggestions that are not going to be displayed. */
507 #define SUG_MAX_COUNT(su) ((su)->su_maxcount + 50) 597 #define SUG_MAX_COUNT(su) (SUG_CLEAN_COUNT(su) + 50)
508 598
509 /* score for various changes */ 599 /* score for various changes */
510 #define SCORE_SPLIT 149 /* split bad word */ 600 #define SCORE_SPLIT 149 /* split bad word */
511 #define SCORE_ICASE 52 /* slightly different case */ 601 #define SCORE_ICASE 52 /* slightly different case */
512 #define SCORE_REGION 200 /* word is for different region */ 602 #define SCORE_REGION 200 /* word is for different region */
513 #define SCORE_RARE 180 /* rare word */ 603 #define SCORE_RARE 180 /* rare word */
514 #define SCORE_SWAP 90 /* swap two characters */ 604 #define SCORE_SWAP 75 /* swap two characters */
515 #define SCORE_SWAP3 110 /* swap two characters in three */ 605 #define SCORE_SWAP3 110 /* swap two characters in three */
516 #define SCORE_REP 65 /* REP replacement */ 606 #define SCORE_REP 65 /* REP replacement */
517 #define SCORE_SUBST 93 /* substitute a character */ 607 #define SCORE_SUBST 93 /* substitute a character */
518 #define SCORE_SIMILAR 33 /* substitute a similar character */ 608 #define SCORE_SIMILAR 33 /* substitute a similar character */
519 #define SCORE_SUBCOMP 33 /* substitute a composing character */ 609 #define SCORE_SUBCOMP 33 /* substitute a composing character */
527 617
528 #define SCORE_FILE 30 /* suggestion from a file */ 618 #define SCORE_FILE 30 /* suggestion from a file */
529 #define SCORE_MAXINIT 350 /* Initial maximum score: higher == slower. 619 #define SCORE_MAXINIT 350 /* Initial maximum score: higher == slower.
530 * 350 allows for about three changes. */ 620 * 350 allows for about three changes. */
531 621
622 #define SCORE_COMMON1 30 /* subtracted for words seen before */
623 #define SCORE_COMMON2 40 /* subtracted for words often seen */
624 #define SCORE_COMMON3 50 /* subtracted for words very often seen */
625 #define SCORE_THRES2 10 /* word count threshold for COMMON2 */
626 #define SCORE_THRES3 100 /* word count threshold for COMMON3 */
627
628 /* When trying changed soundfold words it becomes slow when trying more than
629 * two changes. With fewer than two changes it's slightly faster but we miss a
630 * few good suggestions. In rare cases we need to try three or four changes.
631 */
632 #define SCORE_SFMAX1 200 /* maximum score for first try */
633 #define SCORE_SFMAX2 300 /* maximum score for second try */
634 #define SCORE_SFMAX3 400 /* maximum score for third try */
635
532 #define SCORE_BIG SCORE_INS * 3 /* big difference */ 636 #define SCORE_BIG SCORE_INS * 3 /* big difference */
533 #define SCORE_MAXMAX 999999 /* accept any score */ 637 #define SCORE_MAXMAX 999999 /* accept any score */
638 #define SCORE_LIMITMAX 350 /* for spell_edit_score_limit() */
639
640 /* for spell_edit_score_limit() we need to know the minimum value of
641 * SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS */
642 #define SCORE_EDIT_MIN SCORE_SIMILAR
534 643
535 /* 644 /*
536 * Structure to store info for word matching. 645 * Structure to store info for word matching.
537 */ 646 */
538 typedef struct matchinf_S 647 typedef struct matchinf_S
615 STATE_NOPREFIX, /* try without prefix */ 724 STATE_NOPREFIX, /* try without prefix */
616 STATE_SPLITUNDO, /* Undo splitting. */ 725 STATE_SPLITUNDO, /* Undo splitting. */
617 STATE_ENDNUL, /* Past NUL bytes at start of the node. */ 726 STATE_ENDNUL, /* Past NUL bytes at start of the node. */
618 STATE_PLAIN, /* Use each byte of the node. */ 727 STATE_PLAIN, /* Use each byte of the node. */
619 STATE_DEL, /* Delete a byte from the bad word. */ 728 STATE_DEL, /* Delete a byte from the bad word. */
729 STATE_INS_PREP, /* Prepare for inserting bytes. */
620 STATE_INS, /* Insert a byte in the bad word. */ 730 STATE_INS, /* Insert a byte in the bad word. */
621 STATE_SWAP, /* Swap two bytes. */ 731 STATE_SWAP, /* Swap two bytes. */
622 STATE_UNSWAP, /* Undo swap two characters. */ 732 STATE_UNSWAP, /* Undo swap two characters. */
623 STATE_SWAP3, /* Swap two characters over three. */ 733 STATE_SWAP3, /* Swap two characters over three. */
624 STATE_UNSWAP3, /* Undo Swap two characters over three. */ 734 STATE_UNSWAP3, /* Undo Swap two characters over three. */
655 char_u ts_splitoff; /* index in "tword" after last split */ 765 char_u ts_splitoff; /* index in "tword" after last split */
656 char_u ts_splitfidx; /* "ts_fidx" at word split */ 766 char_u ts_splitfidx; /* "ts_fidx" at word split */
657 char_u ts_complen; /* nr of compound words used */ 767 char_u ts_complen; /* nr of compound words used */
658 char_u ts_compsplit; /* index for "compflags" where word was split */ 768 char_u ts_compsplit; /* index for "compflags" where word was split */
659 char_u ts_save_badflags; /* su_badflags saved here */ 769 char_u ts_save_badflags; /* su_badflags saved here */
770 char_u ts_delidx; /* index in fword for char that was deleted,
771 valid when "ts_flags" has TSF_DIDDEL */
660 } trystate_T; 772 } trystate_T;
661 773
662 /* values for ts_isdiff */ 774 /* values for ts_isdiff */
663 #define DIFF_NONE 0 /* no different byte (yet) */ 775 #define DIFF_NONE 0 /* no different byte (yet) */
664 #define DIFF_YES 1 /* different byte found */ 776 #define DIFF_YES 1 /* different byte found */
665 #define DIFF_INSERT 2 /* inserting character */ 777 #define DIFF_INSERT 2 /* inserting character */
666 778
667 /* values for ts_flags */ 779 /* values for ts_flags */
668 #define TSF_PREFIXOK 1 /* already checked that prefix is OK */ 780 #define TSF_PREFIXOK 1 /* already checked that prefix is OK */
669 #define TSF_DIDSPLIT 2 /* tried split at this point */ 781 #define TSF_DIDSPLIT 2 /* tried split at this point */
782 #define TSF_DIDDEL 4 /* did a delete, "ts_delidx" has index */
670 783
671 /* special values ts_prefixdepth */ 784 /* special values ts_prefixdepth */
672 #define PFD_NOPREFIX 0xff /* not using prefixes */ 785 #define PFD_NOPREFIX 0xff /* not using prefixes */
673 #define PFD_PREFIXTREE 0xfe /* walking through the prefix tree */ 786 #define PFD_PREFIXTREE 0xfe /* walking through the prefix tree */
674 #define PFD_NOTSPECIAL 0xfd /* first value that's not special */ 787 #define PFD_NOTSPECIAL 0xfd /* highest value that's not special */
675 788
676 /* mode values for find_word */ 789 /* mode values for find_word */
677 #define FIND_FOLDWORD 0 /* find word case-folded */ 790 #define FIND_FOLDWORD 0 /* find word case-folded */
678 #define FIND_KEEPWORD 1 /* find keep-case word */ 791 #define FIND_KEEPWORD 1 /* find keep-case word */
679 #define FIND_PREFIX 2 /* find word after prefix */ 792 #define FIND_PREFIX 2 /* find word after prefix */
681 #define FIND_KEEPCOMPOUND 4 /* find keep-case compound word */ 794 #define FIND_KEEPCOMPOUND 4 /* find keep-case compound word */
682 795
683 static slang_T *slang_alloc __ARGS((char_u *lang)); 796 static slang_T *slang_alloc __ARGS((char_u *lang));
684 static void slang_free __ARGS((slang_T *lp)); 797 static void slang_free __ARGS((slang_T *lp));
685 static void slang_clear __ARGS((slang_T *lp)); 798 static void slang_clear __ARGS((slang_T *lp));
799 static void slang_clear_sug __ARGS((slang_T *lp));
686 static void find_word __ARGS((matchinf_T *mip, int mode)); 800 static void find_word __ARGS((matchinf_T *mip, int mode));
687 static int can_compound __ARGS((slang_T *slang, char_u *word, char_u *flags)); 801 static int can_compound __ARGS((slang_T *slang, char_u *word, char_u *flags));
688 static int valid_word_prefix __ARGS((int totprefcnt, int arridx, int flags, char_u *word, slang_T *slang, int cond_req)); 802 static int valid_word_prefix __ARGS((int totprefcnt, int arridx, int flags, char_u *word, slang_T *slang, int cond_req));
689 static void find_prefix __ARGS((matchinf_T *mip, int mode)); 803 static void find_prefix __ARGS((matchinf_T *mip, int mode));
690 static int fold_more __ARGS((matchinf_T *mip)); 804 static int fold_more __ARGS((matchinf_T *mip));
698 static char_u *read_cnt_string __ARGS((FILE *fd, int cnt_bytes, int *lenp)); 812 static char_u *read_cnt_string __ARGS((FILE *fd, int cnt_bytes, int *lenp));
699 static char_u *read_string __ARGS((FILE *fd, int cnt)); 813 static char_u *read_string __ARGS((FILE *fd, int cnt));
700 static int read_region_section __ARGS((FILE *fd, slang_T *slang, int len)); 814 static int read_region_section __ARGS((FILE *fd, slang_T *slang, int len));
701 static int read_charflags_section __ARGS((FILE *fd)); 815 static int read_charflags_section __ARGS((FILE *fd));
702 static int read_prefcond_section __ARGS((FILE *fd, slang_T *lp)); 816 static int read_prefcond_section __ARGS((FILE *fd, slang_T *lp));
703 static int read_rep_section __ARGS((FILE *fd, slang_T *slang)); 817 static int read_rep_section __ARGS((FILE *fd, garray_T *gap, short *first));
704 static int read_sal_section __ARGS((FILE *fd, slang_T *slang)); 818 static int read_sal_section __ARGS((FILE *fd, slang_T *slang));
819 static int read_words_section __ARGS((FILE *fd, slang_T *lp, int len));
820 static void count_common_word __ARGS((slang_T *lp, char_u *word, int len, int count));
821 static int score_wordcount_adj __ARGS((slang_T *slang, int score, char_u *word, int split));
705 static int read_sofo_section __ARGS((FILE *fd, slang_T *slang)); 822 static int read_sofo_section __ARGS((FILE *fd, slang_T *slang));
706 static int read_compound __ARGS((FILE *fd, slang_T *slang, int len)); 823 static int read_compound __ARGS((FILE *fd, slang_T *slang, int len));
707 static int byte_in_str __ARGS((char_u *str, int byte)); 824 static int byte_in_str __ARGS((char_u *str, int byte));
708 static int init_syl_tab __ARGS((slang_T *slang)); 825 static int init_syl_tab __ARGS((slang_T *slang));
709 static int count_syllables __ARGS((slang_T *slang, char_u *word)); 826 static int count_syllables __ARGS((slang_T *slang, char_u *word));
710 static int set_sofo __ARGS((slang_T *lp, char_u *from, char_u *to)); 827 static int set_sofo __ARGS((slang_T *lp, char_u *from, char_u *to));
711 static void set_sal_first __ARGS((slang_T *lp)); 828 static void set_sal_first __ARGS((slang_T *lp));
712 #ifdef FEAT_MBYTE 829 #ifdef FEAT_MBYTE
713 static int *mb_str2wide __ARGS((char_u *s)); 830 static int *mb_str2wide __ARGS((char_u *s));
714 #endif 831 #endif
715 static idx_T read_tree __ARGS((FILE *fd, char_u *byts, idx_T *idxs, int maxidx, int startidx, int prefixtree, int maxprefcondnr)); 832 static int spell_read_tree __ARGS((FILE *fd, char_u **bytsp, idx_T **idxsp, int prefixtree, int prefixcnt));
833 static idx_T read_tree_node __ARGS((FILE *fd, char_u *byts, idx_T *idxs, int maxidx, int startidx, int prefixtree, int maxprefcondnr));
716 static void clear_midword __ARGS((buf_T *buf)); 834 static void clear_midword __ARGS((buf_T *buf));
717 static void use_midword __ARGS((slang_T *lp, buf_T *buf)); 835 static void use_midword __ARGS((slang_T *lp, buf_T *buf));
718 static int find_region __ARGS((char_u *rp, char_u *region)); 836 static int find_region __ARGS((char_u *rp, char_u *region));
719 static int captype __ARGS((char_u *word, char_u *end)); 837 static int captype __ARGS((char_u *word, char_u *end));
720 static int badword_captype __ARGS((char_u *word, char_u *end)); 838 static int badword_captype __ARGS((char_u *word, char_u *end));
721 static void spell_reload_one __ARGS((char_u *fname, int added_word)); 839 static void spell_reload_one __ARGS((char_u *fname, int added_word));
722 static void set_spell_charflags __ARGS((char_u *flags, int cnt, char_u *upp)); 840 static void set_spell_charflags __ARGS((char_u *flags, int cnt, char_u *upp));
723 static int set_spell_chartab __ARGS((char_u *fol, char_u *low, char_u *upp)); 841 static int set_spell_chartab __ARGS((char_u *fol, char_u *low, char_u *upp));
724 static int spell_casefold __ARGS((char_u *p, int len, char_u *buf, int buflen)); 842 static int spell_casefold __ARGS((char_u *p, int len, char_u *buf, int buflen));
725 static int check_need_cap __ARGS((linenr_T lnum, colnr_T col)); 843 static int check_need_cap __ARGS((linenr_T lnum, colnr_T col));
726 static void spell_find_suggest __ARGS((char_u *badptr, suginfo_T *su, int maxcount, int banbadword, int need_cap)); 844 static void spell_find_suggest __ARGS((char_u *badptr, suginfo_T *su, int maxcount, int banbadword, int need_cap, int interactive));
727 #ifdef FEAT_EVAL 845 #ifdef FEAT_EVAL
728 static void spell_suggest_expr __ARGS((suginfo_T *su, char_u *expr)); 846 static void spell_suggest_expr __ARGS((suginfo_T *su, char_u *expr));
729 #endif 847 #endif
730 static void spell_suggest_file __ARGS((suginfo_T *su, char_u *fname)); 848 static void spell_suggest_file __ARGS((suginfo_T *su, char_u *fname));
731 static void spell_suggest_intern __ARGS((suginfo_T *su)); 849 static void spell_suggest_intern __ARGS((suginfo_T *su, int interactive));
850 static void suggest_load_files __ARGS((void));
851 static void tree_count_words __ARGS((char_u *byts, idx_T *idxs));
732 static void spell_find_cleanup __ARGS((suginfo_T *su)); 852 static void spell_find_cleanup __ARGS((suginfo_T *su));
733 static void onecap_copy __ARGS((char_u *word, char_u *wcopy, int upper)); 853 static void onecap_copy __ARGS((char_u *word, char_u *wcopy, int upper));
734 static void allcap_copy __ARGS((char_u *word, char_u *wcopy)); 854 static void allcap_copy __ARGS((char_u *word, char_u *wcopy));
735 static void suggest_try_special __ARGS((suginfo_T *su)); 855 static void suggest_try_special __ARGS((suginfo_T *su));
736 static void suggest_try_change __ARGS((suginfo_T *su)); 856 static void suggest_try_change __ARGS((suginfo_T *su));
737 static int try_deeper __ARGS((suginfo_T *su, trystate_T *stack, int depth, int score_add)); 857 static void suggest_trie_walk __ARGS((suginfo_T *su, langp_T *lp, char_u *fword, int soundfold));
858 static void go_deeper __ARGS((trystate_T *stack, int depth, int score_add));
738 #ifdef FEAT_MBYTE 859 #ifdef FEAT_MBYTE
739 static int nofold_len __ARGS((char_u *fword, int flen, char_u *word)); 860 static int nofold_len __ARGS((char_u *fword, int flen, char_u *word));
740 #endif 861 #endif
741 static void find_keepcap_word __ARGS((slang_T *slang, char_u *fword, char_u *kword)); 862 static void find_keepcap_word __ARGS((slang_T *slang, char_u *fword, char_u *kword));
742 static void score_comp_sal __ARGS((suginfo_T *su)); 863 static void score_comp_sal __ARGS((suginfo_T *su));
743 static void score_combine __ARGS((suginfo_T *su)); 864 static void score_combine __ARGS((suginfo_T *su));
744 static int stp_sal_score __ARGS((suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound)); 865 static int stp_sal_score __ARGS((suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound));
866 static void suggest_try_soundalike_prep __ARGS((void));
745 static void suggest_try_soundalike __ARGS((suginfo_T *su)); 867 static void suggest_try_soundalike __ARGS((suginfo_T *su));
868 static void suggest_try_soundalike_finish __ARGS((void));
869 static void add_sound_suggest __ARGS((suginfo_T *su, char_u *goodword, int score, langp_T *lp));
870 static int soundfold_find __ARGS((slang_T *slang, char_u *word));
746 static void make_case_word __ARGS((char_u *fword, char_u *cword, int flags)); 871 static void make_case_word __ARGS((char_u *fword, char_u *cword, int flags));
747 static void set_map_str __ARGS((slang_T *lp, char_u *map)); 872 static void set_map_str __ARGS((slang_T *lp, char_u *map));
748 static int similar_chars __ARGS((slang_T *slang, int c1, int c2)); 873 static int similar_chars __ARGS((slang_T *slang, int c1, int c2));
749 static void add_suggestion __ARGS((suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus, slang_T *slang)); 874 static void add_suggestion __ARGS((suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus, slang_T *slang, int maxsf));
875 static void check_suggestions __ARGS((suginfo_T *su, garray_T *gap));
750 static void add_banned __ARGS((suginfo_T *su, char_u *word)); 876 static void add_banned __ARGS((suginfo_T *su, char_u *word));
751 static int was_banned __ARGS((suginfo_T *su, char_u *word));
752 static void free_banned __ARGS((suginfo_T *su));
753 static void rescore_suggestions __ARGS((suginfo_T *su)); 877 static void rescore_suggestions __ARGS((suginfo_T *su));
754 static void rescore_one __ARGS((suginfo_T *su, suggest_T *stp)); 878 static void rescore_one __ARGS((suginfo_T *su, suggest_T *stp));
755 static int cleanup_suggestions __ARGS((garray_T *gap, int maxscore, int keep)); 879 static int cleanup_suggestions __ARGS((garray_T *gap, int maxscore, int keep));
756 static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, int folded, char_u *res)); 880 static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, int folded, char_u *res));
757 static void spell_soundfold_sofo __ARGS((slang_T *slang, char_u *inword, char_u *res)); 881 static void spell_soundfold_sofo __ARGS((slang_T *slang, char_u *inword, char_u *res));
758 static void spell_soundfold_sal __ARGS((slang_T *slang, char_u *inword, char_u *res)); 882 static void spell_soundfold_sal __ARGS((slang_T *slang, char_u *inword, char_u *res));
759 #ifdef FEAT_MBYTE 883 #ifdef FEAT_MBYTE
760 static void spell_soundfold_wsal __ARGS((slang_T *slang, char_u *inword, char_u *res)); 884 static void spell_soundfold_wsal __ARGS((slang_T *slang, char_u *inword, char_u *res));
761 #endif 885 #endif
762 static int soundalike_score __ARGS((char_u *goodsound, char_u *badsound)); 886 static int soundalike_score __ARGS((char_u *goodsound, char_u *badsound));
763 static int spell_edit_score __ARGS((char_u *badword, char_u *goodword)); 887 static int spell_edit_score __ARGS((slang_T *slang, char_u *badword, char_u *goodword));
764 static void dump_word __ARGS((char_u *word, int round, int flags, linenr_T lnum)); 888 static int spell_edit_score_limit __ARGS((slang_T *slang, char_u *badword, char_u *goodword, int limit));
889 #ifdef FEAT_MBYTE
890 static int spell_edit_score_limit_w __ARGS((slang_T *slang, char_u *badword, char_u *goodword, int limit));
891 #endif
892 static void dump_word __ARGS((slang_T *slang, char_u *word, int round, int flags, linenr_T lnum));
765 static linenr_T dump_prefixes __ARGS((slang_T *slang, char_u *word, int round, int flags, linenr_T startlnum)); 893 static linenr_T dump_prefixes __ARGS((slang_T *slang, char_u *word, int round, int flags, linenr_T startlnum));
894 static buf_T *open_spellbuf __ARGS((void));
895 static void close_spellbuf __ARGS((buf_T *buf));
766 896
767 /* 897 /*
768 * Use our own character-case definitions, because the current locale may 898 * Use our own character-case definitions, because the current locale may
769 * differ from what the .spl file uses. 899 * differ from what the .spl file uses.
770 * These must not be called with negative number! 900 * These must not be called with negative number!
829 * 959 *
830 * Returns the length of the word in bytes, also when it's OK, so that the 960 * Returns the length of the word in bytes, also when it's OK, so that the
831 * caller can skip over the word. 961 * caller can skip over the word.
832 */ 962 */
833 int 963 int
834 spell_check(wp, ptr, attrp, capcol) 964 spell_check(wp, ptr, attrp, capcol, docount)
835 win_T *wp; /* current window */ 965 win_T *wp; /* current window */
836 char_u *ptr; 966 char_u *ptr;
837 hlf_T *attrp; 967 hlf_T *attrp;
838 int *capcol; /* column to check for Capital */ 968 int *capcol; /* column to check for Capital */
969 int docount; /* count good words */
839 { 970 {
840 matchinf_T mi; /* Most things are put in "mi" so that it can 971 matchinf_T mi; /* Most things are put in "mi" so that it can
841 be passed to functions quickly. */ 972 be passed to functions quickly. */
842 int nrlen = 0; /* found a number first */ 973 int nrlen = 0; /* found a number first */
843 int c; 974 int c;
844 int wrongcaplen = 0; 975 int wrongcaplen = 0;
845 int lpi; 976 int lpi;
977 int count_word = docount;
846 978
847 /* A word never starts at a space or a control character. Return quickly 979 /* A word never starts at a space or a control character. Return quickly
848 * then, skipping over the character. */ 980 * then, skipping over the character. */
849 if (*ptr <= ' ') 981 if (*ptr <= ' ')
850 return 1; 982 return 1;
903 mi.mi_result = SP_BAD; 1035 mi.mi_result = SP_BAD;
904 mi.mi_result2 = SP_BAD; 1036 mi.mi_result2 = SP_BAD;
905 1037
906 /* 1038 /*
907 * Loop over the languages specified in 'spelllang'. 1039 * Loop over the languages specified in 'spelllang'.
908 * We check them all, because a matching word may be longer than an 1040 * We check them all, because a word may be matched longer in another
909 * already found matching word. 1041 * language.
910 */ 1042 */
911 for (lpi = 0; lpi < wp->w_buffer->b_langp.ga_len; ++lpi) 1043 for (lpi = 0; lpi < wp->w_buffer->b_langp.ga_len; ++lpi)
912 { 1044 {
913 mi.mi_lp = LANGP_ENTRY(wp->w_buffer->b_langp, lpi); 1045 mi.mi_lp = LANGP_ENTRY(wp->w_buffer->b_langp, lpi);
914 1046
931 if (mi.mi_lp->lp_slang->sl_nobreak && mi.mi_result == SP_BAD 1063 if (mi.mi_lp->lp_slang->sl_nobreak && mi.mi_result == SP_BAD
932 && mi.mi_result2 != SP_BAD) 1064 && mi.mi_result2 != SP_BAD)
933 { 1065 {
934 mi.mi_result = mi.mi_result2; 1066 mi.mi_result = mi.mi_result2;
935 mi.mi_end = mi.mi_end2; 1067 mi.mi_end = mi.mi_end2;
1068 }
1069
1070 /* Count the word in the first language where it's found to be OK. */
1071 if (count_word && mi.mi_result == SP_OK)
1072 {
1073 count_common_word(mi.mi_lp->lp_slang, ptr,
1074 (int)(mi.mi_end - ptr), 1);
1075 count_word = FALSE;
936 } 1076 }
937 } 1077 }
938 1078
939 if (mi.mi_result != SP_OK) 1079 if (mi.mi_result != SP_OK)
940 { 1080 {
1895 && (colnr_T)(p - buf) >= wp->w_cursor.col) 2035 && (colnr_T)(p - buf) >= wp->w_cursor.col)
1896 break; 2036 break;
1897 2037
1898 /* start of word */ 2038 /* start of word */
1899 attr = HLF_COUNT; 2039 attr = HLF_COUNT;
1900 len = spell_check(wp, p, &attr, &capcol); 2040 len = spell_check(wp, p, &attr, &capcol, FALSE);
1901 2041
1902 if (attr != HLF_COUNT) 2042 if (attr != HLF_COUNT)
1903 { 2043 {
1904 /* We found a bad word. Check the attribute. */ 2044 /* We found a bad word. Check the attribute. */
1905 if (allwords || attr == HLF_SPB) 2045 if (allwords || attr == HLF_SPB)
2138 vim_snprintf((char *)fname, MAXPATHL, "%s.%s.spl", 2278 vim_snprintf((char *)fname, MAXPATHL, "%s.%s.spl",
2139 int_wordlist, spell_enc()); 2279 int_wordlist, spell_enc());
2140 } 2280 }
2141 2281
2142 /* 2282 /*
2143 * Allocate a new slang_T. 2283 * Allocate a new slang_T for language "lang". "lang" can be NULL.
2144 * Caller must fill "sl_next". 2284 * Caller must fill "sl_next".
2145 */ 2285 */
2146 static slang_T * 2286 static slang_T *
2147 slang_alloc(lang) 2287 slang_alloc(lang)
2148 char_u *lang; 2288 char_u *lang;
2150 slang_T *lp; 2290 slang_T *lp;
2151 2291
2152 lp = (slang_T *)alloc_clear(sizeof(slang_T)); 2292 lp = (slang_T *)alloc_clear(sizeof(slang_T));
2153 if (lp != NULL) 2293 if (lp != NULL)
2154 { 2294 {
2155 lp->sl_name = vim_strsave(lang); 2295 if (lang != NULL)
2296 lp->sl_name = vim_strsave(lang);
2156 ga_init2(&lp->sl_rep, sizeof(fromto_T), 10); 2297 ga_init2(&lp->sl_rep, sizeof(fromto_T), 10);
2298 ga_init2(&lp->sl_repsal, sizeof(fromto_T), 10);
2157 lp->sl_compmax = MAXWLEN; 2299 lp->sl_compmax = MAXWLEN;
2158 lp->sl_compsylmax = MAXWLEN; 2300 lp->sl_compsylmax = MAXWLEN;
2159 } 2301 hash_init(&lp->sl_wordcount);
2302 }
2303
2160 return lp; 2304 return lp;
2161 } 2305 }
2162 2306
2163 /* 2307 /*
2164 * Free the contents of an slang_T and the structure itself. 2308 * Free the contents of an slang_T and the structure itself.
2182 { 2326 {
2183 garray_T *gap; 2327 garray_T *gap;
2184 fromto_T *ftp; 2328 fromto_T *ftp;
2185 salitem_T *smp; 2329 salitem_T *smp;
2186 int i; 2330 int i;
2331 int round;
2187 2332
2188 vim_free(lp->sl_fbyts); 2333 vim_free(lp->sl_fbyts);
2189 lp->sl_fbyts = NULL; 2334 lp->sl_fbyts = NULL;
2190 vim_free(lp->sl_kbyts); 2335 vim_free(lp->sl_kbyts);
2191 lp->sl_kbyts = NULL; 2336 lp->sl_kbyts = NULL;
2197 vim_free(lp->sl_kidxs); 2342 vim_free(lp->sl_kidxs);
2198 lp->sl_kidxs = NULL; 2343 lp->sl_kidxs = NULL;
2199 vim_free(lp->sl_pidxs); 2344 vim_free(lp->sl_pidxs);
2200 lp->sl_pidxs = NULL; 2345 lp->sl_pidxs = NULL;
2201 2346
2202 gap = &lp->sl_rep; 2347 for (round = 1; round <= 2; ++round)
2203 while (gap->ga_len > 0) 2348 {
2204 { 2349 gap = round == 1 ? &lp->sl_rep : &lp->sl_repsal;
2205 ftp = &((fromto_T *)gap->ga_data)[--gap->ga_len]; 2350 while (gap->ga_len > 0)
2206 vim_free(ftp->ft_from); 2351 {
2207 vim_free(ftp->ft_to); 2352 ftp = &((fromto_T *)gap->ga_data)[--gap->ga_len];
2208 } 2353 vim_free(ftp->ft_from);
2209 ga_clear(gap); 2354 vim_free(ftp->ft_to);
2355 }
2356 ga_clear(gap);
2357 }
2210 2358
2211 gap = &lp->sl_sal; 2359 gap = &lp->sl_sal;
2212 if (lp->sl_sofo) 2360 if (lp->sl_sofo)
2213 { 2361 {
2214 /* "ga_len" is set to 1 without adding an item for latin1 */ 2362 /* "ga_len" is set to 1 without adding an item for latin1 */
2251 2399
2252 vim_free(lp->sl_syllable); 2400 vim_free(lp->sl_syllable);
2253 lp->sl_syllable = NULL; 2401 lp->sl_syllable = NULL;
2254 ga_clear(&lp->sl_syl_items); 2402 ga_clear(&lp->sl_syl_items);
2255 2403
2404 hash_clear_all(&lp->sl_wordcount, WC_KEY_OFF);
2405 hash_init(&lp->sl_wordcount);
2406
2256 #ifdef FEAT_MBYTE 2407 #ifdef FEAT_MBYTE
2257 { 2408 hash_clear_all(&lp->sl_map_hash, 0);
2258 int todo = lp->sl_map_hash.ht_used;
2259 hashitem_T *hi;
2260
2261 for (hi = lp->sl_map_hash.ht_array; todo > 0; ++hi)
2262 if (!HASHITEM_EMPTY(hi))
2263 {
2264 --todo;
2265 vim_free(hi->hi_key);
2266 }
2267 }
2268 hash_clear(&lp->sl_map_hash);
2269 #endif 2409 #endif
2410
2411 /* Clear info from .sug file. */
2412 slang_clear_sug(lp);
2270 2413
2271 lp->sl_compmax = MAXWLEN; 2414 lp->sl_compmax = MAXWLEN;
2272 lp->sl_compminlen = 0; 2415 lp->sl_compminlen = 0;
2273 lp->sl_compsylmax = MAXWLEN; 2416 lp->sl_compsylmax = MAXWLEN;
2274 lp->sl_regions[0] = NUL; 2417 lp->sl_regions[0] = NUL;
2418 }
2419
2420 /*
2421 * Clear the info from the .sug file in "lp".
2422 */
2423 static void
2424 slang_clear_sug(lp)
2425 slang_T *lp;
2426 {
2427 vim_free(lp->sl_sbyts);
2428 lp->sl_sbyts = NULL;
2429 vim_free(lp->sl_sidxs);
2430 lp->sl_sidxs = NULL;
2431 close_spellbuf(lp->sl_sugbuf);
2432 lp->sl_sugbuf = NULL;
2433 lp->sl_sugloaded = FALSE;
2434 lp->sl_sugtime = 0;
2275 } 2435 }
2276 2436
2277 /* 2437 /*
2278 * Load one spell file and store the info into a slang_T. 2438 * Load one spell file and store the info into a slang_T.
2279 * Invoked through do_in_runtimepath(). 2439 * Invoked through do_in_runtimepath().
2301 } 2461 }
2302 2462
2303 /* 2463 /*
2304 * Load one spell file and store the info into a slang_T. 2464 * Load one spell file and store the info into a slang_T.
2305 * 2465 *
2306 * This is invoked in two ways: 2466 * This is invoked in three ways:
2307 * - From spell_load_cb() to load a spell file for the first time. "lang" is 2467 * - From spell_load_cb() to load a spell file for the first time. "lang" is
2308 * the language name, "old_lp" is NULL. Will allocate an slang_T. 2468 * the language name, "old_lp" is NULL. Will allocate an slang_T.
2309 * - To reload a spell file that was changed. "lang" is NULL and "old_lp" 2469 * - To reload a spell file that was changed. "lang" is NULL and "old_lp"
2310 * points to the existing slang_T. 2470 * points to the existing slang_T.
2471 * - Just after writing a .spl file; it's read back to produce the .sug file.
2472 * "old_lp" is NULL and "lang" is a dummy name. Will allocate an slang_T.
2311 * Returns the slang_T the spell file was loaded into. NULL for error. 2473 * Returns the slang_T the spell file was loaded into. NULL for error.
2312 */ 2474 */
2313 static slang_T * 2475 static slang_T *
2314 spell_load_file(fname, lang, old_lp, silent) 2476 spell_load_file(fname, lang, old_lp, silent)
2315 char_u *fname; 2477 char_u *fname;
2318 int silent; /* no error if file doesn't exist */ 2480 int silent; /* no error if file doesn't exist */
2319 { 2481 {
2320 FILE *fd; 2482 FILE *fd;
2321 char_u buf[VIMSPELLMAGICL]; 2483 char_u buf[VIMSPELLMAGICL];
2322 char_u *p; 2484 char_u *p;
2323 char_u *bp;
2324 idx_T *ip;
2325 int i; 2485 int i;
2326 int n; 2486 int n;
2327 int len; 2487 int len;
2328 int round;
2329 char_u *save_sourcing_name = sourcing_name; 2488 char_u *save_sourcing_name = sourcing_name;
2330 linenr_T save_sourcing_lnum = sourcing_lnum; 2489 linenr_T save_sourcing_lnum = sourcing_lnum;
2331 slang_T *lp = NULL; 2490 slang_T *lp = NULL;
2332 idx_T idx;
2333 int c = 0; 2491 int c = 0;
2334 int res; 2492 int res;
2335 2493
2336 fd = mch_fopen((char *)fname, "r"); 2494 fd = mch_fopen((char *)fname, "r");
2337 if (fd == NULL) 2495 if (fd == NULL)
2372 2530
2373 /* Set sourcing_name, so that error messages mention the file name. */ 2531 /* Set sourcing_name, so that error messages mention the file name. */
2374 sourcing_name = fname; 2532 sourcing_name = fname;
2375 sourcing_lnum = 0; 2533 sourcing_lnum = 0;
2376 2534
2377 /* <HEADER>: <fileID> 2535 /*
2536 * <HEADER>: <fileID>
2378 */ 2537 */
2379 for (i = 0; i < VIMSPELLMAGICL; ++i) 2538 for (i = 0; i < VIMSPELLMAGICL; ++i)
2380 buf[i] = getc(fd); /* <fileID> */ 2539 buf[i] = getc(fd); /* <fileID> */
2381 if (STRNCMP(buf, VIMSPELLMAGIC, VIMSPELLMAGICL) != 0) 2540 if (STRNCMP(buf, VIMSPELLMAGIC, VIMSPELLMAGICL) != 0)
2382 { 2541 {
2431 case SN_PREFCOND: 2590 case SN_PREFCOND:
2432 res = read_prefcond_section(fd, lp); 2591 res = read_prefcond_section(fd, lp);
2433 break; 2592 break;
2434 2593
2435 case SN_REP: 2594 case SN_REP:
2436 res = read_rep_section(fd, lp); 2595 res = read_rep_section(fd, &lp->sl_rep, lp->sl_rep_first);
2596 break;
2597
2598 case SN_REPSAL:
2599 res = read_rep_section(fd, &lp->sl_repsal, lp->sl_repsal_first);
2437 break; 2600 break;
2438 2601
2439 case SN_SAL: 2602 case SN_SAL:
2440 res = read_sal_section(fd, lp); 2603 res = read_sal_section(fd, lp);
2441 break; 2604 break;
2448 p = read_string(fd, len); /* <mapstr> */ 2611 p = read_string(fd, len); /* <mapstr> */
2449 if (p == NULL) 2612 if (p == NULL)
2450 goto endFAIL; 2613 goto endFAIL;
2451 set_map_str(lp, p); 2614 set_map_str(lp, p);
2452 vim_free(p); 2615 vim_free(p);
2616 break;
2617
2618 case SN_WORDS:
2619 res = read_words_section(fd, lp, len);
2620 break;
2621
2622 case SN_SUGFILE:
2623 for (i = 7; i >= 0; --i) /* <timestamp> */
2624 lp->sl_sugtime += getc(fd) << (i * 8);
2453 break; 2625 break;
2454 2626
2455 case SN_COMPOUND: 2627 case SN_COMPOUND:
2456 res = read_compound(fd, lp, len); 2628 res = read_compound(fd, lp, len);
2457 break; 2629 break;
2479 while (--len >= 0) 2651 while (--len >= 0)
2480 if (getc(fd) < 0) 2652 if (getc(fd) < 0)
2481 goto truncerr; 2653 goto truncerr;
2482 break; 2654 break;
2483 } 2655 }
2656 someerror:
2484 if (res == SP_FORMERROR) 2657 if (res == SP_FORMERROR)
2485 { 2658 {
2486 formerr:
2487 EMSG(_(e_format)); 2659 EMSG(_(e_format));
2488 goto endFAIL; 2660 goto endFAIL;
2489 } 2661 }
2490 if (res == SP_TRUNCERROR) 2662 if (res == SP_TRUNCERROR)
2491 { 2663 {
2495 } 2667 }
2496 if (res == SP_OTHERERROR) 2668 if (res == SP_OTHERERROR)
2497 goto endFAIL; 2669 goto endFAIL;
2498 } 2670 }
2499 2671
2500 /* round 1: <LWORDTREE> 2672 /* <LWORDTREE> */
2501 * round 2: <KWORDTREE> 2673 res = spell_read_tree(fd, &lp->sl_fbyts, &lp->sl_fidxs, FALSE, 0);
2502 * round 3: <PREFIXTREE> */ 2674 if (res != 0)
2503 for (round = 1; round <= 3; ++round) 2675 goto someerror;
2504 { 2676
2505 /* The tree size was computed when writing the file, so that we can 2677 /* <KWORDTREE> */
2506 * allocate it as one long block. <nodecount> */ 2678 res = spell_read_tree(fd, &lp->sl_kbyts, &lp->sl_kidxs, FALSE, 0);
2507 len = (getc(fd) << 24) + (getc(fd) << 16) + (getc(fd) << 8) + getc(fd); 2679 if (res != 0)
2508 if (len < 0) 2680 goto someerror;
2509 goto truncerr; 2681
2510 if (len > 0) 2682 /* <PREFIXTREE> */
2511 { 2683 res = spell_read_tree(fd, &lp->sl_pbyts, &lp->sl_pidxs, TRUE,
2512 /* Allocate the byte array. */ 2684 lp->sl_prefixcnt);
2513 bp = lalloc((long_u)len, TRUE); 2685 if (res != 0)
2514 if (bp == NULL) 2686 goto someerror;
2515 goto endFAIL;
2516 if (round == 1)
2517 lp->sl_fbyts = bp;
2518 else if (round == 2)
2519 lp->sl_kbyts = bp;
2520 else
2521 lp->sl_pbyts = bp;
2522
2523 /* Allocate the index array. */
2524 ip = (idx_T *)lalloc_clear((long_u)(len * sizeof(int)), TRUE);
2525 if (ip == NULL)
2526 goto endFAIL;
2527 if (round == 1)
2528 lp->sl_fidxs = ip;
2529 else if (round == 2)
2530 lp->sl_kidxs = ip;
2531 else
2532 lp->sl_pidxs = ip;
2533
2534 /* Read the tree and store it in the array. */
2535 idx = read_tree(fd, bp, ip, len, 0, round == 3, lp->sl_prefixcnt);
2536 if (idx == -1)
2537 goto truncerr;
2538 if (idx < 0)
2539 goto formerr;
2540 }
2541 }
2542 2687
2543 /* For a new file link it in the list of spell files. */ 2688 /* For a new file link it in the list of spell files. */
2544 if (old_lp == NULL) 2689 if (old_lp == NULL)
2545 { 2690 {
2546 lp->sl_next = first_lang; 2691 lp->sl_next = first_lang;
2731 } 2876 }
2732 return 0; 2877 return 0;
2733 } 2878 }
2734 2879
2735 /* 2880 /*
2736 * Read REP items section from "fd": <repcount> <rep> ... 2881 * Read REP or REPSAL items section from "fd": <repcount> <rep> ...
2737 * Return SP_*ERROR flags. 2882 * Return SP_*ERROR flags.
2738 */ 2883 */
2739 static int 2884 static int
2740 read_rep_section(fd, slang) 2885 read_rep_section(fd, gap, first)
2741 FILE *fd; 2886 FILE *fd;
2742 slang_T *slang; 2887 garray_T *gap;
2888 short *first;
2743 { 2889 {
2744 int cnt; 2890 int cnt;
2745 garray_T *gap;
2746 fromto_T *ftp; 2891 fromto_T *ftp;
2747 short *first;
2748 int i; 2892 int i;
2749 2893
2750 cnt = (getc(fd) << 8) + getc(fd); /* <repcount> */ 2894 cnt = (getc(fd) << 8) + getc(fd); /* <repcount> */
2751 if (cnt < 0) 2895 if (cnt < 0)
2752 return SP_TRUNCERROR; 2896 return SP_TRUNCERROR;
2753 2897
2754 gap = &slang->sl_rep;
2755 if (ga_grow(gap, cnt) == FAIL) 2898 if (ga_grow(gap, cnt) == FAIL)
2756 return SP_OTHERERROR; 2899 return SP_OTHERERROR;
2757 2900
2758 /* <rep> : <repfromlen> <repfrom> <reptolen> <repto> */ 2901 /* <rep> : <repfromlen> <repfrom> <reptolen> <repto> */
2759 for (; gap->ga_len < cnt; ++gap->ga_len) 2902 for (; gap->ga_len < cnt; ++gap->ga_len)
2773 return SP_FORMERROR; 2916 return SP_FORMERROR;
2774 } 2917 }
2775 } 2918 }
2776 2919
2777 /* Fill the first-index table. */ 2920 /* Fill the first-index table. */
2778 first = slang->sl_rep_first;
2779 for (i = 0; i < 256; ++i) 2921 for (i = 0; i < 256; ++i)
2780 first[i] = -1; 2922 first[i] = -1;
2781 for (i = 0; i < gap->ga_len; ++i) 2923 for (i = 0; i < gap->ga_len; ++i)
2782 { 2924 {
2783 ftp = &((fromto_T *)gap->ga_data)[i]; 2925 ftp = &((fromto_T *)gap->ga_data)[i];
2936 3078
2937 /* Fill the first-index table. */ 3079 /* Fill the first-index table. */
2938 set_sal_first(slang); 3080 set_sal_first(slang);
2939 3081
2940 return 0; 3082 return 0;
3083 }
3084
3085 /*
3086 * Read SN_WORDS: <word> ...
3087 * Return SP_*ERROR flags.
3088 */
3089 static int
3090 read_words_section(fd, lp, len)
3091 FILE *fd;
3092 slang_T *lp;
3093 int len;
3094 {
3095 int done = 0;
3096 int i;
3097 char_u word[MAXWLEN];
3098
3099 while (done < len)
3100 {
3101 /* Read one word at a time. */
3102 for (i = 0; ; ++i)
3103 {
3104 word[i] = getc(fd);
3105 if (word[i] == NUL)
3106 break;
3107 if (i == MAXWLEN - 1)
3108 return SP_FORMERROR;
3109 }
3110
3111 /* Init the count to 10. */
3112 count_common_word(lp, word, -1, 10);
3113 done += i + 1;
3114 }
3115 return 0;
3116 }
3117
3118 /*
3119 * Add a word to the hashtable of common words.
3120 * If it's already there then the counter is increased.
3121 */
3122 static void
3123 count_common_word(lp, word, len, count)
3124 slang_T *lp;
3125 char_u *word;
3126 int len; /* word length, -1 for upto NUL */
3127 int count; /* 1 to count once, 10 to init */
3128 {
3129 hash_T hash;
3130 hashitem_T *hi;
3131 wordcount_T *wc;
3132 char_u buf[MAXWLEN];
3133 char_u *p;
3134
3135 if (len == -1)
3136 p = word;
3137 else
3138 {
3139 vim_strncpy(buf, word, len);
3140 p = buf;
3141 }
3142
3143 hash = hash_hash(p);
3144 hi = hash_lookup(&lp->sl_wordcount, p, hash);
3145 if (HASHITEM_EMPTY(hi))
3146 {
3147 wc = (wordcount_T *)alloc(sizeof(wordcount_T) + STRLEN(p));
3148 if (wc == NULL)
3149 return;
3150 STRCPY(wc->wc_word, p);
3151 wc->wc_count = count;
3152 hash_add_item(&lp->sl_wordcount, hi, wc->wc_word, hash);
3153 }
3154 else
3155 {
3156 wc = HI2WC(hi);
3157 if ((wc->wc_count += count) < (unsigned)count) /* check for overflow */
3158 wc->wc_count = MAXWORDCOUNT;
3159 }
3160 }
3161
3162 /*
3163 * Adjust the score of common words.
3164 */
3165 static int
3166 score_wordcount_adj(slang, score, word, split)
3167 slang_T *slang;
3168 int score;
3169 char_u *word;
3170 int split; /* word was split, less bonus */
3171 {
3172 hashitem_T *hi;
3173 wordcount_T *wc;
3174 int bonus;
3175 int newscore;
3176
3177 hi = hash_find(&slang->sl_wordcount, word);
3178 if (!HASHITEM_EMPTY(hi))
3179 {
3180 wc = HI2WC(hi);
3181 if (wc->wc_count < SCORE_THRES2)
3182 bonus = SCORE_COMMON1;
3183 else if (wc->wc_count < SCORE_THRES3)
3184 bonus = SCORE_COMMON2;
3185 else
3186 bonus = SCORE_COMMON3;
3187 if (split)
3188 newscore = score - bonus / 2;
3189 else
3190 newscore = score - bonus;
3191 if (newscore < 0)
3192 return 0;
3193 return newscore;
3194 }
3195 return score;
2941 } 3196 }
2942 3197
2943 /* 3198 /*
2944 * SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto> 3199 * SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto>
2945 * Return SP_*ERROR flags. 3200 * Return SP_*ERROR flags.
3432 return res; 3687 return res;
3433 } 3688 }
3434 #endif 3689 #endif
3435 3690
3436 /* 3691 /*
3692 * Read a tree from the .spl or .sug file.
3693 * Allocates the memory and stores pointers in "bytsp" and "idxsp".
3694 * This is skipped when the tree has zero length.
3695 * Returns zero when OK, SP_ value for an error.
3696 */
3697 static int
3698 spell_read_tree(fd, bytsp, idxsp, prefixtree, prefixcnt)
3699 FILE *fd;
3700 char_u **bytsp;
3701 idx_T **idxsp;
3702 int prefixtree; /* TRUE for the prefix tree */
3703 int prefixcnt; /* when "prefixtree" is TRUE: prefix count */
3704 {
3705 int len;
3706 int idx;
3707 char_u *bp;
3708 idx_T *ip;
3709
3710 /* The tree size was computed when writing the file, so that we can
3711 * allocate it as one long block. <nodecount> */
3712 len = (getc(fd) << 24) + (getc(fd) << 16) + (getc(fd) << 8) + getc(fd);
3713 if (len < 0)
3714 return SP_TRUNCERROR;
3715 if (len > 0)
3716 {
3717 /* Allocate the byte array. */
3718 bp = lalloc((long_u)len, TRUE);
3719 if (bp == NULL)
3720 return SP_OTHERERROR;
3721 *bytsp = bp;
3722
3723 /* Allocate the index array. */
3724 ip = (idx_T *)lalloc_clear((long_u)(len * sizeof(int)), TRUE);
3725 if (ip == NULL)
3726 return SP_OTHERERROR;
3727 *idxsp = ip;
3728
3729 /* Recursively read the tree and store it in the array. */
3730 idx = read_tree_node(fd, bp, ip, len, 0, prefixtree, prefixcnt);
3731 if (idx < 0)
3732 return idx;
3733 }
3734 return 0;
3735 }
3736
3737 /*
3437 * Read one row of siblings from the spell file and store it in the byte array 3738 * Read one row of siblings from the spell file and store it in the byte array
3438 * "byts" and index array "idxs". Recursively read the children. 3739 * "byts" and index array "idxs". Recursively read the children.
3439 * 3740 *
3440 * NOTE: The code here must match put_node(). 3741 * NOTE: The code here must match put_node()!
3441 * 3742 *
3442 * Returns the index follosing the siblings. 3743 * Returns the index (>= 0) following the siblings.
3443 * Returns -1 if the file is shorter than expected. 3744 * Returns SP_TRUNCERROR if the file is shorter than expected.
3444 * Returns -2 if there is a format error. 3745 * Returns SP_FORMERROR if there is a format error.
3445 */ 3746 */
3446 static idx_T 3747 static idx_T
3447 read_tree(fd, byts, idxs, maxidx, startidx, prefixtree, maxprefcondnr) 3748 read_tree_node(fd, byts, idxs, maxidx, startidx, prefixtree, maxprefcondnr)
3448 FILE *fd; 3749 FILE *fd;
3449 char_u *byts; 3750 char_u *byts;
3450 idx_T *idxs; 3751 idx_T *idxs;
3451 int maxidx; /* size of arrays */ 3752 int maxidx; /* size of arrays */
3452 idx_T startidx; /* current index in "byts" and "idxs" */ 3753 idx_T startidx; /* current index in "byts" and "idxs" */
3461 int c2; 3762 int c2;
3462 #define SHARED_MASK 0x8000000 3763 #define SHARED_MASK 0x8000000
3463 3764
3464 len = getc(fd); /* <siblingcount> */ 3765 len = getc(fd); /* <siblingcount> */
3465 if (len <= 0) 3766 if (len <= 0)
3466 return -1; 3767 return SP_TRUNCERROR;
3467 3768
3468 if (startidx + len >= maxidx) 3769 if (startidx + len >= maxidx)
3469 return -2; 3770 return SP_FORMERROR;
3470 byts[idx++] = len; 3771 byts[idx++] = len;
3471 3772
3472 /* Read the byte values, flag/region bytes and shared indexes. */ 3773 /* Read the byte values, flag/region bytes and shared indexes. */
3473 for (i = 1; i <= len; ++i) 3774 for (i = 1; i <= len; ++i)
3474 { 3775 {
3475 c = getc(fd); /* <byte> */ 3776 c = getc(fd); /* <byte> */
3476 if (c < 0) 3777 if (c < 0)
3477 return -1; 3778 return SP_TRUNCERROR;
3478 if (c <= BY_SPECIAL) 3779 if (c <= BY_SPECIAL)
3479 { 3780 {
3480 if (c == BY_NOFLAGS && !prefixtree) 3781 if (c == BY_NOFLAGS && !prefixtree)
3481 { 3782 {
3482 /* No flags, all regions. */ 3783 /* No flags, all regions. */
3498 3799
3499 c |= getc(fd); /* <affixID> */ 3800 c |= getc(fd); /* <affixID> */
3500 3801
3501 n = (getc(fd) << 8) + getc(fd); /* <prefcondnr> */ 3802 n = (getc(fd) << 8) + getc(fd); /* <prefcondnr> */
3502 if (n >= maxprefcondnr) 3803 if (n >= maxprefcondnr)
3503 return -2; 3804 return SP_FORMERROR;
3504 c |= (n << 8); 3805 c |= (n << 8);
3505 } 3806 }
3506 else /* c must be BY_FLAGS or BY_FLAGS2 */ 3807 else /* c must be BY_FLAGS or BY_FLAGS2 */
3507 { 3808 {
3508 /* Read flags and optional region and prefix ID. In 3809 /* Read flags and optional region and prefix ID. In
3524 else /* c == BY_INDEX */ 3825 else /* c == BY_INDEX */
3525 { 3826 {
3526 /* <nodeidx> */ 3827 /* <nodeidx> */
3527 n = (getc(fd) << 16) + (getc(fd) << 8) + getc(fd); 3828 n = (getc(fd) << 16) + (getc(fd) << 8) + getc(fd);
3528 if (n < 0 || n >= maxidx) 3829 if (n < 0 || n >= maxidx)
3529 return -2; 3830 return SP_FORMERROR;
3530 idxs[idx] = n + SHARED_MASK; 3831 idxs[idx] = n + SHARED_MASK;
3531 c = getc(fd); /* <xbyte> */ 3832 c = getc(fd); /* <xbyte> */
3532 } 3833 }
3533 } 3834 }
3534 byts[idx++] = c; 3835 byts[idx++] = c;
3543 if (idxs[startidx + i] & SHARED_MASK) 3844 if (idxs[startidx + i] & SHARED_MASK)
3544 idxs[startidx + i] &= ~SHARED_MASK; 3845 idxs[startidx + i] &= ~SHARED_MASK;
3545 else 3846 else
3546 { 3847 {
3547 idxs[startidx + i] = idx; 3848 idxs[startidx + i] = idx;
3548 idx = read_tree(fd, byts, idxs, maxidx, idx, 3849 idx = read_tree_node(fd, byts, idxs, maxidx, idx,
3549 prefixtree, maxprefcondnr); 3850 prefixtree, maxprefcondnr);
3550 if (idx < 0) 3851 if (idx < 0)
3551 break; 3852 break;
3552 } 3853 }
3553 } 3854 }
3818 /* REP items */ 4119 /* REP items */
3819 if (lp->lp_slang->sl_rep.ga_len > 0) 4120 if (lp->lp_slang->sl_rep.ga_len > 0)
3820 /* language has REP items itself */ 4121 /* language has REP items itself */
3821 lp->lp_replang = lp->lp_slang; 4122 lp->lp_replang = lp->lp_slang;
3822 else 4123 else
3823 /* find first similar language that does sound folding */ 4124 /* find first similar language that has REP items */
3824 for (j = 0; j < ga.ga_len; ++j) 4125 for (j = 0; j < ga.ga_len; ++j)
3825 { 4126 {
3826 lp2 = LANGP_ENTRY(ga, j); 4127 lp2 = LANGP_ENTRY(ga, j);
3827 if (lp2->lp_slang->sl_rep.ga_len > 0 4128 if (lp2->lp_slang->sl_rep.ga_len > 0
3828 && STRNCMP(lp->lp_slang->sl_name, 4129 && STRNCMP(lp->lp_slang->sl_name,
4237 int wn_refs; /* Nr. of references to this node. Only 4538 int wn_refs; /* Nr. of references to this node. Only
4238 relevant for first node in a list of 4539 relevant for first node in a list of
4239 siblings, in following siblings it is 4540 siblings, in following siblings it is
4240 always one. */ 4541 always one. */
4241 char_u wn_byte; /* Byte for this node. NUL for word end */ 4542 char_u wn_byte; /* Byte for this node. NUL for word end */
4242 char_u wn_affixID; /* when "wn_byte" is NUL: supported/required 4543
4243 prefix ID or 0 */ 4544 /* Info for when "wn_byte" is NUL.
4244 short_u wn_flags; /* when "wn_byte" is NUL: WF_ flags */ 4545 * In PREFIXTREE "wn_region" is used for the prefcondnr.
4245 short wn_region; /* when "wn_byte" is NUL: region mask; for 4546 * In the soundfolded word tree "wn_flags" has the MSW of the wordnr and
4246 PREFIXTREE it's the prefcondnr */ 4547 * "wn_region" the LSW of the wordnr. */
4548 char_u wn_affixID; /* supported/required prefix ID or 0 */
4549 short_u wn_flags; /* WF_ flags */
4550 short wn_region; /* region mask */
4551
4247 #ifdef SPELL_PRINTTREE 4552 #ifdef SPELL_PRINTTREE
4248 int wn_nr; /* sequence nr for printing */ 4553 int wn_nr; /* sequence nr for printing */
4249 #endif 4554 #endif
4250 }; 4555 };
4251 4556
4263 4568
4264 wordnode_T *si_keeproot; /* tree with keep-case words */ 4569 wordnode_T *si_keeproot; /* tree with keep-case words */
4265 long si_keepwcount; /* nr of words in si_keeproot */ 4570 long si_keepwcount; /* nr of words in si_keeproot */
4266 4571
4267 wordnode_T *si_prefroot; /* tree with postponed prefixes */ 4572 wordnode_T *si_prefroot; /* tree with postponed prefixes */
4573
4574 long si_sugtree; /* creating the soundfolding trie */
4268 4575
4269 sblock_T *si_blocks; /* memory blocks used */ 4576 sblock_T *si_blocks; /* memory blocks used */
4270 long si_blocks_cnt; /* memory blocks allocated */ 4577 long si_blocks_cnt; /* memory blocks allocated */
4271 long si_compress_cnt; /* words to add before lowering 4578 long si_compress_cnt; /* words to add before lowering
4272 compression limit */ 4579 compression limit */
4274 compression, linked by "wn_child" field. */ 4581 compression, linked by "wn_child" field. */
4275 long si_free_count; /* number of nodes in si_first_free */ 4582 long si_free_count; /* number of nodes in si_first_free */
4276 #ifdef SPELL_PRINTTREE 4583 #ifdef SPELL_PRINTTREE
4277 int si_wordnode_nr; /* sequence nr for nodes */ 4584 int si_wordnode_nr; /* sequence nr for nodes */
4278 #endif 4585 #endif
4279 4586 buf_T *si_spellbuf; /* buffer used to store soundfold word table */
4280 4587
4281 int si_ascii; /* handling only ASCII words */ 4588 int si_ascii; /* handling only ASCII words */
4282 int si_add; /* addition file */ 4589 int si_add; /* addition file */
4283 int si_clear_chartab; /* when TRUE clear char tables */ 4590 int si_clear_chartab; /* when TRUE clear char tables */
4284 int si_region; /* region mask */ 4591 int si_region; /* region mask */
4290 are no regions) */ 4597 are no regions) */
4291 char_u si_region_name[16]; /* region names; used only if 4598 char_u si_region_name[16]; /* region names; used only if
4292 * si_region_count > 1) */ 4599 * si_region_count > 1) */
4293 4600
4294 garray_T si_rep; /* list of fromto_T entries from REP lines */ 4601 garray_T si_rep; /* list of fromto_T entries from REP lines */
4602 garray_T si_repsal; /* list of fromto_T entries from REPSAL lines */
4295 garray_T si_sal; /* list of fromto_T entries from SAL lines */ 4603 garray_T si_sal; /* list of fromto_T entries from SAL lines */
4296 char_u *si_sofofr; /* SOFOFROM text */ 4604 char_u *si_sofofr; /* SOFOFROM text */
4297 char_u *si_sofoto; /* SOFOTO text */ 4605 char_u *si_sofoto; /* SOFOTO text */
4606 int si_nosugfile; /* NOSUGFILE item found */
4298 int si_followup; /* soundsalike: ? */ 4607 int si_followup; /* soundsalike: ? */
4299 int si_collapse; /* soundsalike: ? */ 4608 int si_collapse; /* soundsalike: ? */
4609 hashtab_T si_commonwords; /* hashtable for common words */
4610 time_t si_sugtime; /* timestamp for .sug file */
4300 int si_rem_accents; /* soundsalike: remove accents */ 4611 int si_rem_accents; /* soundsalike: remove accents */
4301 garray_T si_map; /* MAP info concatenated */ 4612 garray_T si_map; /* MAP info concatenated */
4302 char_u *si_midword; /* MIDWORD chars or NULL */ 4613 char_u *si_midword; /* MIDWORD chars or NULL */
4303 int si_compmax; /* max nr of words for compounding */ 4614 int si_compmax; /* max nr of words for compounding */
4304 int si_compminlen; /* minimal length for compounding */ 4615 int si_compminlen; /* minimal length for compounding */
4335 static void free_blocks __ARGS((sblock_T *bl)); 4646 static void free_blocks __ARGS((sblock_T *bl));
4336 static wordnode_T *wordtree_alloc __ARGS((spellinfo_T *spin)); 4647 static wordnode_T *wordtree_alloc __ARGS((spellinfo_T *spin));
4337 static int store_word __ARGS((spellinfo_T *spin, char_u *word, int flags, int region, char_u *pfxlist, int need_affix)); 4648 static int store_word __ARGS((spellinfo_T *spin, char_u *word, int flags, int region, char_u *pfxlist, int need_affix));
4338 static int tree_add_word __ARGS((spellinfo_T *spin, char_u *word, wordnode_T *tree, int flags, int region, int affixID)); 4649 static int tree_add_word __ARGS((spellinfo_T *spin, char_u *word, wordnode_T *tree, int flags, int region, int affixID));
4339 static wordnode_T *get_wordnode __ARGS((spellinfo_T *spin)); 4650 static wordnode_T *get_wordnode __ARGS((spellinfo_T *spin));
4340 static void deref_wordnode __ARGS((spellinfo_T *spin, wordnode_T *node)); 4651 static int deref_wordnode __ARGS((spellinfo_T *spin, wordnode_T *node));
4341 static void free_wordnode __ARGS((spellinfo_T *spin, wordnode_T *n)); 4652 static void free_wordnode __ARGS((spellinfo_T *spin, wordnode_T *n));
4342 static void wordtree_compress __ARGS((spellinfo_T *spin, wordnode_T *root)); 4653 static void wordtree_compress __ARGS((spellinfo_T *spin, wordnode_T *root));
4343 static int node_compress __ARGS((spellinfo_T *spin, wordnode_T *node, hashtab_T *ht, int *tot)); 4654 static int node_compress __ARGS((spellinfo_T *spin, wordnode_T *node, hashtab_T *ht, int *tot));
4344 static int node_equal __ARGS((wordnode_T *n1, wordnode_T *n2)); 4655 static int node_equal __ARGS((wordnode_T *n1, wordnode_T *n2));
4656 static void put_sugtime __ARGS((spellinfo_T *spin, FILE *fd));
4345 static int write_vim_spell __ARGS((spellinfo_T *spin, char_u *fname)); 4657 static int write_vim_spell __ARGS((spellinfo_T *spin, char_u *fname));
4346 static void clear_node __ARGS((wordnode_T *node)); 4658 static void clear_node __ARGS((wordnode_T *node));
4347 static int put_node __ARGS((FILE *fd, wordnode_T *node, int index, int regionmask, int prefixtree)); 4659 static int put_node __ARGS((FILE *fd, wordnode_T *node, int index, int regionmask, int prefixtree));
4660 static void spell_make_sugfile __ARGS((spellinfo_T *spin, char_u *wfname));
4661 static int sug_filltree __ARGS((spellinfo_T *spin, slang_T *slang));
4662 static int sug_maketable __ARGS((spellinfo_T *spin));
4663 static int sug_filltable __ARGS((spellinfo_T *spin, wordnode_T *node, int startwordnr, garray_T *gap));
4664 static int offset2bytes __ARGS((int nr, char_u *buf));
4665 static int bytes2offset __ARGS((char_u **pp));
4666 static void sug_write __ARGS((spellinfo_T *spin, char_u *fname));
4348 static void mkspell __ARGS((int fcount, char_u **fnames, int ascii, int overwrite, int added_word)); 4667 static void mkspell __ARGS((int fcount, char_u **fnames, int ascii, int overwrite, int added_word));
4668 static void spell_message __ARGS((spellinfo_T *spin, char_u *str));
4349 static void init_spellfile __ARGS((void)); 4669 static void init_spellfile __ARGS((void));
4350 4670
4351 /* In the postponed prefixes tree wn_flags is used to store the WFP_ flags, 4671 /* In the postponed prefixes tree wn_flags is used to store the WFP_ flags,
4352 * but it must be negative to indicate the prefix tree to tree_add_word(). 4672 * but it must be negative to indicate the prefix tree to tree_add_word().
4353 * Use a negative number with the lower 8 bits zero. */ 4673 * Use a negative number with the lower 8 bits zero. */
4473 FILE *fd; 4793 FILE *fd;
4474 afffile_T *aff; 4794 afffile_T *aff;
4475 char_u rline[MAXLINELEN]; 4795 char_u rline[MAXLINELEN];
4476 char_u *line; 4796 char_u *line;
4477 char_u *pc = NULL; 4797 char_u *pc = NULL;
4478 #define MAXITEMCNT 7 4798 #define MAXITEMCNT 30
4479 char_u *(items[MAXITEMCNT]); 4799 char_u *(items[MAXITEMCNT]);
4480 int itemcnt; 4800 int itemcnt;
4481 char_u *p; 4801 char_u *p;
4482 int lnum = 0; 4802 int lnum = 0;
4483 affheader_T *cur_aff = NULL; 4803 affheader_T *cur_aff = NULL;
4486 hashtab_T *tp; 4806 hashtab_T *tp;
4487 char_u *low = NULL; 4807 char_u *low = NULL;
4488 char_u *fol = NULL; 4808 char_u *fol = NULL;
4489 char_u *upp = NULL; 4809 char_u *upp = NULL;
4490 int do_rep; 4810 int do_rep;
4811 int do_repsal;
4491 int do_sal; 4812 int do_sal;
4492 int do_map; 4813 int do_map;
4493 int found_map = FALSE; 4814 int found_map = FALSE;
4494 hashitem_T *hi; 4815 hashitem_T *hi;
4495 int l; 4816 int l;
4511 { 4832 {
4512 EMSG2(_(e_notopen), fname); 4833 EMSG2(_(e_notopen), fname);
4513 return NULL; 4834 return NULL;
4514 } 4835 }
4515 4836
4516 if (spin->si_verbose || p_verbose > 2) 4837 vim_snprintf((char *)IObuff, IOSIZE, _("Reading affix file %s ..."), fname);
4517 { 4838 spell_message(spin, IObuff);
4518 if (!spin->si_verbose)
4519 verbose_enter();
4520 smsg((char_u *)_("Reading affix file %s ..."), fname);
4521 out_flush();
4522 if (!spin->si_verbose)
4523 verbose_leave();
4524 }
4525 4839
4526 /* Only do REP lines when not done in another .aff file already. */ 4840 /* Only do REP lines when not done in another .aff file already. */
4527 do_rep = spin->si_rep.ga_len == 0; 4841 do_rep = spin->si_rep.ga_len == 0;
4842
4843 /* Only do REPSAL lines when not done in another .aff file already. */
4844 do_repsal = spin->si_repsal.ga_len == 0;
4528 4845
4529 /* Only do SAL lines when not done in another .aff file already. */ 4846 /* Only do SAL lines when not done in another .aff file already. */
4530 do_sal = spin->si_sal.ga_len == 0; 4847 do_sal = spin->si_sal.ga_len == 0;
4531 4848
4532 /* Only do MAP lines when not done in another .aff file already. */ 4849 /* Only do MAP lines when not done in another .aff file already. */
4753 syllable = getroom_save(spin, items[1]); 5070 syllable = getroom_save(spin, items[1]);
4754 } 5071 }
4755 else if (STRCMP(items[0], "NOBREAK") == 0 && itemcnt == 1) 5072 else if (STRCMP(items[0], "NOBREAK") == 0 && itemcnt == 1)
4756 { 5073 {
4757 spin->si_nobreak = TRUE; 5074 spin->si_nobreak = TRUE;
5075 }
5076 else if (STRCMP(items[0], "NOSUGFILE") == 0 && itemcnt == 1)
5077 {
5078 spin->si_nosugfile = TRUE;
4758 } 5079 }
4759 else if (STRCMP(items[0], "PFXPOSTPONE") == 0 && itemcnt == 1) 5080 else if (STRCMP(items[0], "PFXPOSTPONE") == 0 && itemcnt == 1)
4760 { 5081 {
4761 aff->af_pfxpostpone = TRUE; 5082 aff->af_pfxpostpone = TRUE;
4762 } 5083 }
5059 else if (STRCMP(items[0], "UPP") == 0 && itemcnt == 2 5380 else if (STRCMP(items[0], "UPP") == 0 && itemcnt == 2
5060 && upp == NULL) 5381 && upp == NULL)
5061 { 5382 {
5062 upp = vim_strsave(items[1]); 5383 upp = vim_strsave(items[1]);
5063 } 5384 }
5064 else if (STRCMP(items[0], "REP") == 0 && itemcnt == 2) 5385 else if ((STRCMP(items[0], "REP") == 0
5065 { 5386 || STRCMP(items[0], "REPSAL") == 0)
5066 /* Ignore REP count */; 5387 && itemcnt == 2)
5388 {
5389 /* Ignore REP/REPSAL count */;
5067 if (!isdigit(*items[1])) 5390 if (!isdigit(*items[1]))
5068 smsg((char_u *)_("Expected REP count in %s line %d"), 5391 smsg((char_u *)_("Expected REP(SAL) count in %s line %d"),
5069 fname, lnum); 5392 fname, lnum);
5070 } 5393 }
5071 else if (STRCMP(items[0], "REP") == 0 && itemcnt >= 3) 5394 else if ((STRCMP(items[0], "REP") == 0
5072 { 5395 || STRCMP(items[0], "REPSAL") == 0)
5073 /* REP item */ 5396 && itemcnt >= 3)
5397 {
5398 /* REP/REPSAL item */
5074 /* Myspell ignores extra arguments, we require it starts with 5399 /* Myspell ignores extra arguments, we require it starts with
5075 * # to detect mistakes. */ 5400 * # to detect mistakes. */
5076 if (itemcnt > 3 && items[3][0] != '#') 5401 if (itemcnt > 3 && items[3][0] != '#')
5077 smsg((char_u *)_(e_afftrailing), fname, lnum, items[3]); 5402 smsg((char_u *)_(e_afftrailing), fname, lnum, items[3]);
5078 if (do_rep) 5403 if (items[0][3] == 'S' ? do_repsal : do_rep)
5079 { 5404 {
5080 /* Replace underscore with space (can't include a space 5405 /* Replace underscore with space (can't include a space
5081 * directly). */ 5406 * directly). */
5082 for (p = items[1]; *p != NUL; mb_ptr_adv(p)) 5407 for (p = items[1]; *p != NUL; mb_ptr_adv(p))
5083 if (*p == '_') 5408 if (*p == '_')
5084 *p = ' '; 5409 *p = ' ';
5085 for (p = items[2]; *p != NUL; mb_ptr_adv(p)) 5410 for (p = items[2]; *p != NUL; mb_ptr_adv(p))
5086 if (*p == '_') 5411 if (*p == '_')
5087 *p = ' '; 5412 *p = ' ';
5088 add_fromto(spin, &spin->si_rep, items[1], items[2]); 5413 add_fromto(spin, items[0][3] == 'S'
5414 ? &spin->si_repsal
5415 : &spin->si_rep, items[1], items[2]);
5089 } 5416 }
5090 } 5417 }
5091 else if (STRCMP(items[0], "MAP") == 0 && itemcnt == 2) 5418 else if (STRCMP(items[0], "MAP") == 0 && itemcnt == 2)
5092 { 5419 {
5093 /* MAP item or count */ 5420 /* MAP item or count */
5153 } 5480 }
5154 else if (STRCMP(items[0], "SOFOTO") == 0 && itemcnt == 2 5481 else if (STRCMP(items[0], "SOFOTO") == 0 && itemcnt == 2
5155 && sofoto == NULL) 5482 && sofoto == NULL)
5156 { 5483 {
5157 sofoto = getroom_save(spin, items[1]); 5484 sofoto = getroom_save(spin, items[1]);
5485 }
5486 else if (STRCMP(items[0], "COMMON") == 0)
5487 {
5488 int i;
5489
5490 for (i = 1; i < itemcnt; ++i)
5491 {
5492 if (HASHITEM_EMPTY(hash_find(&spin->si_commonwords,
5493 items[i])))
5494 {
5495 p = vim_strsave(items[i]);
5496 if (p == NULL)
5497 break;
5498 hash_add(&spin->si_commonwords, p);
5499 }
5500 }
5158 } 5501 }
5159 else 5502 else
5160 smsg((char_u *)_("Unrecognized or duplicate item in %s line %d: %s"), 5503 smsg((char_u *)_("Unrecognized or duplicate item in %s line %d: %s"),
5161 fname, lnum, items[0]); 5504 fname, lnum, items[0]);
5162 } 5505 }
5663 } 6006 }
5664 6007
5665 /* The hashtable is only used to detect duplicated words. */ 6008 /* The hashtable is only used to detect duplicated words. */
5666 hash_init(&ht); 6009 hash_init(&ht);
5667 6010
5668 if (spin->si_verbose || p_verbose > 2) 6011 vim_snprintf((char *)IObuff, IOSIZE,
5669 { 6012 _("Reading dictionary file %s ..."), fname);
5670 if (!spin->si_verbose) 6013 spell_message(spin, IObuff);
5671 verbose_enter();
5672 smsg((char_u *)_("Reading dictionary file %s ..."), fname);
5673 out_flush();
5674 if (!spin->si_verbose)
5675 verbose_leave();
5676 }
5677 6014
5678 /* start with a message for the first line */ 6015 /* start with a message for the first line */
5679 spin->si_msg_count = 999999; 6016 spin->si_msg_count = 999999;
5680 6017
5681 /* Read and ignore the first line: word count. */ 6018 /* Read and ignore the first line: word count. */
6120 { 6457 {
6121 EMSG2(_(e_notopen), fname); 6458 EMSG2(_(e_notopen), fname);
6122 return FAIL; 6459 return FAIL;
6123 } 6460 }
6124 6461
6125 if (spin->si_verbose || p_verbose > 2) 6462 vim_snprintf((char *)IObuff, IOSIZE, _("Reading word file %s ..."), fname);
6126 { 6463 spell_message(spin, IObuff);
6127 if (!spin->si_verbose)
6128 verbose_enter();
6129 smsg((char_u *)_("Reading word file %s ..."), fname);
6130 out_flush();
6131 if (!spin->si_verbose)
6132 verbose_leave();
6133 }
6134 6464
6135 /* 6465 /*
6136 * Read all the lines in the file one by one. 6466 * Read all the lines in the file one by one.
6137 */ 6467 */
6138 while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int) 6468 while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int)
6292 } 6622 }
6293 6623
6294 vim_free(pc); 6624 vim_free(pc);
6295 fclose(fd); 6625 fclose(fd);
6296 6626
6297 if (spin->si_ascii && non_ascii > 0 && (spin->si_verbose || p_verbose > 2)) 6627 if (spin->si_ascii && non_ascii > 0)
6298 { 6628 {
6299 if (p_verbose > 2) 6629 vim_snprintf((char *)IObuff, IOSIZE,
6300 verbose_enter(); 6630 _("Ignored %d words with non-ASCII characters"), non_ascii);
6301 smsg((char_u *)_("Ignored %d words with non-ASCII characters"), 6631 spell_message(spin, IObuff);
6302 non_ascii); 6632 }
6303 if (p_verbose > 2) 6633
6304 verbose_leave();
6305 }
6306 return retval; 6634 return retval;
6307 } 6635 }
6308 6636
6309 /* 6637 /*
6310 * Get part of an sblock_T, "len" bytes long. 6638 * Get part of an sblock_T, "len" bytes long.
6440 return res; 6768 return res;
6441 } 6769 }
6442 6770
6443 /* 6771 /*
6444 * Add word "word" to a word tree at "root". 6772 * Add word "word" to a word tree at "root".
6445 * When "flags" < 0 we are adding to the prefix tree where flags is used for 6773 * When "flags" < 0 we are adding to the prefix tree where "flags" is used for
6446 * "rare" and "region" is the condition nr. 6774 * "rare" and "region" is the condition nr.
6447 * Returns FAIL when out of memory. 6775 * Returns FAIL when out of memory.
6448 */ 6776 */
6449 static int 6777 static int
6450 tree_add_word(spin, word, root, flags, region, affixID) 6778 tree_add_word(spin, word, root, flags, region, affixID)
6505 * done on flags and then on affixID. */ 6833 * done on flags and then on affixID. */
6506 while (node != NULL 6834 while (node != NULL
6507 && (node->wn_byte < word[i] 6835 && (node->wn_byte < word[i]
6508 || (node->wn_byte == NUL 6836 || (node->wn_byte == NUL
6509 && (flags < 0 6837 && (flags < 0
6510 ? node->wn_affixID < affixID 6838 ? node->wn_affixID < (unsigned)affixID
6511 : node->wn_flags < (flags & WN_MASK) 6839 : (node->wn_flags < (unsigned)(flags & WN_MASK)
6512 || (node->wn_flags == (flags & WN_MASK) 6840 || (node->wn_flags == (flags & WN_MASK)
6513 && node->wn_affixID < affixID))))) 6841 && (spin->si_sugtree
6842 ? (node->wn_region & 0xffff) < region
6843 : node->wn_affixID
6844 < (unsigned)affixID)))))))
6514 { 6845 {
6515 prev = &node->wn_sibling; 6846 prev = &node->wn_sibling;
6516 node = *prev; 6847 node = *prev;
6517 } 6848 }
6518 if (node == NULL 6849 if (node == NULL
6519 || node->wn_byte != word[i] 6850 || node->wn_byte != word[i]
6520 || (word[i] == NUL 6851 || (word[i] == NUL
6521 && (flags < 0 6852 && (flags < 0
6853 || spin->si_sugtree
6522 || node->wn_flags != (flags & WN_MASK) 6854 || node->wn_flags != (flags & WN_MASK)
6523 || node->wn_affixID != affixID))) 6855 || node->wn_affixID != affixID)))
6524 { 6856 {
6525 /* Allocate a new node. */ 6857 /* Allocate a new node. */
6526 np = get_wordnode(spin); 6858 np = get_wordnode(spin);
6604 out_flush(); 6936 out_flush();
6605 } 6937 }
6606 6938
6607 /* Compress both trees. Either they both have many nodes, which makes 6939 /* Compress both trees. Either they both have many nodes, which makes
6608 * compression useful, or one of them is small, which means 6940 * compression useful, or one of them is small, which means
6609 * compression goes fast. */ 6941 * compression goes fast. But when filling the soundfold word tree
6942 * there is no keep-case tree. */
6610 wordtree_compress(spin, spin->si_foldroot); 6943 wordtree_compress(spin, spin->si_foldroot);
6611 wordtree_compress(spin, spin->si_keeproot); 6944 if (affixID >= 0)
6945 wordtree_compress(spin, spin->si_keeproot);
6612 } 6946 }
6613 6947
6614 return OK; 6948 return OK;
6615 } 6949 }
6616 6950
6682 7016
6683 /* 7017 /*
6684 * Decrement the reference count on a node (which is the head of a list of 7018 * Decrement the reference count on a node (which is the head of a list of
6685 * siblings). If the reference count becomes zero free the node and its 7019 * siblings). If the reference count becomes zero free the node and its
6686 * siblings. 7020 * siblings.
6687 */ 7021 * Returns the number of nodes actually freed.
6688 static void 7022 */
7023 static int
6689 deref_wordnode(spin, node) 7024 deref_wordnode(spin, node)
6690 spellinfo_T *spin; 7025 spellinfo_T *spin;
6691 wordnode_T *node; 7026 wordnode_T *node;
6692 { 7027 {
6693 wordnode_T *np; 7028 wordnode_T *np;
7029 int cnt = 0;
6694 7030
6695 if (--node->wn_refs == 0) 7031 if (--node->wn_refs == 0)
7032 {
6696 for (np = node; np != NULL; np = np->wn_sibling) 7033 for (np = node; np != NULL; np = np->wn_sibling)
6697 { 7034 {
6698 if (np->wn_child != NULL) 7035 if (np->wn_child != NULL)
6699 deref_wordnode(spin, np->wn_child); 7036 cnt += deref_wordnode(spin, np->wn_child);
6700 free_wordnode(spin, np); 7037 free_wordnode(spin, np);
6701 } 7038 ++cnt;
7039 }
7040 ++cnt; /* length field */
7041 }
7042 return cnt;
6702 } 7043 }
6703 7044
6704 /* 7045 /*
6705 * Free a wordnode_T for re-use later. 7046 * Free a wordnode_T for re-use later.
6706 * Only the "wn_child" field becomes invalid. 7047 * Only the "wn_child" field becomes invalid.
6737 7078
6738 #ifndef SPELL_PRINTTREE 7079 #ifndef SPELL_PRINTTREE
6739 if (spin->si_verbose || p_verbose > 2) 7080 if (spin->si_verbose || p_verbose > 2)
6740 #endif 7081 #endif
6741 { 7082 {
6742 if (!spin->si_verbose)
6743 verbose_enter();
6744 if (tot > 1000000) 7083 if (tot > 1000000)
6745 perc = (tot - n) / (tot / 100); 7084 perc = (tot - n) / (tot / 100);
6746 else if (tot == 0) 7085 else if (tot == 0)
6747 perc = 0; 7086 perc = 0;
6748 else 7087 else
6749 perc = (tot - n) * 100 / tot; 7088 perc = (tot - n) * 100 / tot;
6750 smsg((char_u *)_("Compressed %d of %d nodes; %d%% remaining"), 7089 vim_snprintf((char *)IObuff, IOSIZE,
6751 n, tot, perc); 7090 _("Compressed %d of %d nodes; %d (%d%%) remaining"),
6752 if (p_verbose > 2) 7091 n, tot, tot - n, perc);
6753 verbose_leave(); 7092 spell_message(spin, IObuff);
6754 } 7093 }
6755 #ifdef SPELL_PRINTTREE 7094 #ifdef SPELL_PRINTTREE
6756 spell_print_tree(root->wn_sibling); 7095 spell_print_tree(root->wn_sibling);
6757 #endif 7096 #endif
6758 hash_clear(&ht); 7097 hash_clear(&ht);
6782 7121
6783 /* 7122 /*
6784 * Go through the list of siblings. Compress each child and then try 7123 * Go through the list of siblings. Compress each child and then try
6785 * finding an identical child to replace it. 7124 * finding an identical child to replace it.
6786 * Note that with "child" we mean not just the node that is pointed to, 7125 * Note that with "child" we mean not just the node that is pointed to,
6787 * but the whole list of siblings, of which the node is the first. 7126 * but the whole list of siblings of which the child node is the first.
6788 */ 7127 */
6789 for (np = node; np != NULL && !got_int; np = np->wn_sibling) 7128 for (np = node; np != NULL && !got_int; np = np->wn_sibling)
6790 { 7129 {
6791 ++len; 7130 ++len;
6792 if ((child = np->wn_child) != NULL) 7131 if ((child = np->wn_child) != NULL)
6793 { 7132 {
6794 /* Compress the child. This fills hashkey. */ 7133 /* Compress the child first. This fills hashkey. */
6795 compressed += node_compress(spin, child, ht, tot); 7134 compressed += node_compress(spin, child, ht, tot);
6796 7135
6797 /* Try to find an identical child. */ 7136 /* Try to find an identical child. */
6798 hash = hash_hash(child->wn_u1.hashkey); 7137 hash = hash_hash(child->wn_u1.hashkey);
6799 hi = hash_lookup(ht, child->wn_u1.hashkey, hash); 7138 hi = hash_lookup(ht, child->wn_u1.hashkey, hash);
6800 tp = NULL;
6801 if (!HASHITEM_EMPTY(hi)) 7139 if (!HASHITEM_EMPTY(hi))
6802 { 7140 {
6803 /* There are children with an identical hash value. Now check 7141 /* There are children we encountered before with a hash value
6804 * if there is one that is really identical. */ 7142 * identical to the current child. Now check if there is one
7143 * that is really identical. */
6805 for (tp = HI2WN(hi); tp != NULL; tp = tp->wn_u2.next) 7144 for (tp = HI2WN(hi); tp != NULL; tp = tp->wn_u2.next)
6806 if (node_equal(child, tp)) 7145 if (node_equal(child, tp))
6807 { 7146 {
6808 /* Found one! Now use that child in place of the 7147 /* Found one! Now use that child in place of the
6809 * current one. This means the current child and all 7148 * current one. This means the current child and all
6810 * its siblings is unlinked from the tree. */ 7149 * its siblings is unlinked from the tree. */
6811 ++tp->wn_refs; 7150 ++tp->wn_refs;
6812 deref_wordnode(spin, child); 7151 compressed += deref_wordnode(spin, child);
6813 np->wn_child = tp; 7152 np->wn_child = tp;
6814 ++compressed;
6815 break; 7153 break;
6816 } 7154 }
6817 if (tp == NULL) 7155 if (tp == NULL)
6818 { 7156 {
6819 /* No other child with this hash value equals the child of 7157 /* No other child with this hash value equals the child of
6828 /* No other child has this hash value, add it to the 7166 /* No other child has this hash value, add it to the
6829 * hashtable. */ 7167 * hashtable. */
6830 hash_add_item(ht, hi, child->wn_u1.hashkey, hash); 7168 hash_add_item(ht, hi, child->wn_u1.hashkey, hash);
6831 } 7169 }
6832 } 7170 }
6833 *tot += len; 7171 *tot += len + 1; /* add one for the node that stores the length */
6834 7172
6835 /* 7173 /*
6836 * Make a hash key for the node and its siblings, so that we can quickly 7174 * Make a hash key for the node and its siblings, so that we can quickly
6837 * find a lookalike node. This must be done after compressing the sibling 7175 * find a lookalike node. This must be done after compressing the sibling
6838 * list, otherwise the hash key would become invalid by the compression. 7176 * list, otherwise the hash key would become invalid by the compression.
6902 { 7240 {
6903 int i; 7241 int i;
6904 7242
6905 for (i = len - 1; i >= 0; --i) 7243 for (i = len - 1; i >= 0; --i)
6906 putc((int)(nr >> (i * 8)), fd); 7244 putc((int)(nr >> (i * 8)), fd);
7245 }
7246
7247 /*
7248 * Write spin->si_sugtime to file "fd".
7249 */
7250 static void
7251 put_sugtime(spin, fd)
7252 spellinfo_T *spin;
7253 FILE *fd;
7254 {
7255 int c;
7256 int i;
7257
7258 /* time_t can be up to 8 bytes in size, more than long_u, thus we
7259 * can't use put_bytes() here. */
7260 for (i = 7; i >= 0; --i)
7261 if (i + 1 > sizeof(time_t))
7262 /* ">>" doesn't work well when shifting more bits than avail */
7263 putc(0, fd);
7264 else
7265 {
7266 c = (unsigned)spin->si_sugtime >> (i * 8);
7267 putc(c, fd);
7268 }
6907 } 7269 }
6908 7270
6909 static int 7271 static int
6910 #ifdef __BORLANDC__ 7272 #ifdef __BORLANDC__
6911 _RTLENTRYF 7273 _RTLENTRYF
7054 7416
7055 write_spell_prefcond(fd, &spin->si_prefcond); 7417 write_spell_prefcond(fd, &spin->si_prefcond);
7056 } 7418 }
7057 7419
7058 /* SN_REP: <repcount> <rep> ... 7420 /* SN_REP: <repcount> <rep> ...
7059 * SN_SAL: <salflags> <salcount> <sal> ... */ 7421 * SN_SAL: <salflags> <salcount> <sal> ...
7060 7422 * SN_REPSAL: <repcount> <rep> ... */
7061 /* Sort the REP items. */ 7423
7062 qsort(spin->si_rep.ga_data, (size_t)spin->si_rep.ga_len, 7424 /* round 1: SN_REP section
7425 * round 2: SN_SAL section (unless SN_SOFO is used)
7426 * round 3: SN_REPSAL section */
7427 for (round = 1; round <= 3; ++round)
7428 {
7429 if (round == 1)
7430 gap = &spin->si_rep;
7431 else if (round == 2)
7432 {
7433 /* Don't write SN_SAL when using a SN_SOFO section */
7434 if (spin->si_sofofr != NULL && spin->si_sofoto != NULL)
7435 continue;
7436 gap = &spin->si_sal;
7437 }
7438 else
7439 gap = &spin->si_repsal;
7440
7441 /* Don't write the section if there are no items. */
7442 if (gap->ga_len == 0)
7443 continue;
7444
7445 /* Sort the REP/REPSAL items. */
7446 if (round != 2)
7447 qsort(gap->ga_data, (size_t)gap->ga_len,
7063 sizeof(fromto_T), rep_compare); 7448 sizeof(fromto_T), rep_compare);
7064 7449
7065 /* round 1: SN_REP section 7450 i = round == 1 ? SN_REP : (round == 2 ? SN_SAL : SN_REPSAL);
7066 * round 2: SN_SAL section (unless SN_SOFO is used) */ 7451 putc(i, fd); /* <sectionID> */
7067 for (round = 1; round <= 2; ++round)
7068 {
7069 if (round == 1)
7070 {
7071 gap = &spin->si_rep;
7072 putc(SN_REP, fd); /* <sectionID> */
7073 }
7074 else
7075 {
7076 if (spin->si_sofofr != NULL && spin->si_sofoto != NULL)
7077 /* using SN_SOFO section instead of SN_SAL */
7078 break;
7079 gap = &spin->si_sal;
7080 putc(SN_SAL, fd); /* <sectionID> */
7081 }
7082 7452
7083 /* This is for making suggestions, section is not required. */ 7453 /* This is for making suggestions, section is not required. */
7084 putc(0, fd); /* <sectionflags> */ 7454 putc(0, fd); /* <sectionflags> */
7085 7455
7086 /* Compute the length of what follows. */ 7456 /* Compute the length of what follows. */
7141 l = STRLEN(spin->si_sofoto); 7511 l = STRLEN(spin->si_sofoto);
7142 put_bytes(fd, (long_u)l, 2); /* <sofotolen> */ 7512 put_bytes(fd, (long_u)l, 2); /* <sofotolen> */
7143 fwrite(spin->si_sofoto, l, (size_t)1, fd); /* <sofoto> */ 7513 fwrite(spin->si_sofoto, l, (size_t)1, fd); /* <sofoto> */
7144 } 7514 }
7145 7515
7516 /* SN_WORDS: <word> ...
7517 * This is for making suggestions, section is not required. */
7518 if (spin->si_commonwords.ht_used > 0)
7519 {
7520 putc(SN_WORDS, fd); /* <sectionID> */
7521 putc(0, fd); /* <sectionflags> */
7522
7523 /* round 1: count the bytes
7524 * round 2: write the bytes */
7525 for (round = 1; round <= 2; ++round)
7526 {
7527 int todo;
7528 int len = 0;
7529 hashitem_T *hi;
7530
7531 todo = spin->si_commonwords.ht_used;
7532 for (hi = spin->si_commonwords.ht_array; todo > 0; ++hi)
7533 if (!HASHITEM_EMPTY(hi))
7534 {
7535 l = STRLEN(hi->hi_key) + 1;
7536 len += l;
7537 if (round == 2) /* <word> */
7538 fwrite(hi->hi_key, (size_t)l, (size_t)1, fd);
7539 --todo;
7540 }
7541 if (round == 1)
7542 put_bytes(fd, (long_u)len, 4); /* <sectionlen> */
7543 }
7544 }
7545
7146 /* SN_MAP: <mapstr> 7546 /* SN_MAP: <mapstr>
7147 * This is for making suggestions, section is not required. */ 7547 * This is for making suggestions, section is not required. */
7148 if (spin->si_map.ga_len > 0) 7548 if (spin->si_map.ga_len > 0)
7149 { 7549 {
7150 putc(SN_MAP, fd); /* <sectionID> */ 7550 putc(SN_MAP, fd); /* <sectionID> */
7151 putc(0, fd); /* <sectionflags> */ 7551 putc(0, fd); /* <sectionflags> */
7152 l = spin->si_map.ga_len; 7552 l = spin->si_map.ga_len;
7153 put_bytes(fd, (long_u)l, 4); /* <sectionlen> */ 7553 put_bytes(fd, (long_u)l, 4); /* <sectionlen> */
7154 fwrite(spin->si_map.ga_data, (size_t)l, (size_t)1, fd); 7554 fwrite(spin->si_map.ga_data, (size_t)l, (size_t)1, fd);
7155 /* <mapstr> */ 7555 /* <mapstr> */
7556 }
7557
7558 /* SN_SUGFILE: <timestamp>
7559 * This is used to notify that a .sug file may be available and at the
7560 * same time allows for checking that a .sug file that is found matches
7561 * with this .spl file. That's because the word numbers must be exactly
7562 * right. */
7563 if (!spin->si_nosugfile
7564 && (spin->si_sal.ga_len > 0
7565 || (spin->si_sofofr != NULL && spin->si_sofoto != NULL)))
7566 {
7567 putc(SN_SUGFILE, fd); /* <sectionID> */
7568 putc(0, fd); /* <sectionflags> */
7569 put_bytes(fd, (long_u)8, 4); /* <sectionlen> */
7570
7571 /* Set si_sugtime and write it to the file. */
7572 spin->si_sugtime = time(NULL);
7573 put_sugtime(spin, fd); /* <timestamp> */
7156 } 7574 }
7157 7575
7158 /* SN_COMPOUND: compound info. 7576 /* SN_COMPOUND: compound info.
7159 * We don't mark it required, when not supported all compound words will 7577 * We don't mark it required, when not supported all compound words will
7160 * be bad words. */ 7578 * be bad words. */
7265 * Dump a word tree at node "node". 7683 * Dump a word tree at node "node".
7266 * 7684 *
7267 * This first writes the list of possible bytes (siblings). Then for each 7685 * This first writes the list of possible bytes (siblings). Then for each
7268 * byte recursively write the children. 7686 * byte recursively write the children.
7269 * 7687 *
7270 * NOTE: The code here must match the code in read_tree(), since assumptions 7688 * NOTE: The code here must match the code in read_tree_node(), since
7271 * are made about the indexes (so that we don't have to write them in the 7689 * assumptions are made about the indexes (so that we don't have to write them
7272 * file). 7690 * in the file).
7273 * 7691 *
7274 * Returns the number of nodes used. 7692 * Returns the number of nodes used.
7275 */ 7693 */
7276 static int 7694 static int
7277 put_node(fd, node, index, regionmask, prefixtree) 7695 put_node(fd, node, index, regionmask, prefixtree)
7425 FreeWild(fcount, fnames); 7843 FreeWild(fcount, fnames);
7426 } 7844 }
7427 } 7845 }
7428 7846
7429 /* 7847 /*
7848 * Create the .sug file.
7849 * Uses the soundfold info in "spin".
7850 * Writes the file with the name "wfname", with ".spl" changed to ".sug".
7851 */
7852 static void
7853 spell_make_sugfile(spin, wfname)
7854 spellinfo_T *spin;
7855 char_u *wfname;
7856 {
7857 char_u fname[MAXPATHL];
7858 int len;
7859 slang_T *slang;
7860 int free_slang = FALSE;
7861
7862 /*
7863 * Read back the .spl file that was written. This fills the required
7864 * info for soundfolding. This also uses less memory than the
7865 * pointer-linked version of the trie. And it avoids having two versions
7866 * of the code for the soundfolding stuff.
7867 * It might have been done already by spell_reload_one().
7868 */
7869 for (slang = first_lang; slang != NULL; slang = slang->sl_next)
7870 if (fullpathcmp(wfname, slang->sl_fname, FALSE) == FPC_SAME)
7871 break;
7872 if (slang == NULL)
7873 {
7874 spell_message(spin, (char_u *)_("Reading back spell file..."));
7875 slang = spell_load_file(wfname, NULL, NULL, FALSE);
7876 if (slang == NULL)
7877 return;
7878 /* don't want this language in the list */
7879 if (first_lang == slang)
7880 first_lang = slang->sl_next;
7881 free_slang = TRUE;
7882 }
7883
7884 /*
7885 * Clear the info in "spin" that is used.
7886 */
7887 spin->si_blocks = NULL;
7888 spin->si_blocks_cnt = 0;
7889 spin->si_compress_cnt = 0; /* will stay at 0 all the time*/
7890 spin->si_free_count = 0;
7891 spin->si_first_free = NULL;
7892 spin->si_foldwcount = 0;
7893
7894 /*
7895 * Go through the trie of good words, soundfold each word and add it to
7896 * the soundfold trie.
7897 */
7898 spell_message(spin, (char_u *)_("Performing soundfolding..."));
7899 if (sug_filltree(spin, slang) == FAIL)
7900 goto theend;
7901
7902 /*
7903 * Create the table which links each soundfold word with a list of the
7904 * good words it may come from. Creates buffer "spin->si_spellbuf".
7905 * This also removes the wordnr from the NUL byte entries to make
7906 * compression possible.
7907 */
7908 if (sug_maketable(spin) == FAIL)
7909 goto theend;
7910
7911 smsg((char_u *)_("Number of words after soundfolding: %ld"),
7912 (long)spin->si_spellbuf->b_ml.ml_line_count);
7913
7914 /*
7915 * Compress the soundfold trie.
7916 */
7917 spell_message(spin, (char_u *)_(msg_compressing));
7918 wordtree_compress(spin, spin->si_foldroot);
7919
7920 /*
7921 * Write the .sug file.
7922 * Make the file name by changing ".spl" to ".sug".
7923 */
7924 STRCPY(fname, wfname);
7925 len = STRLEN(fname);
7926 fname[len - 2] = 'u';
7927 fname[len - 1] = 'g';
7928 sug_write(spin, fname);
7929
7930 theend:
7931 if (free_slang)
7932 slang_free(slang);
7933 free_blocks(spin->si_blocks);
7934 close_spellbuf(spin->si_spellbuf);
7935 }
7936
7937 /*
7938 * Build the soundfold trie for language "slang".
7939 */
7940 static int
7941 sug_filltree(spin, slang)
7942 spellinfo_T *spin;
7943 slang_T *slang;
7944 {
7945 char_u *byts;
7946 idx_T *idxs;
7947 int depth;
7948 idx_T arridx[MAXWLEN];
7949 int curi[MAXWLEN];
7950 char_u tword[MAXWLEN];
7951 char_u tsalword[MAXWLEN];
7952 int c;
7953 idx_T n;
7954 unsigned words_done = 0;
7955 int wordcount[MAXWLEN];
7956
7957 /* We use si_foldroot for the souldfolded trie. */
7958 spin->si_foldroot = wordtree_alloc(spin);
7959 if (spin->si_foldroot == NULL)
7960 return FAIL;
7961
7962 /* let tree_add_word() know we're adding to the soundfolded tree */
7963 spin->si_sugtree = TRUE;
7964
7965 /*
7966 * Go through the whole case-folded tree, soundfold each word and put it
7967 * in the trie.
7968 */
7969 byts = slang->sl_fbyts;
7970 idxs = slang->sl_fidxs;
7971
7972 arridx[0] = 0;
7973 curi[0] = 1;
7974 wordcount[0] = 0;
7975
7976 depth = 0;
7977 while (depth >= 0 && !got_int)
7978 {
7979 if (curi[depth] > byts[arridx[depth]])
7980 {
7981 /* Done all bytes at this node, go up one level. */
7982 idxs[arridx[depth]] = wordcount[depth];
7983 if (depth > 0)
7984 wordcount[depth - 1] += wordcount[depth];
7985
7986 --depth;
7987 line_breakcheck();
7988 }
7989 else
7990 {
7991
7992 /* Do one more byte at this node. */
7993 n = arridx[depth] + curi[depth];
7994 ++curi[depth];
7995
7996 c = byts[n];
7997 if (c == 0)
7998 {
7999 /* Sound-fold the word. */
8000 tword[depth] = NUL;
8001 spell_soundfold(slang, tword, TRUE, tsalword);
8002
8003 /* We use the "flags" field for the MSB of the wordnr,
8004 * "region" for the LSB of the wordnr. */
8005 if (tree_add_word(spin, tsalword, spin->si_foldroot,
8006 words_done >> 16, words_done & 0xffff,
8007 0) == FAIL)
8008 return FAIL;
8009
8010 ++words_done;
8011 ++wordcount[depth];
8012
8013 /* Reset the block count each time to avoid compression
8014 * kicking in. */
8015 spin->si_blocks_cnt = 0;
8016
8017 /* Skip over any other NUL bytes (same word with different
8018 * flags). */
8019 while (byts[n + 1] == 0)
8020 {
8021 ++n;
8022 ++curi[depth];
8023 }
8024 }
8025 else
8026 {
8027 /* Normal char, go one level deeper. */
8028 tword[depth++] = c;
8029 arridx[depth] = idxs[n];
8030 curi[depth] = 1;
8031 wordcount[depth] = 0;
8032 }
8033 }
8034 }
8035
8036 smsg((char_u *)_("Total number of words: %d"), words_done);
8037
8038 return OK;
8039 }
8040
8041 /*
8042 * Make the table that links each word in the soundfold trie to the words it
8043 * can be produced from.
8044 * This is not unlike lines in a file, thus use a memfile to be able to access
8045 * the table efficiently.
8046 * Returns FAIL when out of memory.
8047 */
8048 static int
8049 sug_maketable(spin)
8050 spellinfo_T *spin;
8051 {
8052 garray_T ga;
8053 int res = OK;
8054
8055 /* Allocate a buffer, open a memline for it and create the swap file
8056 * (uses a temp file, not a .swp file). */
8057 spin->si_spellbuf = open_spellbuf();
8058 if (spin->si_spellbuf == NULL)
8059 return FAIL;
8060
8061 /* Use a buffer to store the line info, avoids allocating many small
8062 * pieces of memory. */
8063 ga_init2(&ga, 1, 100);
8064
8065 /* recursively go through the tree */
8066 if (sug_filltable(spin, spin->si_foldroot->wn_sibling, 0, &ga) == -1)
8067 res = FAIL;
8068
8069 ga_clear(&ga);
8070 return res;
8071 }
8072
8073 /*
8074 * Fill the table for one node and its children.
8075 * Returns the wordnr at the start of the node.
8076 * Returns -1 when out of memory.
8077 */
8078 static int
8079 sug_filltable(spin, node, startwordnr, gap)
8080 spellinfo_T *spin;
8081 wordnode_T *node;
8082 int startwordnr;
8083 garray_T *gap; /* place to store line of numbers */
8084 {
8085 wordnode_T *p, *np;
8086 int wordnr = startwordnr;
8087 int nr;
8088 int prev_nr;
8089
8090 for (p = node; p != NULL; p = p->wn_sibling)
8091 {
8092 if (p->wn_byte == NUL)
8093 {
8094 gap->ga_len = 0;
8095 prev_nr = 0;
8096 for (np = p; np != NULL && np->wn_byte == NUL; np = np->wn_sibling)
8097 {
8098 if (ga_grow(gap, 10) == FAIL)
8099 return -1;
8100
8101 nr = (np->wn_flags << 16) + (np->wn_region & 0xffff);
8102 /* Compute the offset from the previous nr and store the
8103 * offset in a way that it takes a minimum number of bytes.
8104 * It's a bit like utf-8, but without the need to mark
8105 * following bytes. */
8106 nr -= prev_nr;
8107 prev_nr += nr;
8108 gap->ga_len += offset2bytes(nr,
8109 (char_u *)gap->ga_data + gap->ga_len);
8110 }
8111
8112 /* add the NUL byte */
8113 ((char_u *)gap->ga_data)[gap->ga_len++] = NUL;
8114
8115 if (ml_append_buf(spin->si_spellbuf, (linenr_T)wordnr,
8116 gap->ga_data, gap->ga_len, TRUE) == FAIL)
8117 return -1;
8118 ++wordnr;
8119
8120 /* Remove extra NUL entries, we no longer need them. We don't
8121 * bother freeing the nodes, the won't be reused anyway. */
8122 while (p->wn_sibling != NULL && p->wn_sibling->wn_byte == NUL)
8123 p->wn_sibling = p->wn_sibling->wn_sibling;
8124
8125 /* Clear the flags on the remaining NUL node, so that compression
8126 * works a lot better. */
8127 p->wn_flags = 0;
8128 p->wn_region = 0;
8129 }
8130 else
8131 {
8132 wordnr = sug_filltable(spin, p->wn_child, wordnr, gap);
8133 if (wordnr == -1)
8134 return -1;
8135 }
8136 }
8137 return wordnr;
8138 }
8139
/*
 * Convert an offset into a minimal number of bytes.
 * Similar to utf_char2bytes, but use 8 bits in followup bytes and avoid NUL
 * bytes.
 * "nr" is split into base-255 digits and one is added to each digit, so that
 * no byte is zero.  The top bits of the first byte tell how many bytes are
 * used: 0x80 for two, 0xc0 for three and 0xe0 for four.
 * The bytes are stored in "buf", most significant digit first.
 * Returns the number of bytes stored: 1 to 4.
 */
    static int
offset2bytes(nr, buf)
    int         nr;
    char_u      *buf;
{
    int         digit[4];   /* base-255 digits plus one, digit[0] is lowest */
    int         rest = nr;
    int         count;
    int         marker;
    int         i;

    /* Split the number in parts of base 255.  We need to avoid NUL bytes. */
    digit[0] = rest % 255 + 1;
    rest /= 255;
    digit[1] = rest % 255 + 1;
    rest /= 255;
    digit[2] = rest % 255 + 1;
    digit[3] = rest / 255 + 1;

    /* Pick the shortest form in which the highest digit still fits in the
     * bits left over by the length marker. */
    if (digit[3] > 1 || digit[2] > 0x1f)
    {
        count = 4;
        marker = 0xe0;
    }
    else if (digit[2] > 1 || digit[1] > 0x3f)
    {
        count = 3;
        marker = 0xc0;
    }
    else if (digit[1] > 1 || digit[0] > 0x7f)
    {
        count = 2;
        marker = 0x80;
    }
    else
    {
        count = 1;
        marker = 0;
    }

    /* First byte: marker plus the highest digit; then the lower digits. */
    buf[0] = marker + digit[count - 1];
    for (i = 1; i < count; ++i)
        buf[i] = digit[count - 1 - i];

    return count;
}
8186
/*
 * Opposite of offset2bytes().
 * "pp" points to the bytes and is advanced over it.
 * The top bits of the first byte select the length: 0xxxxxxx is one byte,
 * 10xxxxxx two, 110xxxxx three and 111xxxxx four bytes.  The remaining bits
 * of the first byte and every following byte each hold a base-255 digit plus
 * one, most significant digit first.
 * Returns the offset.
 */
    static int
bytes2offset(pp)
    char_u      **pp;
{
    char_u      *p = *pp;
    int         c = *p++;
    int         follow;     /* number of bytes after the first one */
    int         nr;

    if ((c & 0x80) == 0x00)             /* 1 byte */
    {
        nr = c - 1;
        follow = 0;
    }
    else if ((c & 0xc0) == 0x80)        /* 2 bytes */
    {
        nr = (c & 0x3f) - 1;
        follow = 1;
    }
    else if ((c & 0xe0) == 0xc0)        /* 3 bytes */
    {
        nr = (c & 0x1f) - 1;
        follow = 2;
    }
    else                                /* 4 bytes */
    {
        /* offset2bytes() stores "0xe0 + digit" here, thus the digit uses
         * the low five bits.  Masking with 0x0f would drop the top bit of
         * the digit and decode first bytes 0xf0..0xff wrongly. */
        nr = (c & 0x1f) - 1;
        follow = 3;
    }

    while (follow-- > 0)
        nr = nr * 255 + (*p++ - 1);

    *pp = p;
    return nr;
}
8227
8228 /*
8229 * Write the .sug file in "fname".
8230 */
8231 static void
8232 sug_write(spin, fname)
8233 spellinfo_T *spin;
8234 char_u *fname;
8235 {
8236 FILE *fd;
8237 wordnode_T *tree;
8238 int nodecount;
8239 int wcount;
8240 char_u *line;
8241 linenr_T lnum;
8242 int len;
8243
8244 /* Create the file. Note that an existing file is silently overwritten! */
8245 fd = mch_fopen((char *)fname, "w");
8246 if (fd == NULL)
8247 {
8248 EMSG2(_(e_notopen), fname);
8249 return;
8250 }
8251
8252 vim_snprintf((char *)IObuff, IOSIZE,
8253 _("Writing suggestion file %s ..."), fname);
8254 spell_message(spin, IObuff);
8255
8256 /*
8257 * <SUGHEADER>: <fileID> <versionnr> <timestamp>
8258 */
8259 if (fwrite(VIMSUGMAGIC, VIMSUGMAGICL, (size_t)1, fd) != 1) /* <fileID> */
8260 {
8261 EMSG(_(e_write));
8262 goto theend;
8263 }
8264 putc(VIMSUGVERSION, fd); /* <versionnr> */
8265
8266 /* Write si_sugtime to the file. */
8267 put_sugtime(spin, fd); /* <timestamp> */
8268
8269 /*
8270 * <SUGWORDTREE>
8271 */
8272 spin->si_memtot = 0;
8273 tree = spin->si_foldroot->wn_sibling;
8274
8275 /* Clear the index and wnode fields in the tree. */
8276 clear_node(tree);
8277
8278 /* Count the number of nodes. Needed to be able to allocate the
8279 * memory when reading the nodes. Also fills in index for shared
8280 * nodes. */
8281 nodecount = put_node(NULL, tree, 0, 0, FALSE);
8282
8283 /* number of nodes in 4 bytes */
8284 put_bytes(fd, (long_u)nodecount, 4); /* <nodecount> */
8285 spin->si_memtot += nodecount + nodecount * sizeof(int);
8286
8287 /* Write the nodes. */
8288 (void)put_node(fd, tree, 0, 0, FALSE);
8289
8290 /*
8291 * <SUGTABLE>: <sugwcount> <sugline> ...
8292 */
8293 wcount = spin->si_spellbuf->b_ml.ml_line_count;
8294 put_bytes(fd, (long_u)wcount, 4); /* <sugwcount> */
8295
8296 for (lnum = 1; lnum <= (linenr_T)wcount; ++lnum)
8297 {
8298 /* <sugline>: <sugnr> ... NUL */
8299 line = ml_get_buf(spin->si_spellbuf, lnum, FALSE);
8300 len = STRLEN(line) + 1;
8301 if (fwrite(line, (size_t)len, (size_t)1, fd) == 0)
8302 {
8303 EMSG(_(e_write));
8304 goto theend;
8305 }
8306 spin->si_memtot += len;
8307 }
8308
8309 /* Write another byte to check for errors. */
8310 if (putc(0, fd) == EOF)
8311 EMSG(_(e_write));
8312
8313 vim_snprintf((char *)IObuff, IOSIZE,
8314 _("Estimated runtime memory use: %d bytes"), spin->si_memtot);
8315 spell_message(spin, IObuff);
8316
8317 theend:
8318 /* close the file */
8319 fclose(fd);
8320 }
8321
8322 /*
8323 * Open a spell buffer. This is a nameless buffer that is not in the buffer
8324 * list and only contains text lines. Can use a swapfile to reduce memory
8325 * use.
8326 * Most other fields are invalid! Esp. watch out for string options being
8327 * NULL and there is no undo info.
8328 * Returns NULL when out of memory.
8329 */
8330 static buf_T *
8331 open_spellbuf()
8332 {
8333 buf_T *buf;
8334
8335 buf = (buf_T *)alloc_clear(sizeof(buf_T));
8336 if (buf != NULL)
8337 {
8338 buf->b_spell = TRUE;
8339 buf->b_p_swf = TRUE; /* may create a swap file */
8340 ml_open(buf);
8341 ml_open_file(buf); /* create swap file now */
8342 }
8343 return buf;
8344 }
8345
8346 /*
8347 * Close the buffer used for spell info.
8348 */
8349 static void
8350 close_spellbuf(buf)
8351 buf_T *buf;
8352 {
8353 if (buf != NULL)
8354 {
8355 ml_close(buf, TRUE);
8356 vim_free(buf);
8357 }
8358 }
8359
8360
8361 /*
7430 * Create a Vim spell file from one or more word lists. 8362 * Create a Vim spell file from one or more word lists.
7431 * "fnames[0]" is the output file name. 8363 * "fnames[0]" is the output file name.
7432 * "fnames[fcount - 1]" is the last input file name. 8364 * "fnames[fcount - 1]" is the last input file name.
7433 * Exception: when "fnames[0]" ends in ".add" it's used as the input file name 8365 * Exception: when "fnames[0]" ends in ".add" it's used as the input file name
7434 * and ".spl" is appended to make the output file name. 8366 * and ".spl" is appended to make the output file name.
7456 spin.si_verbose = !added_word; 8388 spin.si_verbose = !added_word;
7457 spin.si_ascii = ascii; 8389 spin.si_ascii = ascii;
7458 spin.si_followup = TRUE; 8390 spin.si_followup = TRUE;
7459 spin.si_rem_accents = TRUE; 8391 spin.si_rem_accents = TRUE;
7460 ga_init2(&spin.si_rep, (int)sizeof(fromto_T), 20); 8392 ga_init2(&spin.si_rep, (int)sizeof(fromto_T), 20);
8393 ga_init2(&spin.si_repsal, (int)sizeof(fromto_T), 20);
7461 ga_init2(&spin.si_sal, (int)sizeof(fromto_T), 20); 8394 ga_init2(&spin.si_sal, (int)sizeof(fromto_T), 20);
7462 ga_init2(&spin.si_map, (int)sizeof(char_u), 100); 8395 ga_init2(&spin.si_map, (int)sizeof(char_u), 100);
7463 ga_init2(&spin.si_prefcond, (int)sizeof(char_u *), 50); 8396 ga_init2(&spin.si_prefcond, (int)sizeof(char_u *), 50);
8397 hash_init(&spin.si_commonwords);
7464 spin.si_newcompID = 127; /* start compound ID at first maximum */ 8398 spin.si_newcompID = 127; /* start compound ID at first maximum */
7465 8399
7466 /* default: fnames[0] is output file, following are input files */ 8400 /* default: fnames[0] is output file, following are input files */
7467 innames = &fnames[1]; 8401 innames = &fnames[1];
7468 incount = fcount - 1; 8402 incount = fcount - 1;
7611 } 8545 }
7612 8546
7613 if (spin.si_compflags != NULL && spin.si_nobreak) 8547 if (spin.si_compflags != NULL && spin.si_nobreak)
7614 MSG(_("Warning: both compounding and NOBREAK specified")); 8548 MSG(_("Warning: both compounding and NOBREAK specified"));
7615 8549
7616 if (!error) 8550 if (!error && !got_int)
7617 { 8551 {
7618 /* 8552 /*
7619 * Combine tails in the tree. 8553 * Combine tails in the tree.
7620 */ 8554 */
7621 if (spin.si_verbose || p_verbose > 2) 8555 spell_message(&spin, (char_u *)_(msg_compressing));
7622 {
7623 if (!spin.si_verbose)
7624 verbose_enter();
7625 MSG(_(msg_compressing));
7626 out_flush();
7627 if (!spin.si_verbose)
7628 verbose_leave();
7629 }
7630 wordtree_compress(&spin, spin.si_foldroot); 8556 wordtree_compress(&spin, spin.si_foldroot);
7631 wordtree_compress(&spin, spin.si_keeproot); 8557 wordtree_compress(&spin, spin.si_keeproot);
7632 wordtree_compress(&spin, spin.si_prefroot); 8558 wordtree_compress(&spin, spin.si_prefroot);
7633 } 8559 }
7634 8560
7635 if (!error) 8561 if (!error && !got_int)
7636 { 8562 {
7637 /* 8563 /*
7638 * Write the info in the spell file. 8564 * Write the info in the spell file.
7639 */ 8565 */
7640 if (spin.si_verbose || p_verbose > 2) 8566 vim_snprintf((char *)IObuff, IOSIZE,
7641 { 8567 _("Writing spell file %s ..."), wfname);
7642 if (!spin.si_verbose) 8568 spell_message(&spin, IObuff);
7643 verbose_enter();
7644 smsg((char_u *)_("Writing spell file %s ..."), wfname);
7645 out_flush();
7646 if (!spin.si_verbose)
7647 verbose_leave();
7648 }
7649 8569
7650 error = write_vim_spell(&spin, wfname) == FAIL; 8570 error = write_vim_spell(&spin, wfname) == FAIL;
7651 8571
7652 if (spin.si_verbose || p_verbose > 2) 8572 spell_message(&spin, (char_u *)_("Done!"));
7653 { 8573 vim_snprintf((char *)IObuff, IOSIZE,
7654 if (!spin.si_verbose) 8574 _("Estimated runtime memory use: %d bytes"), spin.si_memtot);
7655 verbose_enter(); 8575 spell_message(&spin, IObuff);
7656 MSG(_("Done!")); 8576
7657 smsg((char_u *)_("Estimated runtime memory use: %d bytes"), 8577 /*
7658 spin.si_memtot); 8578 * If the file is loaded need to reload it.
7659 out_flush(); 8579 */
7660 if (!spin.si_verbose)
7661 verbose_leave();
7662 }
7663
7664 /* If the file is loaded need to reload it. */
7665 if (!error) 8580 if (!error)
7666 spell_reload_one(wfname, added_word); 8581 spell_reload_one(wfname, added_word);
7667 } 8582 }
7668 8583
7669 /* Free the allocated memory. */ 8584 /* Free the allocated memory. */
7670 ga_clear(&spin.si_rep); 8585 ga_clear(&spin.si_rep);
8586 ga_clear(&spin.si_repsal);
7671 ga_clear(&spin.si_sal); 8587 ga_clear(&spin.si_sal);
7672 ga_clear(&spin.si_map); 8588 ga_clear(&spin.si_map);
7673 ga_clear(&spin.si_prefcond); 8589 ga_clear(&spin.si_prefcond);
8590 hash_clear_all(&spin.si_commonwords, 0);
7674 8591
7675 /* Free the .aff file structures. */ 8592 /* Free the .aff file structures. */
7676 for (i = 0; i < incount; ++i) 8593 for (i = 0; i < incount; ++i)
7677 if (afile[i] != NULL) 8594 if (afile[i] != NULL)
7678 spell_free_aff(afile[i]); 8595 spell_free_aff(afile[i]);
7679 8596
7680 /* Free all the bits and pieces at once. */ 8597 /* Free all the bits and pieces at once. */
7681 free_blocks(spin.si_blocks); 8598 free_blocks(spin.si_blocks);
7682 } 8599
7683 } 8600 /*
7684 8601 * If there is soundfolding info and no NOSUGFILE item create the
8602 * .sug file with the soundfolded word trie.
8603 */
8604 if (spin.si_sugtime != 0 && !error && !got_int)
8605 spell_make_sugfile(&spin, wfname);
8606
8607 }
8608 }
8609
8610 /*
8611 * Display a message for spell file processing when 'verbose' is set or using
8612 * ":mkspell". "str" can be IObuff.
8613 */
8614 static void
8615 spell_message(spin, str)
8616 spellinfo_T *spin;
8617 char_u *str;
8618 {
8619 if (spin->si_verbose || p_verbose > 2)
8620 {
8621 if (!spin->si_verbose)
8622 verbose_enter();
8623 MSG(str);
8624 out_flush();
8625 if (!spin->si_verbose)
8626 verbose_leave();
8627 }
8628 }
7685 8629
7686 /* 8630 /*
7687 * ":[count]spellgood {word}" 8631 * ":[count]spellgood {word}"
7688 * ":[count]spellwrong {word}" 8632 * ":[count]spellwrong {word}"
7689 */ 8633 */
8332 } 9276 }
8333 9277
8334 return OK; 9278 return OK;
8335 } 9279 }
8336 9280
9281 /* values for sps_flags */
8337 #define SPS_BEST 1 9282 #define SPS_BEST 1
8338 #define SPS_FAST 2 9283 #define SPS_FAST 2
8339 #define SPS_DOUBLE 4 9284 #define SPS_DOUBLE 4
8340 9285
8341 static int sps_flags = SPS_BEST; 9286 static int sps_flags = SPS_BEST; /* flags from 'spellsuggest' */
8342 static int sps_limit = 9999; 9287 static int sps_limit = 9999; /* max nr of suggestions given */
8343 9288
8344 /* 9289 /*
8345 * Check the 'spellsuggest' option. Return FAIL if it's wrong. 9290 * Check the 'spellsuggest' option. Return FAIL if it's wrong.
8346 * Sets "sps_flags" and "sps_limit". 9291 * Sets "sps_flags" and "sps_limit".
8347 */ 9292 */
8459 if (sps_limit > (int)Rows - 2) 9404 if (sps_limit > (int)Rows - 2)
8460 limit = (int)Rows - 2; 9405 limit = (int)Rows - 2;
8461 else 9406 else
8462 limit = sps_limit; 9407 limit = sps_limit;
8463 spell_find_suggest(line + curwin->w_cursor.col, &sug, limit, 9408 spell_find_suggest(line + curwin->w_cursor.col, &sug, limit,
8464 TRUE, need_cap); 9409 TRUE, need_cap, TRUE);
8465 9410
8466 if (sug.su_ga.ga_len == 0) 9411 if (sug.su_ga.ga_len == 0)
8467 MSG(_("Sorry, no suggestions")); 9412 MSG(_("Sorry, no suggestions"));
8468 else if (count > 0) 9413 else if (count > 0)
8469 { 9414 {
8510 9455
8511 /* The suggested word may replace only part of the bad word, add 9456 /* The suggested word may replace only part of the bad word, add
8512 * the not replaced part. */ 9457 * the not replaced part. */
8513 STRCPY(wcopy, stp->st_word); 9458 STRCPY(wcopy, stp->st_word);
8514 if (sug.su_badlen > stp->st_orglen) 9459 if (sug.su_badlen > stp->st_orglen)
8515 vim_strncpy(wcopy + STRLEN(wcopy), 9460 vim_strncpy(wcopy + stp->st_wordlen,
8516 sug.su_badptr + stp->st_orglen, 9461 sug.su_badptr + stp->st_orglen,
8517 sug.su_badlen - stp->st_orglen); 9462 sug.su_badlen - stp->st_orglen);
8518 vim_snprintf((char *)IObuff, IOSIZE, "%2d", i + 1); 9463 vim_snprintf((char *)IObuff, IOSIZE, "%2d", i + 1);
8519 #ifdef FEAT_RIGHTLEFT 9464 #ifdef FEAT_RIGHTLEFT
8520 if (cmdmsg_rl) 9465 if (cmdmsg_rl)
8584 repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen); 9529 repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen);
8585 repl_to = vim_strsave(stp->st_word); 9530 repl_to = vim_strsave(stp->st_word);
8586 } 9531 }
8587 9532
8588 /* Replace the word. */ 9533 /* Replace the word. */
8589 p = alloc(STRLEN(line) - stp->st_orglen + STRLEN(stp->st_word) + 1); 9534 p = alloc(STRLEN(line) - stp->st_orglen + stp->st_wordlen + 1);
8590 if (p != NULL) 9535 if (p != NULL)
8591 { 9536 {
8592 c = sug.su_badptr - line; 9537 c = sug.su_badptr - line;
8593 mch_memmove(p, line, c); 9538 mch_memmove(p, line, c);
8594 STRCPY(p + c, stp->st_word); 9539 STRCPY(p + c, stp->st_word);
8599 9544
8600 /* For redo we use a change-word command. */ 9545 /* For redo we use a change-word command. */
8601 ResetRedobuff(); 9546 ResetRedobuff();
8602 AppendToRedobuff((char_u *)"ciw"); 9547 AppendToRedobuff((char_u *)"ciw");
8603 AppendToRedobuffLit(p + c, 9548 AppendToRedobuffLit(p + c,
8604 STRLEN(stp->st_word) + sug.su_badlen - stp->st_orglen); 9549 stp->st_wordlen + sug.su_badlen - stp->st_orglen);
8605 AppendCharToRedobuff(ESC); 9550 AppendCharToRedobuff(ESC);
8606 } 9551 }
8607 } 9552 }
8608 else 9553 else
8609 curwin->w_cursor = prev_cursor; 9554 curwin->w_cursor = prev_cursor;
8757 /* 9702 /*
8758 * Find spell suggestions for "word". Return them in the growarray "*gap" as 9703 * Find spell suggestions for "word". Return them in the growarray "*gap" as
8759 * a list of allocated strings. 9704 * a list of allocated strings.
8760 */ 9705 */
8761 void 9706 void
8762 spell_suggest_list(gap, word, maxcount, need_cap) 9707 spell_suggest_list(gap, word, maxcount, need_cap, interactive)
8763 garray_T *gap; 9708 garray_T *gap;
8764 char_u *word; 9709 char_u *word;
8765 int maxcount; /* maximum nr of suggestions */ 9710 int maxcount; /* maximum nr of suggestions */
8766 int need_cap; /* 'spellcapcheck' matched */ 9711 int need_cap; /* 'spellcapcheck' matched */
9712 int interactive;
8767 { 9713 {
8768 suginfo_T sug; 9714 suginfo_T sug;
8769 int i; 9715 int i;
8770 suggest_T *stp; 9716 suggest_T *stp;
8771 char_u *wcopy; 9717 char_u *wcopy;
8772 9718
8773 spell_find_suggest(word, &sug, maxcount, FALSE, need_cap); 9719 spell_find_suggest(word, &sug, maxcount, FALSE, need_cap, interactive);
8774 9720
8775 /* Make room in "gap". */ 9721 /* Make room in "gap". */
8776 ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1); 9722 ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1);
8777 if (ga_grow(gap, sug.su_ga.ga_len) == FAIL) 9723 if (ga_grow(gap, sug.su_ga.ga_len) == FAIL)
8778 return; 9724 return;
8781 { 9727 {
8782 stp = &SUG(sug.su_ga, i); 9728 stp = &SUG(sug.su_ga, i);
8783 9729
8784 /* The suggested word may replace only part of "word", add the not 9730 /* The suggested word may replace only part of "word", add the not
8785 * replaced part. */ 9731 * replaced part. */
8786 wcopy = alloc(STRLEN(stp->st_word) 9732 wcopy = alloc(stp->st_wordlen
8787 + STRLEN(sug.su_badptr + stp->st_orglen) + 1); 9733 + STRLEN(sug.su_badptr + stp->st_orglen) + 1);
8788 if (wcopy == NULL) 9734 if (wcopy == NULL)
8789 break; 9735 break;
8790 STRCPY(wcopy, stp->st_word); 9736 STRCPY(wcopy, stp->st_word);
8791 STRCAT(wcopy, sug.su_badptr + stp->st_orglen); 9737 STRCPY(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen);
8792 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy; 9738 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy;
8793 } 9739 }
8794 9740
8795 spell_find_cleanup(&sug); 9741 spell_find_cleanup(&sug);
8796 } 9742 }
8801 * The maximum number of suggestions is "maxcount". 9747 * The maximum number of suggestions is "maxcount".
8802 * Note: does use info for the current window. 9748 * Note: does use info for the current window.
8803 * This is based on the mechanisms of Aspell, but completely reimplemented. 9749 * This is based on the mechanisms of Aspell, but completely reimplemented.
8804 */ 9750 */
8805 static void 9751 static void
8806 spell_find_suggest(badptr, su, maxcount, banbadword, need_cap) 9752 spell_find_suggest(badptr, su, maxcount, banbadword, need_cap, interactive)
8807 char_u *badptr; 9753 char_u *badptr;
8808 suginfo_T *su; 9754 suginfo_T *su;
8809 int maxcount; 9755 int maxcount;
8810 int banbadword; /* don't include badword in suggestions */ 9756 int banbadword; /* don't include badword in suggestions */
8811 int need_cap; /* word should start with capital */ 9757 int need_cap; /* word should start with capital */
9758 int interactive;
8812 { 9759 {
8813 hlf_T attr = HLF_COUNT; 9760 hlf_T attr = HLF_COUNT;
8814 char_u buf[MAXPATHL]; 9761 char_u buf[MAXPATHL];
8815 char_u *p; 9762 char_u *p;
8816 int do_combine = FALSE; 9763 int do_combine = FALSE;
8831 if (*badptr == NUL) 9778 if (*badptr == NUL)
8832 return; 9779 return;
8833 hash_init(&su->su_banned); 9780 hash_init(&su->su_banned);
8834 9781
8835 su->su_badptr = badptr; 9782 su->su_badptr = badptr;
8836 su->su_badlen = spell_check(curwin, su->su_badptr, &attr, NULL); 9783 su->su_badlen = spell_check(curwin, su->su_badptr, &attr, NULL, FALSE);
8837 su->su_maxcount = maxcount; 9784 su->su_maxcount = maxcount;
8838 su->su_maxscore = SCORE_MAXINIT; 9785 su->su_maxscore = SCORE_MAXINIT;
8839 9786
8840 if (su->su_badlen >= MAXWLEN) 9787 if (su->su_badlen >= MAXWLEN)
8841 su->su_badlen = MAXWLEN - 1; /* just in case */ 9788 su->su_badlen = MAXWLEN - 1; /* just in case */
8874 c = PTR2CHAR(su->su_badptr); 9821 c = PTR2CHAR(su->su_badptr);
8875 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT) 9822 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT)
8876 { 9823 {
8877 make_case_word(su->su_badword, buf, WF_ONECAP); 9824 make_case_word(su->su_badword, buf, WF_ONECAP);
8878 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE, 9825 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE,
8879 0, TRUE, su->su_sallang); 9826 0, TRUE, su->su_sallang, FALSE);
8880 } 9827 }
8881 9828
8882 /* Ban the bad word itself. It may appear in another region. */ 9829 /* Ban the bad word itself. It may appear in another region. */
8883 if (banbadword) 9830 if (banbadword)
8884 add_banned(su, su->su_badword); 9831 add_banned(su, su->su_badword);
8910 /* Use list of suggestions in a file. */ 9857 /* Use list of suggestions in a file. */
8911 spell_suggest_file(su, buf + 5); 9858 spell_suggest_file(su, buf + 5);
8912 else 9859 else
8913 { 9860 {
8914 /* Use internal method. */ 9861 /* Use internal method. */
8915 spell_suggest_intern(su); 9862 spell_suggest_intern(su, interactive);
8916 if (sps_flags & SPS_DOUBLE) 9863 if (sps_flags & SPS_DOUBLE)
8917 do_combine = TRUE; 9864 do_combine = TRUE;
8918 } 9865 }
8919 } 9866 }
8920 9867
8950 for (li = list->lv_first; li != NULL; li = li->li_next) 9897 for (li = list->lv_first; li != NULL; li = li->li_next)
8951 if (li->li_tv.v_type == VAR_LIST) 9898 if (li->li_tv.v_type == VAR_LIST)
8952 { 9899 {
8953 /* Get the word and the score from the items. */ 9900 /* Get the word and the score from the items. */
8954 score = get_spellword(li->li_tv.vval.v_list, &p); 9901 score = get_spellword(li->li_tv.vval.v_list, &p);
8955 if (score >= 0) 9902 if (score >= 0 && score <= su->su_maxscore)
8956 add_suggestion(su, &su->su_ga, p, 9903 add_suggestion(su, &su->su_ga, p, su->su_badlen,
8957 su->su_badlen, score, 0, TRUE, su->su_sallang); 9904 score, 0, TRUE, su->su_sallang, FALSE);
8958 } 9905 }
8959 list_unref(list); 9906 list_unref(list);
8960 } 9907 }
8961 9908
8962 /* Sort the suggestions and truncate at "maxcount". */ 9909 /* Remove bogus suggestions, sort and truncate at "maxcount". */
9910 check_suggestions(su, &su->su_ga);
8963 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 9911 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
8964 } 9912 }
8965 #endif 9913 #endif
8966 9914
8967 /* 9915 /*
9009 make_case_word(p, cword, su->su_badflags); 9957 make_case_word(p, cword, su->su_badflags);
9010 p = cword; 9958 p = cword;
9011 } 9959 }
9012 9960
9013 add_suggestion(su, &su->su_ga, p, su->su_badlen, 9961 add_suggestion(su, &su->su_ga, p, su->su_badlen,
9014 SCORE_FILE, 0, TRUE, su->su_sallang); 9962 SCORE_FILE, 0, TRUE, su->su_sallang, FALSE);
9015 } 9963 }
9016 } 9964 }
9017 9965
9018 fclose(fd); 9966 fclose(fd);
9019 9967
9020 /* Sort the suggestions and truncate at "maxcount". */ 9968 /* Remove bogus suggestions, sort and truncate at "maxcount". */
9969 check_suggestions(su, &su->su_ga);
9021 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 9970 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
9022 } 9971 }
9023 9972
9024 /* 9973 /*
9025 * Find suggestions for the internal method indicated by "sps_flags". 9974 * Find suggestions for the internal method indicated by "sps_flags".
9026 */ 9975 */
9027 static void 9976 static void
9028 spell_suggest_intern(su) 9977 spell_suggest_intern(su, interactive)
9029 suginfo_T *su; 9978 suginfo_T *su;
9030 { 9979 int interactive;
9980 {
9981 /*
9982 * Load the .sug file(s) that are available and not done yet.
9983 */
9984 suggest_load_files();
9985
9031 /* 9986 /*
9032 * 1. Try special cases, such as repeating a word: "the the" -> "the". 9987 * 1. Try special cases, such as repeating a word: "the the" -> "the".
9033 * 9988 *
9034 * Set a maximum score to limit the combination of operations that is 9989 * Set a maximum score to limit the combination of operations that is
9035 * tried. 9990 * tried.
9046 if (sps_flags & SPS_DOUBLE) 10001 if (sps_flags & SPS_DOUBLE)
9047 score_comp_sal(su); 10002 score_comp_sal(su);
9048 10003
9049 /* 10004 /*
9050 * 3. Try finding sound-a-like words. 10005 * 3. Try finding sound-a-like words.
9051 *
9052 * Only do this when we don't have a lot of suggestions yet, because it's
9053 * very slow and often doesn't find new suggestions.
9054 */ 10006 */
9055 if ((sps_flags & SPS_DOUBLE) 10007 if ((sps_flags & SPS_FAST) == 0)
9056 || (!(sps_flags & SPS_FAST) 10008 {
9057 && su->su_ga.ga_len < SUG_CLEAN_COUNT(su))) 10009 if (sps_flags & SPS_BEST)
9058 { 10010 /* Adjust the word score for the suggestions found so far for how
9059 /* Allow a higher score now. */ 10011 * they sounds like. */
9060 su->su_maxscore = SCORE_MAXMAX; 10012 rescore_suggestions(su);
10013
10014 /*
10015 * While going throught the soundfold tree "su_maxscore" is the score
10016 * for the soundfold word, limits the changes that are being tried,
10017 * and "su_sfmaxscore" the rescored score, which is set by
10018 * cleanup_suggestions().
10019 * First find words with a small edit distance, because this is much
10020 * faster and often already finds the top-N suggestions. If we didn't
10021 * find many suggestions try again with a higher edit distance.
10022 * "sl_sounddone" is used to avoid doing the same word twice.
10023 */
10024 suggest_try_soundalike_prep();
10025 su->su_maxscore = SCORE_SFMAX1;
10026 su->su_sfmaxscore = SCORE_MAXINIT * 3;
9061 suggest_try_soundalike(su); 10027 suggest_try_soundalike(su);
9062 } 10028 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su))
9063 10029 {
9064 /* When CTRL-C was hit while searching do show the results. */ 10030 /* We didn't find enough matches, try again, allowing more
10031 * changes to the soundfold word. */
10032 su->su_maxscore = SCORE_SFMAX2;
10033 suggest_try_soundalike(su);
10034 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su))
10035 {
10036 /* Still didn't find enough matches, try again, allowing even
10037 * more changes to the soundfold word. */
10038 su->su_maxscore = SCORE_SFMAX3;
10039 suggest_try_soundalike(su);
10040 }
10041 }
10042 su->su_maxscore = su->su_sfmaxscore;
10043 suggest_try_soundalike_finish();
10044 }
10045
10046 /* When CTRL-C was hit while searching do show the results. Only clear
10047 * got_int when using a command, not for spellsuggest(). */
9065 ui_breakcheck(); 10048 ui_breakcheck();
9066 if (got_int) 10049 if (interactive && got_int)
9067 { 10050 {
9068 (void)vgetc(); 10051 (void)vgetc();
9069 got_int = FALSE; 10052 got_int = FALSE;
9070 } 10053 }
9071 10054
9073 { 10056 {
9074 if (sps_flags & SPS_BEST) 10057 if (sps_flags & SPS_BEST)
9075 /* Adjust the word score for how it sounds like. */ 10058 /* Adjust the word score for how it sounds like. */
9076 rescore_suggestions(su); 10059 rescore_suggestions(su);
9077 10060
9078 /* Sort the suggestions and truncate at "maxcount". */ 10061 /* Remove bogus suggestions, sort and truncate at "maxcount". */
10062 check_suggestions(su, &su->su_ga);
9079 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 10063 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
10064 }
10065 }
10066
10067 /*
10068 * Load the .sug files for languages that have one and weren't loaded yet.
10069 */
10070 static void
10071 suggest_load_files()
10072 {
10073 langp_T *lp;
10074 int lpi;
10075 slang_T *slang;
10076 char_u *dotp;
10077 FILE *fd;
10078 char_u buf[MAXWLEN];
10079 int i;
10080 time_t timestamp;
10081 int wcount;
10082 int wordnr;
10083 garray_T ga;
10084 int c;
10085
10086 /* Do this for all languages that support sound folding. */
10087 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi)
10088 {
10089 lp = LANGP_ENTRY(curbuf->b_langp, lpi);
10090 slang = lp->lp_slang;
10091 if (slang->sl_sugtime != 0 && !slang->sl_sugloaded)
10092 {
10093 /* Change ".spl" to ".sug" and open the file. When the file isn't
10094 * found silently skip it. Do set "sl_sugloaded" so that we
10095 * don't try again and again. */
10096 slang->sl_sugloaded = TRUE;
10097
10098 dotp = vim_strrchr(slang->sl_fname, '.');
10099 if (dotp == NULL || fnamecmp(dotp, ".spl") != 0)
10100 continue;
10101 STRCPY(dotp, ".sug");
10102 fd = fopen((char *)slang->sl_fname, "r");
10103 if (fd == NULL)
10104 goto nextone;
10105
10106 /*
10107 * <SUGHEADER>: <fileID> <versionnr> <timestamp>
10108 */
10109 for (i = 0; i < VIMSUGMAGICL; ++i)
10110 buf[i] = getc(fd); /* <fileID> */
10111 if (STRNCMP(buf, VIMSUGMAGIC, VIMSUGMAGICL) != 0)
10112 {
10113 EMSG2(_("E999: This does not look like a .sug file: %s"),
10114 slang->sl_fname);
10115 goto nextone;
10116 }
10117 c = getc(fd); /* <versionnr> */
10118 if (c < VIMSUGVERSION)
10119 {
10120 EMSG2(_("E999: Old .sug file, needs to be updated: %s"),
10121 slang->sl_fname);
10122 goto nextone;
10123 }
10124 else if (c > VIMSUGVERSION)
10125 {
10126 EMSG2(_("E999: .sug file is for newer version of Vim: %s"),
10127 slang->sl_fname);
10128 goto nextone;
10129 }
10130
10131 /* Check the timestamp, it must be exactly the same as the one in
10132 * the .spl file. Otherwise the word numbers won't match. */
10133 timestamp = 0;
10134 for (i = 7; i >= 0; --i) /* <timestamp> */
10135 timestamp += getc(fd) << (i * 8);
10136 if (timestamp != slang->sl_sugtime)
10137 {
10138 EMSG2(_("E999: .sug file doesn't match .spl file: %s"),
10139 slang->sl_fname);
10140 goto nextone;
10141 }
10142
10143 /*
10144 * <SUGWORDTREE>: <wordtree>
10145 * Read the trie with the soundfolded words.
10146 */
10147 if (spell_read_tree(fd, &slang->sl_sbyts, &slang->sl_sidxs,
10148 FALSE, 0) != 0)
10149 {
10150 someerror:
10151 EMSG2(_("E999: error while reading .sug file: %s"),
10152 slang->sl_fname);
10153 slang_clear_sug(slang);
10154 goto nextone;
10155 }
10156
10157 /*
10158 * <SUGTABLE>: <sugwcount> <sugline> ...
10159 *
10160 * Read the table with word numbers. We use a file buffer for
10161 * this, because it's so much like a file with lines. Makes it
10162 * possible to swap the info and save on memory use.
10163 */
10164 slang->sl_sugbuf = open_spellbuf();
10165 if (slang->sl_sugbuf == NULL)
10166 goto someerror;
10167 /* <sugwcount> */
10168 wcount = (getc(fd) << 24) + (getc(fd) << 16) + (getc(fd) << 8)
10169 + getc(fd);
10170 if (wcount < 0)
10171 goto someerror;
10172
10173 /* Read all the wordnr lists into the buffer, one NUL terminated
10174 * list per line. */
10175 ga_init2(&ga, 1, 100);
10176 for (wordnr = 0; wordnr < wcount; ++wordnr)
10177 {
10178 ga.ga_len = 0;
10179 for (;;)
10180 {
10181 c = getc(fd); /* <sugline> */
10182 if (c < 0 || ga_grow(&ga, 1) == FAIL)
10183 goto someerror;
10184 ((char_u *)ga.ga_data)[ga.ga_len++] = c;
10185 if (c == NUL)
10186 break;
10187 }
10188 if (ml_append_buf(slang->sl_sugbuf, (linenr_T)wordnr,
10189 ga.ga_data, ga.ga_len, TRUE) == FAIL)
10190 goto someerror;
10191 }
10192 ga_clear(&ga);
10193
10194 /*
10195 * Need to put word counts in the word tries, so that we can find
10196 * a word by its number.
10197 */
10198 tree_count_words(slang->sl_fbyts, slang->sl_fidxs);
10199 tree_count_words(slang->sl_sbyts, slang->sl_sidxs);
10200
10201 nextone:
10202 if (fd != NULL)
10203 fclose(fd);
10204 STRCPY(dotp, ".spl");
10205 }
10206 }
10207 }
10208
10209
10210 /*
10211 * Fill in the wordcount fields for a trie.
10212 * Returns the total number of words.
10213 */
10214 static void
10215 tree_count_words(byts, idxs)
10216 char_u *byts;
10217 idx_T *idxs;
10218 {
10219 int depth;
10220 idx_T arridx[MAXWLEN];
10221 int curi[MAXWLEN];
10222 int c;
10223 idx_T n;
10224 int wordcount[MAXWLEN];
10225
10226 arridx[0] = 0;
10227 curi[0] = 1;
10228 wordcount[0] = 0;
10229 depth = 0;
10230 while (depth >= 0 && !got_int)
10231 {
10232 if (curi[depth] > byts[arridx[depth]])
10233 {
10234 /* Done all bytes at this node, go up one level. */
10235 idxs[arridx[depth]] = wordcount[depth];
10236 if (depth > 0)
10237 wordcount[depth - 1] += wordcount[depth];
10238
10239 --depth;
10240 fast_breakcheck();
10241 }
10242 else
10243 {
10244 /* Do one more byte at this node. */
10245 n = arridx[depth] + curi[depth];
10246 ++curi[depth];
10247
10248 c = byts[n];
10249 if (c == 0)
10250 {
10251 /* End of word, count it. */
10252 ++wordcount[depth];
10253
10254 /* Skip over any other NUL bytes (same word with different
10255 * flags). */
10256 while (byts[n + 1] == 0)
10257 {
10258 ++n;
10259 ++curi[depth];
10260 }
10261 }
10262 else
10263 {
10264 /* Normal char, go one level deeper to count the words. */
10265 ++depth;
10266 arridx[depth] = idxs[n];
10267 curi[depth] = 1;
10268 wordcount[depth] = 0;
10269 }
10270 }
9080 } 10271 }
9081 } 10272 }
9082 10273
9083 /* 10274 /*
9084 * Free the info put in "*su" by spell_find_suggest(). 10275 * Free the info put in "*su" by spell_find_suggest().
9096 for (i = 0; i < su->su_sga.ga_len; ++i) 10287 for (i = 0; i < su->su_sga.ga_len; ++i)
9097 vim_free(SUG(su->su_sga, i).st_word); 10288 vim_free(SUG(su->su_sga, i).st_word);
9098 ga_clear(&su->su_sga); 10289 ga_clear(&su->su_sga);
9099 10290
9100 /* Free the banned words. */ 10291 /* Free the banned words. */
9101 free_banned(su); 10292 hash_clear_all(&su->su_banned, 0);
9102 } 10293 }
9103 10294
9104 /* 10295 /*
9105 * Make a copy of "word", with the first letter upper or lower cased, to 10296 * Make a copy of "word", with the first letter upper or lower cased, to
9106 * "wcopy[MAXWLEN]". "word" must not be empty. 10297 * "wcopy[MAXWLEN]". "word" must not be empty.
9222 su->su_fbadword[len] = c; 10413 su->su_fbadword[len] = c;
9223 10414
9224 /* Give a soundalike score of 0, compute the score as if deleting one 10415 /* Give a soundalike score of 0, compute the score as if deleting one
9225 * character. */ 10416 * character. */
9226 add_suggestion(su, &su->su_ga, word, su->su_badlen, 10417 add_suggestion(su, &su->su_ga, word, su->su_badlen,
9227 RESCORE(SCORE_REP, 0), 0, TRUE, su->su_sallang); 10418 RESCORE(SCORE_REP, 0), 0, TRUE, su->su_sallang, FALSE);
9228 } 10419 }
9229 } 10420 }
10421
10422 /*
10423 * Try finding suggestions by adding/removing/swapping letters.
10424 */
10425 static void
10426 suggest_try_change(su)
10427 suginfo_T *su;
10428 {
10429 char_u fword[MAXWLEN]; /* copy of the bad word, case-folded */
10430 int n;
10431 char_u *p;
10432 int lpi;
10433 langp_T *lp;
10434
10435 /* We make a copy of the case-folded bad word, so that we can modify it
10436 * to find matches (esp. REP items). Append some more text, changing
10437 * chars after the bad word may help. */
10438 STRCPY(fword, su->su_fbadword);
10439 n = STRLEN(fword);
10440 p = su->su_badptr + su->su_badlen;
10441 (void)spell_casefold(p, STRLEN(p), fword + n, MAXWLEN - n);
10442
10443 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi)
10444 {
10445 lp = LANGP_ENTRY(curbuf->b_langp, lpi);
10446
10447 /* If reloading a spell file fails it's still in the list but
10448 * everything has been cleared. */
10449 if (lp->lp_slang->sl_fbyts == NULL)
10450 continue;
10451
10452 /* Try it for this language. Will add possible suggestions. */
10453 suggest_trie_walk(su, lp, fword, FALSE);
10454 }
10455 }
10456
/*
 * Check the maximum score, if we go over it we won't try this change.
 * Evaluates to non-zero when the score at "stack[depth]" plus "add" is still
 * below "su->su_maxscore".
 * All arguments are parenthesized, so that expressions such as "&info" or
 * "base + 1" can be passed safely.
 */
#define TRY_DEEPER(su, stack, depth, add) \
	((stack)[(depth)].ts_score + (add) < (su)->su_maxscore)
9230 10460
9231 /* 10461 /*
9232 * Try finding suggestions by adding/removing/swapping letters. 10462 * Try finding suggestions by adding/removing/swapping letters.
9233 * 10463 *
9234 * This uses a state machine. At each node in the tree we try various 10464 * This uses a state machine. At each node in the tree we try various
9235 * operations. When trying if an operation work "depth" is increased and the 10465 * operations. When trying if an operation works "depth" is increased and the
9236 * stack[] is used to store info. This allows combinations, thus insert one 10466 * stack[] is used to store info. This allows combinations, thus insert one
9237 * character, replace one and delete another. The number of changes is 10467 * character, replace one and delete another. The number of changes is
9238 * limited by su->su_maxscore, checked in try_deeper(). 10468 * limited by su->su_maxscore.
9239 * 10469 *
9240 * After implementing this I noticed an article by Kemal Oflazer that 10470 * After implementing this I noticed an article by Kemal Oflazer that
9241 * describes something similar: "Error-tolerant Finite State Recognition with 10471 * describes something similar: "Error-tolerant Finite State Recognition with
9242 * Applications to Morphological Analysis and Spelling Correction" (1996). 10472 * Applications to Morphological Analysis and Spelling Correction" (1996).
9243 * The implementation in the article is simplified and requires a stack of 10473 * The implementation in the article is simplified and requires a stack of
9244 * unknown depth. The implementation here only needs a stack depth of the 10474 * unknown depth. The implementation here only needs a stack depth equal to
9245 * length of the word. 10475 * the length of the word.
10476 *
10477 * This is also used for the sound-folded word, "soundfold" is TRUE then.
10478 * The mechanism is the same, but we find a match with a sound-folded word
10479 * that comes from one or more original words. Each of these words may be
10480 * added, this is done by add_sound_suggest().
10481 * Don't use:
10482 * the prefix tree or the keep-case tree
10483 * "su->su_badlen"
10484 * anything to do with upper and lower case
10485 * anything to do with word or non-word characters ("spell_iswordp()")
10486 * banned words
10487 * word flags (rare, region, compounding)
10488 * word splitting for now
10489 * "similar_chars()"
10490 * use "slang->sl_repsal" instead of "lp->lp_replang->sl_rep"
9246 */ 10491 */
9247 static void 10492 static void
9248 suggest_try_change(su) 10493 suggest_trie_walk(su, lp, fword, soundfold)
9249 suginfo_T *su; 10494 suginfo_T *su;
9250 { 10495 langp_T *lp;
9251 char_u fword[MAXWLEN]; /* copy of the bad word, case-folded */ 10496 char_u *fword;
10497 int soundfold;
10498 {
9252 char_u tword[MAXWLEN]; /* good word collected so far */ 10499 char_u tword[MAXWLEN]; /* good word collected so far */
9253 trystate_T stack[MAXWLEN]; 10500 trystate_T stack[MAXWLEN];
9254 char_u preword[MAXWLEN * 3]; /* word found with proper case; 10501 char_u preword[MAXWLEN * 3]; /* word found with proper case;
9255 * concatanation of prefix compound 10502 * concatanation of prefix compound
9256 * words and split word. NUL terminated 10503 * words and split word. NUL terminated
9257 * when going deeper but not when coming 10504 * when going deeper but not when coming
9258 * back. */ 10505 * back. */
9259 char_u compflags[MAXWLEN]; /* compound flags, one for each word */ 10506 char_u compflags[MAXWLEN]; /* compound flags, one for each word */
9260 trystate_T *sp; 10507 trystate_T *sp;
9261 int newscore; 10508 int newscore;
9262 langp_T *lp; 10509 int score;
9263 char_u *byts, *fbyts, *pbyts; 10510 char_u *byts, *fbyts, *pbyts;
9264 idx_T *idxs, *fidxs, *pidxs; 10511 idx_T *idxs, *fidxs, *pidxs;
9265 int depth; 10512 int depth;
9266 int c, c2, c3; 10513 int c, c2, c3;
9267 int n; 10514 int n = 0;
9268 int flags; 10515 int flags;
9269 garray_T *gap; 10516 garray_T *gap;
9270 idx_T arridx; 10517 idx_T arridx;
9271 int len; 10518 int len;
9272 char_u *p; 10519 char_u *p;
9273 fromto_T *ftp; 10520 fromto_T *ftp;
9274 int fl = 0, tl; 10521 int fl = 0, tl;
9275 int repextra = 0; /* extra bytes in fword[] from REP item */ 10522 int repextra = 0; /* extra bytes in fword[] from REP item */
9276 slang_T *slang; 10523 slang_T *slang = lp->lp_slang;
9277 int fword_ends; 10524 int fword_ends;
9278 int lpi;
9279 int maysplit;
9280 int goodword_ends; 10525 int goodword_ends;
9281 10526 #ifdef DEBUG_TRIEWALK
9282 /* We make a copy of the case-folded bad word, so that we can modify it 10527 /* Stores the name of the change made at each level. */
9283 * to find matches (esp. REP items). Append some more text, changing 10528 char_u changename[MAXWLEN][80];
9284 * chars after the bad word may help. */ 10529 #endif
9285 STRCPY(fword, su->su_fbadword); 10530 int breakcheckcount = 1000;
9286 n = STRLEN(fword); 10531 int compound_ok;
9287 p = su->su_badptr + su->su_badlen; 10532
9288 (void)spell_casefold(p, STRLEN(p), fword + n, MAXWLEN - n); 10533 /*
9289 10534 * Go through the whole case-fold tree, try changes at each node.
9290 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi) 10535 * "tword[]" contains the word collected from nodes in the tree.
9291 { 10536 * "fword[]" the word we are trying to match with (initially the bad
9292 lp = LANGP_ENTRY(curbuf->b_langp, lpi); 10537 * word).
9293 slang = lp->lp_slang; 10538 */
9294 10539 depth = 0;
9295 /* If reloading a spell file fails it's still in the list but 10540 sp = &stack[0];
9296 * everything has been cleared. */ 10541 vim_memset(sp, 0, sizeof(trystate_T));
9297 if (slang->sl_fbyts == NULL) 10542 sp->ts_curi = 1;
9298 continue; 10543
9299 10544 if (soundfold)
9300 /* 10545 {
9301 * Go through the whole case-fold tree, try changes at each node. 10546 /* Going through the soundfold tree. */
9302 * "tword[]" contains the word collected from nodes in the tree. 10547 byts = fbyts = slang->sl_sbyts;
9303 * "fword[]" the word we are trying to match with (initially the bad 10548 idxs = fidxs = slang->sl_sidxs;
9304 * word). 10549 pbyts = NULL;
9305 */ 10550 pidxs = NULL;
9306 depth = 0; 10551 sp->ts_prefixdepth = PFD_NOPREFIX;
9307 sp = &stack[0]; 10552 sp->ts_state = STATE_START;
9308 vim_memset(sp, 0, sizeof(trystate_T)); 10553 }
9309 sp->ts_curi = 1; 10554 else
9310 10555 {
9311 /* 10556 /*
9312 * When there are postponed prefixes we need to use these first. At 10557 * When there are postponed prefixes we need to use these first. At
9313 * the end of the prefix we continue in the case-fold tree. 10558 * the end of the prefix we continue in the case-fold tree.
9314 */ 10559 */
9315 fbyts = slang->sl_fbyts; 10560 fbyts = slang->sl_fbyts;
9328 byts = fbyts; 10573 byts = fbyts;
9329 idxs = fidxs; 10574 idxs = fidxs;
9330 sp->ts_prefixdepth = PFD_NOPREFIX; 10575 sp->ts_prefixdepth = PFD_NOPREFIX;
9331 sp->ts_state = STATE_START; 10576 sp->ts_state = STATE_START;
9332 } 10577 }
9333 10578 }
9334 /* 10579
9335 * Loop to find all suggestions. At each round we either: 10580 /*
9336 * - For the current state try one operation, advance "ts_curi", 10581 * Loop to find all suggestions. At each round we either:
9337 * increase "depth". 10582 * - For the current state try one operation, advance "ts_curi",
9338 * - When a state is done go to the next, set "ts_state". 10583 * increase "depth".
9339 * - When all states are tried decrease "depth". 10584 * - When a state is done go to the next, set "ts_state".
9340 */ 10585 * - When all states are tried decrease "depth".
9341 while (depth >= 0 && !got_int) 10586 */
9342 { 10587 while (depth >= 0 && !got_int)
9343 sp = &stack[depth]; 10588 {
9344 switch (sp->ts_state) 10589 sp = &stack[depth];
9345 { 10590 switch (sp->ts_state)
9346 case STATE_START: 10591 {
9347 case STATE_NOPREFIX: 10592 case STATE_START:
9348 /* 10593 case STATE_NOPREFIX:
9349 * Start of node: Deal with NUL bytes, which means 10594 /*
9350 * tword[] may end here. 10595 * Start of node: Deal with NUL bytes, which means
9351 */ 10596 * tword[] may end here.
9352 arridx = sp->ts_arridx; /* current node in the tree */ 10597 */
9353 len = byts[arridx]; /* bytes in this node */ 10598 arridx = sp->ts_arridx; /* current node in the tree */
9354 arridx += sp->ts_curi; /* index of current byte */ 10599 len = byts[arridx]; /* bytes in this node */
9355 10600 arridx += sp->ts_curi; /* index of current byte */
9356 if (sp->ts_prefixdepth == PFD_PREFIXTREE) 10601
10602 if (sp->ts_prefixdepth == PFD_PREFIXTREE)
10603 {
10604 /* Skip over the NUL bytes, we use them later. */
10605 for (n = 0; n < len && byts[arridx + n] == 0; ++n)
10606 ;
10607 sp->ts_curi += n;
10608
10609 /* Always past NUL bytes now. */
10610 n = (int)sp->ts_state;
10611 sp->ts_state = STATE_ENDNUL;
10612 sp->ts_save_badflags = su->su_badflags;
10613
10614 /* At end of a prefix or at start of prefixtree: check for
10615 * following word. */
10616 if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX)
9357 { 10617 {
9358 /* Skip over the NUL bytes, we use them later. */ 10618 /* Set su->su_badflags to the caps type at this position.
9359 for (n = 0; n < len && byts[arridx + n] == 0; ++n) 10619 * Use the caps type until here for the prefix itself. */
9360 ; 10620 #ifdef FEAT_MBYTE
9361 sp->ts_curi += n; 10621 if (has_mbyte)
9362 10622 n = nofold_len(fword, sp->ts_fidx, su->su_badptr);
9363 /* Always past NUL bytes now. */ 10623 else
9364 n = (int)sp->ts_state; 10624 #endif
9365 sp->ts_state = STATE_ENDNUL; 10625 n = sp->ts_fidx;
9366 sp->ts_save_badflags = su->su_badflags; 10626 flags = badword_captype(su->su_badptr, su->su_badptr + n);
9367 10627 su->su_badflags = badword_captype(su->su_badptr + n,
9368 /* At end of a prefix or at start of prefixtree: check for 10628 su->su_badptr + su->su_badlen);
9369 * following word. */ 10629 #ifdef DEBUG_TRIEWALK
9370 if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX) 10630 sprintf(changename[depth], "prefix");
10631 #endif
10632 go_deeper(stack, depth, 0);
10633 ++depth;
10634 sp = &stack[depth];
10635 sp->ts_prefixdepth = depth - 1;
10636 byts = fbyts;
10637 idxs = fidxs;
10638 sp->ts_arridx = 0;
10639
10640 /* Move the prefix to preword[] with the right case
10641 * and make find_keepcap_word() works. */
10642 tword[sp->ts_twordlen] = NUL;
10643 make_case_word(tword + sp->ts_splitoff,
10644 preword + sp->ts_prewordlen, flags);
10645 sp->ts_prewordlen = STRLEN(preword);
10646 sp->ts_splitoff = sp->ts_twordlen;
10647 }
10648 break;
10649 }
10650
10651 if (sp->ts_curi > len || byts[arridx] != 0)
10652 {
10653 /* Past bytes in node and/or past NUL bytes. */
10654 sp->ts_state = STATE_ENDNUL;
10655 sp->ts_save_badflags = su->su_badflags;
10656 break;
10657 }
10658
10659 /*
10660 * End of word in tree.
10661 */
10662 ++sp->ts_curi; /* eat one NUL byte */
10663
10664 flags = (int)idxs[arridx];
10665 fword_ends = (fword[sp->ts_fidx] == NUL
10666 || (soundfold
10667 ? vim_iswhite(fword[sp->ts_fidx])
10668 : !spell_iswordp(fword + sp->ts_fidx, curbuf)));
10669 tword[sp->ts_twordlen] = NUL;
10670
10671 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL
10672 && (sp->ts_flags & TSF_PREFIXOK) == 0)
10673 {
10674 /* There was a prefix before the word. Check that the prefix
10675 * can be used with this word. */
10676 /* Count the length of the NULs in the prefix. If there are
10677 * none this must be the first try without a prefix. */
10678 n = stack[sp->ts_prefixdepth].ts_arridx;
10679 len = pbyts[n++];
10680 for (c = 0; c < len && pbyts[n + c] == 0; ++c)
10681 ;
10682 if (c > 0)
10683 {
10684 c = valid_word_prefix(c, n, flags,
10685 tword + sp->ts_splitoff, slang, FALSE);
10686 if (c == 0)
10687 break;
10688
10689 /* Use the WF_RARE flag for a rare prefix. */
10690 if (c & WF_RAREPFX)
10691 flags |= WF_RARE;
10692
10693 /* Tricky: when checking for both prefix and compounding
10694 * we run into the prefix flag first.
10695 * Remember that it's OK, so that we accept the prefix
10696 * when arriving at a compound flag. */
10697 sp->ts_flags |= TSF_PREFIXOK;
10698 }
10699 }
10700
10701 /* Check NEEDCOMPOUND: can't use word without compounding. Do try
10702 * appending another compound word below. */
10703 if (sp->ts_complen == sp->ts_compsplit && fword_ends
10704 && (flags & WF_NEEDCOMP))
10705 goodword_ends = FALSE;
10706 else
10707 goodword_ends = TRUE;
10708
10709 p = NULL;
10710 compound_ok = TRUE;
10711 if (sp->ts_complen > sp->ts_compsplit)
10712 {
10713 if (slang->sl_nobreak)
10714 {
10715 /* There was a word before this word. When there was no
10716 * change in this word (it was correct) add the first word
10717 * as a suggestion. If this word was corrected too, we
10718 * need to check if a correct word follows. */
10719 if (sp->ts_fidx - sp->ts_splitfidx
10720 == sp->ts_twordlen - sp->ts_splitoff
10721 && STRNCMP(fword + sp->ts_splitfidx,
10722 tword + sp->ts_splitoff,
10723 sp->ts_fidx - sp->ts_splitfidx) == 0)
9371 { 10724 {
9372 /* Set su->su_badflags to the caps type at this 10725 preword[sp->ts_prewordlen] = NUL;
9373 * position. Use the caps type until here for the 10726 newscore = score_wordcount_adj(slang, sp->ts_score,
9374 * prefix itself. */ 10727 preword + sp->ts_prewordlen,
9375 #ifdef FEAT_MBYTE 10728 sp->ts_prewordlen > 0);
9376 if (has_mbyte) 10729 /* Add the suggestion if the score isn't too bad. */
9377 n = nofold_len(fword, sp->ts_fidx, su->su_badptr); 10730 if (newscore <= su->su_maxscore)
9378 else
9379 #endif
9380 n = sp->ts_fidx;
9381 flags = badword_captype(su->su_badptr,
9382 su->su_badptr + n);
9383 su->su_badflags = badword_captype(su->su_badptr + n,
9384 su->su_badptr + su->su_badlen);
9385 ++depth;
9386 stack[depth] = stack[depth - 1];
9387 sp = &stack[depth];
9388 sp->ts_prefixdepth = depth - 1;
9389 byts = fbyts;
9390 idxs = fidxs;
9391 sp->ts_state = STATE_START;
9392 sp->ts_curi = 1; /* start just after length byte */
9393 sp->ts_arridx = 0;
9394
9395 /* Move the prefix to preword[] with the right case
9396 * and make find_keepcap_word() works. */
9397 tword[sp->ts_twordlen] = NUL;
9398 make_case_word(tword + sp->ts_splitoff,
9399 preword + sp->ts_prewordlen,
9400 flags);
9401 sp->ts_prewordlen = STRLEN(preword);
9402 sp->ts_splitoff = sp->ts_twordlen;
9403 }
9404 break;
9405 }
9406
9407 if (sp->ts_curi > len || byts[arridx] != 0)
9408 {
9409 /* Past bytes in node and/or past NUL bytes. */
9410 sp->ts_state = STATE_ENDNUL;
9411 sp->ts_save_badflags = su->su_badflags;
9412 break;
9413 }
9414
9415 /*
9416 * End of word in tree.
9417 */
9418 ++sp->ts_curi; /* eat one NUL byte */
9419
9420 flags = (int)idxs[arridx];
9421 fword_ends = (fword[sp->ts_fidx] == NUL
9422 || !spell_iswordp(fword + sp->ts_fidx, curbuf));
9423 tword[sp->ts_twordlen] = NUL;
9424
9425 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL
9426 && (sp->ts_flags & TSF_PREFIXOK) == 0)
9427 {
9428 /* There was a prefix before the word. Check that the
9429 * prefix can be used with this word. */
9430 /* Count the length of the NULs in the prefix. If there
9431 * are none this must be the first try without a prefix.
9432 */
9433 n = stack[sp->ts_prefixdepth].ts_arridx;
9434 len = pbyts[n++];
9435 for (c = 0; c < len && pbyts[n + c] == 0; ++c)
9436 ;
9437 if (c > 0)
9438 {
9439 c = valid_word_prefix(c, n, flags,
9440 tword + sp->ts_splitoff, slang, FALSE);
9441 if (c == 0)
9442 break;
9443
9444 /* Use the WF_RARE flag for a rare prefix. */
9445 if (c & WF_RAREPFX)
9446 flags |= WF_RARE;
9447
9448 /* Tricky: when checking for both prefix and
9449 * compounding we run into the prefix flag first.
9450 * Remember that it's OK, so that we accept the prefix
9451 * when arriving at a compound flag. */
9452 sp->ts_flags |= TSF_PREFIXOK;
9453 }
9454 }
9455
9456 /* Check NEEDCOMPOUND: can't use word without compounding. Do
9457 * try appending another compound word below. */
9458 if (sp->ts_complen == sp->ts_compsplit && fword_ends
9459 && (flags & WF_NEEDCOMP))
9460 goodword_ends = FALSE;
9461 else
9462 goodword_ends = TRUE;
9463
9464 if (sp->ts_complen > sp->ts_compsplit)
9465 {
9466 if (slang->sl_nobreak)
9467 {
9468 /* There was a word before this word. When there was
9469 * no change in this word (it was correct) add the
9470 * first word as a suggestion. If this word was
9471 * corrected too, we need to check if a correct word
9472 * follows. */
9473 if (sp->ts_fidx - sp->ts_splitfidx
9474 == sp->ts_twordlen - sp->ts_splitoff
9475 && STRNCMP(fword + sp->ts_splitfidx,
9476 tword + sp->ts_splitoff,
9477 sp->ts_fidx - sp->ts_splitfidx) == 0)
9478 {
9479 preword[sp->ts_prewordlen] = NUL;
9480 add_suggestion(su, &su->su_ga, preword, 10731 add_suggestion(su, &su->su_ga, preword,
9481 sp->ts_splitfidx - repextra, 10732 sp->ts_splitfidx - repextra,
9482 sp->ts_score, 0, FALSE, 10733 newscore, 0, FALSE,
9483 lp->lp_sallang); 10734 lp->lp_sallang, FALSE);
9484 break; 10735 break;
9485 }
9486 }
9487 else
9488 {
9489 /* There was a compound word before this word. If
9490 * this word does not support compounding then give up
9491 * (splitting is tried for the word without compound
9492 * flag). */
9493 if (((unsigned)flags >> 24) == 0
9494 || sp->ts_twordlen - sp->ts_splitoff
9495 < slang->sl_compminlen)
9496 break;
9497 #ifdef FEAT_MBYTE
9498 /* For multi-byte chars check character length against
9499 * COMPOUNDMIN. */
9500 if (has_mbyte
9501 && slang->sl_compminlen > 0
9502 && mb_charlen(tword + sp->ts_splitoff)
9503 < slang->sl_compminlen)
9504 break;
9505 #endif
9506
9507 compflags[sp->ts_complen] = ((unsigned)flags >> 24);
9508 compflags[sp->ts_complen + 1] = NUL;
9509 vim_strncpy(preword + sp->ts_prewordlen,
9510 tword + sp->ts_splitoff,
9511 sp->ts_twordlen - sp->ts_splitoff);
9512 p = preword;
9513 while (*skiptowhite(p) != NUL)
9514 p = skipwhite(skiptowhite(p));
9515 if (fword_ends && !can_compound(slang, p,
9516 compflags + sp->ts_compsplit))
9517 break;
9518
9519 /* Get pointer to last char of previous word. */
9520 p = preword + sp->ts_prewordlen;
9521 mb_ptr_back(preword, p);
9522 } 10736 }
9523 } 10737 }
9524 else 10738 else
9525 p = NULL; 10739 {
9526 10740 /* There was a compound word before this word. If this
9527 /* 10741 * word does not support compounding then give up
9528 * Form the word with proper case in preword. 10742 * (splitting is tried for the word without compound
9529 * If there is a word from a previous split, append. 10743 * flag). */
9530 */ 10744 if (((unsigned)flags >> 24) == 0
9531 if (flags & WF_KEEPCAP) 10745 || sp->ts_twordlen - sp->ts_splitoff
9532 /* Must find the word in the keep-case tree. */ 10746 < slang->sl_compminlen)
9533 find_keepcap_word(slang, tword + sp->ts_splitoff, 10747 break;
10748 #ifdef FEAT_MBYTE
10749 /* For multi-byte chars check character length against
10750 * COMPOUNDMIN. */
10751 if (has_mbyte
10752 && slang->sl_compminlen > 0
10753 && mb_charlen(tword + sp->ts_splitoff)
10754 < slang->sl_compminlen)
10755 break;
10756 #endif
10757
10758 compflags[sp->ts_complen] = ((unsigned)flags >> 24);
10759 compflags[sp->ts_complen + 1] = NUL;
10760 vim_strncpy(preword + sp->ts_prewordlen,
10761 tword + sp->ts_splitoff,
10762 sp->ts_twordlen - sp->ts_splitoff);
10763 p = preword;
10764 while (*skiptowhite(p) != NUL)
10765 p = skipwhite(skiptowhite(p));
10766 if (fword_ends && !can_compound(slang, p,
10767 compflags + sp->ts_compsplit))
10768 /* Compound is not allowed. But it may still be
10769 * possible if we add another (short) word. */
10770 compound_ok = FALSE;
10771
10772 /* Get pointer to last char of previous word. */
10773 p = preword + sp->ts_prewordlen;
10774 mb_ptr_back(preword, p);
10775 }
10776 }
10777
10778 /*
10779 * Form the word with proper case in preword.
10780 * If there is a word from a previous split, append.
10781 * For the soundfold tree don't change the case, simply append.
10782 */
10783 if (soundfold)
10784 STRCPY(preword + sp->ts_prewordlen, tword + sp->ts_splitoff);
10785 else if (flags & WF_KEEPCAP)
10786 /* Must find the word in the keep-case tree. */
10787 find_keepcap_word(slang, tword + sp->ts_splitoff,
9534 preword + sp->ts_prewordlen); 10788 preword + sp->ts_prewordlen);
9535 else 10789 else
9536 { 10790 {
9537 /* Include badflags: if the badword is onecap or allcap 10791 /* Include badflags: If the badword is onecap or allcap
9538 * use that for the goodword too. But if the badword is 10792 * use that for the goodword too. But if the badword is
9539 * allcap and it's only one char long use onecap. */ 10793 * allcap and it's only one char long use onecap. */
9540 c = su->su_badflags; 10794 c = su->su_badflags;
9541 if ((c & WF_ALLCAP) 10795 if ((c & WF_ALLCAP)
9542 #ifdef FEAT_MBYTE 10796 #ifdef FEAT_MBYTE
9543 && su->su_badlen == (*mb_ptr2len)(su->su_badptr) 10797 && su->su_badlen == (*mb_ptr2len)(su->su_badptr)
9544 #else 10798 #else
9545 && su->su_badlen == 1 10799 && su->su_badlen == 1
9546 #endif 10800 #endif
9547 ) 10801 )
9548 c = WF_ONECAP; 10802 c = WF_ONECAP;
9549 c |= flags; 10803 c |= flags;
9550 10804
9551 /* When appending a compound word after a word character 10805 /* When appending a compound word after a word character don't
9552 * don't use Onecap. */ 10806 * use Onecap. */
9553 if (p != NULL && spell_iswordp_nmw(p)) 10807 if (p != NULL && spell_iswordp_nmw(p))
9554 c &= ~WF_ONECAP; 10808 c &= ~WF_ONECAP;
9555 make_case_word(tword + sp->ts_splitoff, 10809 make_case_word(tword + sp->ts_splitoff,
9556 preword + sp->ts_prewordlen, c); 10810 preword + sp->ts_prewordlen, c);
9557 } 10811 }
9558 10812
10813 if (!soundfold)
10814 {
9559 /* Don't use a banned word. It may appear again as a good 10815 /* Don't use a banned word. It may appear again as a good
9560 * word, thus remember it. */ 10816 * word, thus remember it. */
9561 if (flags & WF_BANNED) 10817 if (flags & WF_BANNED)
9562 { 10818 {
9563 add_banned(su, preword + sp->ts_prewordlen); 10819 add_banned(su, preword + sp->ts_prewordlen);
9564 break; 10820 break;
9565 } 10821 }
9566 if ((sp->ts_complen == sp->ts_compsplit 10822 if ((sp->ts_complen == sp->ts_compsplit
9567 && was_banned(su, preword + sp->ts_prewordlen)) 10823 && WAS_BANNED(su, preword + sp->ts_prewordlen))
9568 || was_banned(su, preword)) 10824 || WAS_BANNED(su, preword))
9569 { 10825 {
9570 if (slang->sl_compprog == NULL) 10826 if (slang->sl_compprog == NULL)
9571 break; 10827 break;
9572 /* the word so far was banned but we may try compounding */ 10828 /* the word so far was banned but we may try compounding */
9573 goodword_ends = FALSE; 10829 goodword_ends = FALSE;
9574 } 10830 }
9575 10831 }
9576 newscore = 0; 10832
10833 newscore = 0;
10834 if (!soundfold) /* soundfold words don't have flags */
10835 {
9577 if ((flags & WF_REGION) 10836 if ((flags & WF_REGION)
9578 && (((unsigned)flags >> 16) & lp->lp_region) == 0) 10837 && (((unsigned)flags >> 16) & lp->lp_region) == 0)
9579 newscore += SCORE_REGION; 10838 newscore += SCORE_REGION;
9580 if (flags & WF_RARE) 10839 if (flags & WF_RARE)
9581 newscore += SCORE_RARE; 10840 newscore += SCORE_RARE;
9582 10841
9583 if (!spell_valid_case(su->su_badflags, 10842 if (!spell_valid_case(su->su_badflags,
9584 captype(preword + sp->ts_prewordlen, NULL))) 10843 captype(preword + sp->ts_prewordlen, NULL)))
9585 newscore += SCORE_ICASE; 10844 newscore += SCORE_ICASE;
9586 10845 }
9587 maysplit = TRUE; 10846
9588 if (fword_ends && goodword_ends 10847 /* TODO: how about splitting in the soundfold tree? */
9589 && sp->ts_fidx >= sp->ts_fidxtry) 10848 if (fword_ends
10849 && goodword_ends
10850 && sp->ts_fidx >= sp->ts_fidxtry
10851 && compound_ok)
10852 {
10853 /* The badword also ends: add suggestions. */
10854 #ifdef DEBUG_TRIEWALK
10855 if (soundfold && STRCMP(preword, "smwrd") == 0)
9590 { 10856 {
9591 /* The badword also ends: add suggestions. Give a penalty 10857 int j;
9592 * when changing non-word char to word char, e.g., "thes," 10858
9593 * -> "these". */ 10859 /* print the stack of changes that brought us here */
10860 smsg("------ %s -------", fword);
10861 for (j = 0; j < depth; ++j)
10862 smsg("%s", changename[j]);
10863 }
10864 #endif
10865 if (soundfold)
10866 {
10867 /* For soundfolded words we need to find the original
10868 * words, the edit distrance and then add them. */
10869 add_sound_suggest(su, preword, sp->ts_score, lp);
10870 }
10871 else
10872 {
10873 /* Give a penalty when changing non-word char to word
10874 * char, e.g., "thes," -> "these". */
9594 p = fword + sp->ts_fidx; 10875 p = fword + sp->ts_fidx;
9595 #ifdef FEAT_MBYTE 10876 mb_ptr_back(fword, p);
9596 if (has_mbyte)
9597 mb_ptr_back(fword, p);
9598 else
9599 #endif
9600 --p;
9601 if (!spell_iswordp(p, curbuf)) 10877 if (!spell_iswordp(p, curbuf))
9602 { 10878 {
9603 p = preword + STRLEN(preword); 10879 p = preword + STRLEN(preword);
9604 #ifdef FEAT_MBYTE 10880 mb_ptr_back(preword, p);
9605 if (has_mbyte)
9606 mb_ptr_back(preword, p);
9607 else
9608 #endif
9609 --p;
9610 if (spell_iswordp(p, curbuf)) 10881 if (spell_iswordp(p, curbuf))
9611 newscore += SCORE_NONWORD; 10882 newscore += SCORE_NONWORD;
9612 } 10883 }
9613 10884
9614 add_suggestion(su, &su->su_ga, preword, 10885 /* Give a bonus to words seen before. */
9615 sp->ts_fidx - repextra, 10886 score = score_wordcount_adj(slang,
9616 sp->ts_score + newscore, 0, FALSE, 10887 sp->ts_score + newscore,
9617 lp->lp_sallang); 10888 preword + sp->ts_prewordlen,
9618 10889 sp->ts_prewordlen > 0);
9619 /* When the bad word doesn't end yet, try changing the 10890
9620 * next word. E.g., find suggestions for "the the" where 10891 /* Add the suggestion if the score isn't too bad. */
9621 * the second "the" is different. It's done like a split. 10892 if (score <= su->su_maxscore)
9622 */ 10893 add_suggestion(su, &su->su_ga, preword,
9623 if (sp->ts_fidx - repextra >= su->su_badlen) 10894 sp->ts_fidx - repextra,
9624 maysplit = FALSE; 10895 score, 0, FALSE, lp->lp_sallang, FALSE);
9625 } 10896 }
9626 10897 }
9627 if (maysplit 10898
9628 && (sp->ts_fidx >= sp->ts_fidxtry || fword_ends) 10899 /*
10900 * Try word split and/or compounding.
10901 */
10902 if ((sp->ts_fidx >= sp->ts_fidxtry || fword_ends)
9629 #ifdef FEAT_MBYTE 10903 #ifdef FEAT_MBYTE
9630 /* Don't split halfway a character. */ 10904 /* Don't split halfway a character. */
9631 && (!has_mbyte || sp->ts_tcharlen == 0) 10905 && (!has_mbyte || sp->ts_tcharlen == 0)
9632 #endif 10906 #endif
9633 ) 10907 )
9634 { 10908 {
9635 int try_compound; 10909 int try_compound;
9636 10910 int try_split;
9637 /* Get here in two situations: 10911
9638 * 1. The word in the tree ends but the badword continues: 10912 /* If past the end of the bad word don't try a split.
9639 * If the word allows compounding try that. Otherwise 10913 * Otherwise try changing the next word. E.g., find
9640 * try a split by inserting a space. For both check 10914 * suggestions for "the the" where the second "the" is
9641 * that a valid words starts at fword[sp->ts_fidx]. 10915 * different. It's done like a split.
9642 * For NOBREAK do like compounding to be able to check 10916 * TODO: word split for soundfold words */
9643 * if the next word is valid. 10917 try_split = (sp->ts_fidx - repextra < su->su_badlen)
9644 * 2. The badword does end, but it was due to a change 10918 && !soundfold;
9645 * (e.g., a swap). No need to split, but do check that 10919
9646 * the following word is valid. 10920 /* Get here in several situations:
9647 */ 10921 * 1. The word in the tree ends:
9648 try_compound = FALSE; 10922 * If the word allows compounding try that. Otherwise try
9649 if ((!fword_ends || !goodword_ends) 10923 * a split by inserting a space. For both check that a
9650 && slang->sl_compprog != NULL 10924 * valid words starts at fword[sp->ts_fidx].
9651 && ((unsigned)flags >> 24) != 0 10925 * For NOBREAK do like compounding to be able to check if
9652 && sp->ts_twordlen - sp->ts_splitoff 10926 * the next word is valid.
9653 >= slang->sl_compminlen 10927 * 2. The badword does end, but it was due to a change (e.g.,
10928 * a swap). No need to split, but do check that the
10929 * following word is valid.
10930 * 3. The badword and the word in the tree end. It may still
10931 * be possible to compound another (short) word.
10932 */
10933 try_compound = FALSE;
10934 if (!soundfold
10935 && slang->sl_compprog != NULL
10936 && ((unsigned)flags >> 24) != 0
10937 && sp->ts_twordlen - sp->ts_splitoff
10938 >= slang->sl_compminlen
9654 #ifdef FEAT_MBYTE 10939 #ifdef FEAT_MBYTE
9655 && (!has_mbyte 10940 && (!has_mbyte
9656 || slang->sl_compminlen == 0 10941 || slang->sl_compminlen == 0
9657 || mb_charlen(tword + sp->ts_splitoff) 10942 || mb_charlen(tword + sp->ts_splitoff)
9658 >= slang->sl_compminlen) 10943 >= slang->sl_compminlen)
9659 #endif 10944 #endif
9660 && (slang->sl_compsylmax < MAXWLEN 10945 && (slang->sl_compsylmax < MAXWLEN
9661 || sp->ts_complen + 1 - sp->ts_compsplit 10946 || sp->ts_complen + 1 - sp->ts_compsplit
9662 < slang->sl_compmax) 10947 < slang->sl_compmax)
9663 && (byte_in_str(sp->ts_complen == sp->ts_compsplit 10948 && (byte_in_str(sp->ts_complen == sp->ts_compsplit
9664 ? slang->sl_compstartflags 10949 ? slang->sl_compstartflags
9665 : slang->sl_compallflags, 10950 : slang->sl_compallflags,
9666 ((unsigned)flags >> 24)))) 10951 ((unsigned)flags >> 24))))
9667 { 10952 {
9668 try_compound = TRUE; 10953 try_compound = TRUE;
9669 compflags[sp->ts_complen] = ((unsigned)flags >> 24); 10954 compflags[sp->ts_complen] = ((unsigned)flags >> 24);
9670 compflags[sp->ts_complen + 1] = NUL; 10955 compflags[sp->ts_complen + 1] = NUL;
9671 } 10956 }
9672 10957
9673 /* For NOBREAK we never try splitting, it won't make any 10958 /* For NOBREAK we never try splitting, it won't make any word
9674 * word valid. */ 10959 * valid. */
9675 if (slang->sl_nobreak) 10960 if (slang->sl_nobreak)
9676 try_compound = TRUE; 10961 try_compound = TRUE;
9677 10962
9678 /* If we could add a compound word, and it's also possible 10963 /* If we could add a compound word, and it's also possible to
9679 * to split at this point, do the split first and set 10964 * split at this point, do the split first and set
9680 * TSF_DIDSPLIT to avoid doing it again. */ 10965 * TSF_DIDSPLIT to avoid doing it again. */
9681 else if (!fword_ends 10966 else if (!fword_ends
9682 && try_compound 10967 && try_compound
9683 && (sp->ts_flags & TSF_DIDSPLIT) == 0) 10968 && (sp->ts_flags & TSF_DIDSPLIT) == 0)
9684 { 10969 {
9685 try_compound = FALSE; 10970 try_compound = FALSE;
9686 sp->ts_flags |= TSF_DIDSPLIT; 10971 sp->ts_flags |= TSF_DIDSPLIT;
9687 --sp->ts_curi; /* do the same NUL again */ 10972 --sp->ts_curi; /* do the same NUL again */
9688 compflags[sp->ts_complen] = NUL; 10973 compflags[sp->ts_complen] = NUL;
9689 } 10974 }
9690 else 10975 else
9691 sp->ts_flags &= ~TSF_DIDSPLIT; 10976 sp->ts_flags &= ~TSF_DIDSPLIT;
9692 10977
10978 if (try_split || try_compound)
10979 {
9693 if (!try_compound && (!fword_ends || !goodword_ends)) 10980 if (!try_compound && (!fword_ends || !goodword_ends))
9694 { 10981 {
9695 /* If we're going to split need to check that the 10982 /* If we're going to split need to check that the
9696 * words so far are valid for compounding. If there 10983 * words so far are valid for compounding. If there
9697 * is only one word it must not have the NEEDCOMPOUND 10984 * is only one word it must not have the NEEDCOMPOUND
9705 if (sp->ts_complen > sp->ts_compsplit 10992 if (sp->ts_complen > sp->ts_compsplit
9706 && !can_compound(slang, p, 10993 && !can_compound(slang, p,
9707 compflags + sp->ts_compsplit)) 10994 compflags + sp->ts_compsplit))
9708 break; 10995 break;
9709 newscore += SCORE_SPLIT; 10996 newscore += SCORE_SPLIT;
10997
10998 /* Give a bonus to words seen before. */
10999 newscore = score_wordcount_adj(slang, newscore,
11000 preword + sp->ts_prewordlen, TRUE);
9710 } 11001 }
9711 11002
9712 if (try_deeper(su, stack, depth, newscore)) 11003 if (TRY_DEEPER(su, stack, depth, newscore))
9713 { 11004 {
11005 go_deeper(stack, depth, newscore);
11006 #ifdef DEBUG_TRIEWALK
11007 if (!try_compound && !fword_ends)
11008 sprintf(changename[depth], "%.*s-%s: split",
11009 sp->ts_twordlen, tword, fword + sp->ts_fidx);
11010 else
11011 sprintf(changename[depth], "%.*s-%s: compound",
11012 sp->ts_twordlen, tword, fword + sp->ts_fidx);
11013 #endif
9714 /* Save things to be restored at STATE_SPLITUNDO. */ 11014 /* Save things to be restored at STATE_SPLITUNDO. */
9715 sp->ts_save_badflags = su->su_badflags; 11015 sp->ts_save_badflags = su->su_badflags;
9716 sp->ts_state = STATE_SPLITUNDO; 11016 sp->ts_state = STATE_SPLITUNDO;
9717 11017
9718 ++depth; 11018 ++depth;
9728 /* If the badword has a non-word character at this 11028 /* If the badword has a non-word character at this
9729 * position skip it. That means replacing the 11029 * position skip it. That means replacing the
9730 * non-word character with a space. Always skip a 11030 * non-word character with a space. Always skip a
9731 * character when the word ends. But only when the 11031 * character when the word ends. But only when the
9732 * good word can end. */ 11032 * good word can end. */
9733 if (((!try_compound 11033 if (((!try_compound && !spell_iswordp_nmw(fword
9734 && !spell_iswordp_nmw(fword + sp->ts_fidx)) 11034 + sp->ts_fidx))
9735 || fword_ends) 11035 || fword_ends)
9736 && goodword_ends) 11036 && fword[sp->ts_fidx] != NUL
11037 && goodword_ends)
9737 { 11038 {
9738 int l; 11039 int l;
9739 11040
9740 #ifdef FEAT_MBYTE 11041 #ifdef FEAT_MBYTE
9741 if (has_mbyte) 11042 if (has_mbyte)
9787 sp->ts_prefixdepth = PFD_PREFIXTREE; 11088 sp->ts_prefixdepth = PFD_PREFIXTREE;
9788 sp->ts_state = STATE_NOPREFIX; 11089 sp->ts_state = STATE_NOPREFIX;
9789 } 11090 }
9790 } 11091 }
9791 } 11092 }
11093 }
11094 break;
11095
11096 case STATE_SPLITUNDO:
11097 /* Undo the changes done for word split or compound word. */
11098 su->su_badflags = sp->ts_save_badflags;
11099
11100 /* Continue looking for NUL bytes. */
11101 sp->ts_state = STATE_START;
11102
11103 /* In case we went into the prefix tree. */
11104 byts = fbyts;
11105 idxs = fidxs;
11106 break;
11107
11108 case STATE_ENDNUL:
11109 /* Past the NUL bytes in the node. */
11110 su->su_badflags = sp->ts_save_badflags;
11111 if (fword[sp->ts_fidx] == NUL
11112 #ifdef FEAT_MBYTE
11113 && sp->ts_tcharlen == 0
11114 #endif
11115 )
11116 {
11117 /* The badword ends, can't use STATE_PLAIN. */
11118 sp->ts_state = STATE_DEL;
9792 break; 11119 break;
9793 11120 }
9794 case STATE_SPLITUNDO: 11121 sp->ts_state = STATE_PLAIN;
9795 /* Undo the changes done for word split or compound word. */ 11122 /*FALLTHROUGH*/
9796 su->su_badflags = sp->ts_save_badflags; 11123
9797 11124 case STATE_PLAIN:
9798 /* Continue looking for NUL bytes. */ 11125 /*
9799 sp->ts_state = STATE_START; 11126 * Go over all possible bytes at this node, add each to tword[]
9800 11127 * and use child node. "ts_curi" is the index.
9801 /* In case we went into the prefix tree. */ 11128 */
9802 byts = fbyts; 11129 arridx = sp->ts_arridx;
9803 idxs = fidxs; 11130 if (sp->ts_curi > byts[arridx])
9804 break; 11131 {
9805 11132 /* Done all bytes at this node, do next state. When still at
9806 case STATE_ENDNUL: 11133 * already changed bytes skip the other tricks. */
9807 /* Past the NUL bytes in the node. */ 11134 if (sp->ts_fidx >= sp->ts_fidxtry)
9808 su->su_badflags = sp->ts_save_badflags; 11135 sp->ts_state = STATE_DEL;
9809 if (fword[sp->ts_fidx] == NUL 11136 else
11137 sp->ts_state = STATE_FINAL;
11138 }
11139 else
11140 {
11141 arridx += sp->ts_curi++;
11142 c = byts[arridx];
11143
11144 /* Normal byte, go one level deeper. If it's not equal to the
11145 * byte in the bad word adjust the score. But don't even try
11146 * when the byte was already changed. And don't try when we
11147 * just deleted this byte, accepting it is always cheaper than
11148 * delete + substitute. */
11149 if (c == fword[sp->ts_fidx]
9810 #ifdef FEAT_MBYTE 11150 #ifdef FEAT_MBYTE
9811 && sp->ts_tcharlen == 0 11151 || (sp->ts_tcharlen > 0 && sp->ts_isdiff != DIFF_NONE)
9812 #endif 11152 #endif
9813 ) 11153 )
11154 newscore = 0;
11155 else
11156 newscore = SCORE_SUBST;
11157 if ((newscore == 0
11158 || (sp->ts_fidx >= sp->ts_fidxtry
11159 && ((sp->ts_flags & TSF_DIDDEL) == 0
11160 || c != fword[sp->ts_delidx])))
11161 && TRY_DEEPER(su, stack, depth, newscore))
9814 { 11162 {
9815 /* The badword ends, can't use the bytes in this node. */ 11163 go_deeper(stack, depth, newscore);
9816 sp->ts_state = STATE_DEL; 11164 #ifdef DEBUG_TRIEWALK
9817 break; 11165 if (newscore > 0)
9818 } 11166 sprintf(changename[depth], "%.*s-%s: subst %c to %c",
9819 sp->ts_state = STATE_PLAIN; 11167 sp->ts_twordlen, tword, fword + sp->ts_fidx,
9820 /*FALLTHROUGH*/ 11168 fword[sp->ts_fidx], c);
9821
9822 case STATE_PLAIN:
9823 /*
9824 * Go over all possible bytes at this node, add each to
9825 * tword[] and use child node. "ts_curi" is the index.
9826 */
9827 arridx = sp->ts_arridx;
9828 if (sp->ts_curi > byts[arridx])
9829 {
9830 /* Done all bytes at this node, do next state. When still
9831 * at already changed bytes skip the other tricks. */
9832 if (sp->ts_fidx >= sp->ts_fidxtry)
9833 sp->ts_state = STATE_DEL;
9834 else 11169 else
9835 sp->ts_state = STATE_FINAL; 11170 sprintf(changename[depth], "%.*s-%s: accept %c",
9836 } 11171 sp->ts_twordlen, tword, fword + sp->ts_fidx,
9837 else 11172 fword[sp->ts_fidx]);
9838 { 11173 #endif
9839 arridx += sp->ts_curi++; 11174 ++depth;
9840 c = byts[arridx]; 11175 sp = &stack[depth];
9841 11176 ++sp->ts_fidx;
9842 /* Normal byte, go one level deeper. If it's not equal to 11177 tword[sp->ts_twordlen++] = c;
9843 * the byte in the bad word adjust the score. But don't 11178 sp->ts_arridx = idxs[arridx];
9844 * even try when the byte was already changed. */
9845 if (c == fword[sp->ts_fidx]
9846 #ifdef FEAT_MBYTE 11179 #ifdef FEAT_MBYTE
9847 || (sp->ts_tcharlen > 0 11180 if (newscore == SCORE_SUBST)
9848 && sp->ts_isdiff != DIFF_NONE) 11181 sp->ts_isdiff = DIFF_YES;
9849 #endif
9850 )
9851 newscore = 0;
9852 else
9853 newscore = SCORE_SUBST;
9854 if ((newscore == 0 || sp->ts_fidx >= sp->ts_fidxtry)
9855 && try_deeper(su, stack, depth, newscore))
9856 {
9857 ++depth;
9858 sp = &stack[depth];
9859 ++sp->ts_fidx;
9860 tword[sp->ts_twordlen++] = c;
9861 sp->ts_arridx = idxs[arridx];
9862 #ifdef FEAT_MBYTE
9863 if (newscore == SCORE_SUBST)
9864 sp->ts_isdiff = DIFF_YES;
9865 if (has_mbyte)
9866 {
9867 /* Multi-byte characters are a bit complicated to
9868 * handle: They differ when any of the bytes
9869 * differ and then their length may also differ. */
9870 if (sp->ts_tcharlen == 0)
9871 {
9872 /* First byte. */
9873 sp->ts_tcharidx = 0;
9874 sp->ts_tcharlen = MB_BYTE2LEN(c);
9875 sp->ts_fcharstart = sp->ts_fidx - 1;
9876 sp->ts_isdiff = (newscore != 0)
9877 ? DIFF_YES : DIFF_NONE;
9878 }
9879 else if (sp->ts_isdiff == DIFF_INSERT)
9880 /* When inserting trail bytes don't advance in
9881 * the bad word. */
9882 --sp->ts_fidx;
9883 if (++sp->ts_tcharidx == sp->ts_tcharlen)
9884 {
9885 /* Last byte of character. */
9886 if (sp->ts_isdiff == DIFF_YES)
9887 {
9888 /* Correct ts_fidx for the byte length of
9889 * the character (we didn't check that
9890 * before). */
9891 sp->ts_fidx = sp->ts_fcharstart
9892 + MB_BYTE2LEN(
9893 fword[sp->ts_fcharstart]);
9894
9895 /* For changing a composing character
9896 * adjust the score from SCORE_SUBST to
9897 * SCORE_SUBCOMP. */
9898 if (enc_utf8
9899 && utf_iscomposing(
9900 mb_ptr2char(tword
9901 + sp->ts_twordlen
9902 - sp->ts_tcharlen))
9903 && utf_iscomposing(
9904 mb_ptr2char(fword
9905 + sp->ts_fcharstart)))
9906 sp->ts_score -=
9907 SCORE_SUBST - SCORE_SUBCOMP;
9908
9909 /* For a similar character adjust score
9910 * from SCORE_SUBST to SCORE_SIMILAR. */
9911 else if (slang->sl_has_map
9912 && similar_chars(slang,
9913 mb_ptr2char(tword
9914 + sp->ts_twordlen
9915 - sp->ts_tcharlen),
9916 mb_ptr2char(fword
9917 + sp->ts_fcharstart)))
9918 sp->ts_score -=
9919 SCORE_SUBST - SCORE_SIMILAR;
9920 }
9921 else if (sp->ts_isdiff == DIFF_INSERT
9922 && sp->ts_twordlen > sp->ts_tcharlen)
9923 {
9924 p = tword + sp->ts_twordlen
9925 - sp->ts_tcharlen;
9926 c = mb_ptr2char(p);
9927 if (enc_utf8 && utf_iscomposing(c))
9928 {
9929 /* Inserting a composing char doesn't
9930 * count that much. */
9931 sp->ts_score -= SCORE_INS
9932 - SCORE_INSCOMP;
9933 }
9934 else
9935 {
9936 /* If the previous character was the
9937 * same, thus doubling a character,
9938 * give a bonus to the score. */
9939 mb_ptr_back(tword, p);
9940 if (c == mb_ptr2char(p))
9941 sp->ts_score -= SCORE_INS
9942 - SCORE_INSDUP;
9943 }
9944 }
9945
9946 /* Starting a new char, reset the length. */
9947 sp->ts_tcharlen = 0;
9948 }
9949 }
9950 else
9951 #endif
9952 {
9953 /* If we found a similar char adjust the score.
9954 * We do this after calling try_deeper() because
9955 * it's slow. */
9956 if (newscore != 0
9957 && slang->sl_has_map
9958 && similar_chars(slang,
9959 c, fword[sp->ts_fidx - 1]))
9960 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR;
9961 }
9962 }
9963 }
9964 break;
9965
9966 case STATE_DEL:
9967 #ifdef FEAT_MBYTE
9968 /* When past the first byte of a multi-byte char don't try
9969 * delete/insert/swap a character. */
9970 if (has_mbyte && sp->ts_tcharlen > 0)
9971 {
9972 sp->ts_state = STATE_FINAL;
9973 break;
9974 }
9975 #endif
9976 /*
9977 * Try skipping one character in the bad word (delete it).
9978 */
9979 sp->ts_state = STATE_INS;
9980 sp->ts_curi = 1;
9981 if (fword[sp->ts_fidx] != NUL
9982 && try_deeper(su, stack, depth, SCORE_DEL))
9983 {
9984 ++depth;
9985
9986 /* Advance over the character in fword[]. Give a bonus to
9987 * the score if the same character is following "nn" ->
9988 * "n". */
9989 #ifdef FEAT_MBYTE
9990 if (has_mbyte) 11182 if (has_mbyte)
9991 { 11183 {
9992 c = mb_ptr2char(fword + sp->ts_fidx); 11184 /* Multi-byte characters are a bit complicated to
9993 stack[depth].ts_fidx += MB_BYTE2LEN(fword[sp->ts_fidx]); 11185 * handle: They differ when any of the bytes differ
9994 if (enc_utf8 && utf_iscomposing(c)) 11186 * and then their length may also differ. */
9995 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP; 11187 if (sp->ts_tcharlen == 0)
9996 else if (c == mb_ptr2char(fword + stack[depth].ts_fidx)) 11188 {
9997 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; 11189 /* First byte. */
11190 sp->ts_tcharidx = 0;
11191 sp->ts_tcharlen = MB_BYTE2LEN(c);
11192 sp->ts_fcharstart = sp->ts_fidx - 1;
11193 sp->ts_isdiff = (newscore != 0)
11194 ? DIFF_YES : DIFF_NONE;
11195 }
11196 else if (sp->ts_isdiff == DIFF_INSERT)
11197 /* When inserting trail bytes don't advance in the
11198 * bad word. */
11199 --sp->ts_fidx;
11200 if (++sp->ts_tcharidx == sp->ts_tcharlen)
11201 {
11202 /* Last byte of character. */
11203 if (sp->ts_isdiff == DIFF_YES)
11204 {
11205 /* Correct ts_fidx for the byte length of the
11206 * character (we didn't check that before). */
11207 sp->ts_fidx = sp->ts_fcharstart
11208 + MB_BYTE2LEN(
11209 fword[sp->ts_fcharstart]);
11210
11211 /* For changing a composing character adjust
11212 * the score from SCORE_SUBST to
11213 * SCORE_SUBCOMP. */
11214 if (enc_utf8
11215 && utf_iscomposing(
11216 mb_ptr2char(tword
11217 + sp->ts_twordlen
11218 - sp->ts_tcharlen))
11219 && utf_iscomposing(
11220 mb_ptr2char(fword
11221 + sp->ts_fcharstart)))
11222 sp->ts_score -=
11223 SCORE_SUBST - SCORE_SUBCOMP;
11224
11225 /* For a similar character adjust score from
11226 * SCORE_SUBST to SCORE_SIMILAR. */
11227 else if (!soundfold
11228 && slang->sl_has_map
11229 && similar_chars(slang,
11230 mb_ptr2char(tword
11231 + sp->ts_twordlen
11232 - sp->ts_tcharlen),
11233 mb_ptr2char(fword
11234 + sp->ts_fcharstart)))
11235 sp->ts_score -=
11236 SCORE_SUBST - SCORE_SIMILAR;
11237 }
11238 else if (sp->ts_isdiff == DIFF_INSERT
11239 && sp->ts_twordlen > sp->ts_tcharlen)
11240 {
11241 p = tword + sp->ts_twordlen - sp->ts_tcharlen;
11242 c = mb_ptr2char(p);
11243 if (enc_utf8 && utf_iscomposing(c))
11244 {
11245 /* Inserting a composing char doesn't
11246 * count that much. */
11247 sp->ts_score -= SCORE_INS - SCORE_INSCOMP;
11248 }
11249 else
11250 {
11251 /* If the previous character was the same,
11252 * thus doubling a character, give a bonus
11253 * to the score. Also for the soundfold
11254 * tree (might seem illogical but does
11255 * give better scores). */
11256 mb_ptr_back(tword, p);
11257 if (c == mb_ptr2char(p))
11258 sp->ts_score -= SCORE_INS
11259 - SCORE_INSDUP;
11260 }
11261 }
11262
11263 /* Starting a new char, reset the length. */
11264 sp->ts_tcharlen = 0;
11265 }
9998 } 11266 }
9999 else 11267 else
10000 #endif 11268 #endif
10001 { 11269 {
10002 ++stack[depth].ts_fidx; 11270 /* If we found a similar char adjust the score.
10003 if (fword[sp->ts_fidx] == fword[sp->ts_fidx + 1]) 11271 * We do this after calling go_deeper() because
10004 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; 11272 * it's slow. */
10005 } 11273 if (newscore != 0
10006 break; 11274 && !soundfold
10007 } 11275 && slang->sl_has_map
10008 /*FALLTHROUGH*/ 11276 && similar_chars(slang,
10009 11277 c, fword[sp->ts_fidx - 1]))
10010 case STATE_INS: 11278 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR;
10011 /* Insert one byte. Do this for each possible byte at this
10012 * node. */
10013 n = sp->ts_arridx;
10014 if (sp->ts_curi > byts[n])
10015 {
10016 /* Done all bytes at this node, do next state. */
10017 sp->ts_state = STATE_SWAP;
10018 }
10019 else
10020 {
10021 /* Do one more byte at this node. Skip NUL bytes. */
10022 n += sp->ts_curi++;
10023 c = byts[n];
10024 if (c != 0 && try_deeper(su, stack, depth, SCORE_INS))
10025 {
10026 ++depth;
10027 sp = &stack[depth];
10028 tword[sp->ts_twordlen++] = c;
10029 sp->ts_arridx = idxs[n];
10030 #ifdef FEAT_MBYTE
10031 if (has_mbyte)
10032 {
10033 fl = MB_BYTE2LEN(c);
10034 if (fl > 1)
10035 {
10036 /* There are following bytes for the same
10037 * character. We must find all bytes before
10038 * trying delete/insert/swap/etc. */
10039 sp->ts_tcharlen = fl;
10040 sp->ts_tcharidx = 1;
10041 sp->ts_isdiff = DIFF_INSERT;
10042 }
10043 }
10044 else
10045 fl = 1;
10046 if (fl == 1)
10047 #endif
10048 {
10049 /* If the previous character was the same, thus
10050 * doubling a character, give a bonus to the
10051 * score. */
10052 if (sp->ts_twordlen >= 2
10053 && tword[sp->ts_twordlen - 2] == c)
10054 sp->ts_score -= SCORE_INS - SCORE_INSDUP;
10055 }
10056 } 11279 }
10057 } 11280 }
11281 }
11282 break;
11283
11284 case STATE_DEL:
11285 #ifdef FEAT_MBYTE
11286 /* When past the first byte of a multi-byte char don't try
11287 * delete/insert/swap a character. */
11288 if (has_mbyte && sp->ts_tcharlen > 0)
11289 {
11290 sp->ts_state = STATE_FINAL;
10058 break; 11291 break;
10059 11292 }
10060 case STATE_SWAP: 11293 #endif
10061 /* 11294 /*
10062 * Swap two bytes in the bad word: "12" -> "21". 11295 * Try skipping one character in the bad word (delete it).
10063 * We change "fword" here, it's changed back afterwards. 11296 */
10064 */ 11297 sp->ts_state = STATE_INS_PREP;
10065 p = fword + sp->ts_fidx; 11298 sp->ts_curi = 1;
10066 c = *p; 11299 if (soundfold && sp->ts_fidx == 0 && fword[sp->ts_fidx] == '*')
10067 if (c == NUL) 11300 /* Deleting a vowel at the start of a word counts less, see
10068 { 11301 * soundalike_score(). */
10069 /* End of word, can't swap or replace. */ 11302 newscore = 2 * SCORE_DEL / 3;
10070 sp->ts_state = STATE_FINAL; 11303 else
10071 break; 11304 newscore = SCORE_DEL;
10072 } 11305 if (fword[sp->ts_fidx] != NUL
10073 11306 && TRY_DEEPER(su, stack, depth, newscore))
10074 /* Don't swap if the first character is not a word character. 11307 {
10075 * SWAP3 etc. also don't make sense then. */ 11308 go_deeper(stack, depth, newscore);
10076 if (!spell_iswordp(p, curbuf)) 11309 #ifdef DEBUG_TRIEWALK
10077 { 11310 sprintf(changename[depth], "%.*s-%s: delete %c",
10078 sp->ts_state = STATE_REP_INI; 11311 sp->ts_twordlen, tword, fword + sp->ts_fidx,
10079 break; 11312 fword[sp->ts_fidx]);
10080 } 11313 #endif
10081 11314 ++depth;
11315
11316 /* Remember what character we deleted, so that we can avoid
11317 * inserting it again. */
11318 stack[depth].ts_flags |= TSF_DIDDEL;
11319 stack[depth].ts_delidx = sp->ts_fidx;
11320
11321 /* Advance over the character in fword[]. Give a bonus to the
11322 * score if the same character is following "nn" -> "n". It's
11323 * a bit illogical for soundfold tree but it does give better
11324 * results. */
10082 #ifdef FEAT_MBYTE 11325 #ifdef FEAT_MBYTE
10083 if (has_mbyte) 11326 if (has_mbyte)
10084 { 11327 {
10085 n = mb_cptr2len(p); 11328 c = mb_ptr2char(fword + sp->ts_fidx);
10086 c = mb_ptr2char(p); 11329 stack[depth].ts_fidx += MB_BYTE2LEN(fword[sp->ts_fidx]);
10087 if (!spell_iswordp(p + n, curbuf)) 11330 if (enc_utf8 && utf_iscomposing(c))
10088 c2 = c; /* don't swap non-word char */ 11331 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP;
10089 else 11332 else if (c == mb_ptr2char(fword + stack[depth].ts_fidx))
10090 c2 = mb_ptr2char(p + n); 11333 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
10091 } 11334 }
10092 else 11335 else
10093 #endif 11336 #endif
10094 { 11337 {
10095 if (!spell_iswordp(p + 1, curbuf)) 11338 ++stack[depth].ts_fidx;
10096 c2 = c; /* don't swap non-word char */ 11339 if (fword[sp->ts_fidx] == fword[sp->ts_fidx + 1])
10097 else 11340 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
10098 c2 = p[1];
10099 } 11341 }
10100 11342 break;
10101 /* When characters are identical, swap won't do anything. 11343 }
10102 * Also get here if the second char is not a word character. */ 11344 /*FALLTHROUGH*/
10103 if (c == c2) 11345
11346 case STATE_INS_PREP:
11347 if (sp->ts_flags & TSF_DIDDEL)
11348 {
11349 /* If we just deleted a byte then inserting won't make sense,
11350 * a substitute is always cheaper. */
11351 sp->ts_state = STATE_SWAP;
11352 break;
11353 }
11354
11355 /* skip over NUL bytes */
11356 n = sp->ts_arridx;
11357 for (;;)
11358 {
11359 if (sp->ts_curi > byts[n])
10104 { 11360 {
10105 sp->ts_state = STATE_SWAP3; 11361 /* Only NUL bytes at this node, go to next state. */
11362 sp->ts_state = STATE_SWAP;
10106 break; 11363 break;
10107 } 11364 }
10108 if (c2 != NUL && try_deeper(su, stack, depth, SCORE_SWAP)) 11365 if (byts[n + sp->ts_curi] != NUL)
10109 { 11366 {
10110 sp->ts_state = STATE_UNSWAP; 11367 /* Found a byte to insert. */
10111 ++depth; 11368 sp->ts_state = STATE_INS;
11369 break;
11370 }
11371 ++sp->ts_curi;
11372 }
11373 break;
11374
11375 /*FALLTHROUGH*/
11376
11377 case STATE_INS:
11378 /* Insert one byte. Repeat this for each possible byte at this
11379 * node. */
11380 n = sp->ts_arridx;
11381 if (sp->ts_curi > byts[n])
11382 {
11383 /* Done all bytes at this node, go to next state. */
11384 sp->ts_state = STATE_SWAP;
11385 break;
11386 }
11387
11388 /* Do one more byte at this node, but:
11389 * - Skip NUL bytes.
11390 * - Skip the byte if it's equal to the byte in the word,
11391 * accepting that byte is always better.
11392 */
11393 n += sp->ts_curi++;
11394 c = byts[n];
11395 if (soundfold && sp->ts_twordlen == 0 && c == '*')
11396 /* Inserting a vowel at the start of a word counts less,
11397 * see soundalike_score(). */
11398 newscore = 2 * SCORE_INS / 3;
11399 else
11400 newscore = SCORE_INS;
11401 if (c != fword[sp->ts_fidx]
11402 && TRY_DEEPER(su, stack, depth, newscore))
11403 {
11404 go_deeper(stack, depth, newscore);
11405 #ifdef DEBUG_TRIEWALK
11406 sprintf(changename[depth], "%.*s-%s: insert %c",
11407 sp->ts_twordlen, tword, fword + sp->ts_fidx,
11408 c);
11409 #endif
11410 ++depth;
11411 sp = &stack[depth];
11412 tword[sp->ts_twordlen++] = c;
11413 sp->ts_arridx = idxs[n];
10112 #ifdef FEAT_MBYTE 11414 #ifdef FEAT_MBYTE
10113 if (has_mbyte) 11415 if (has_mbyte)
11416 {
11417 fl = MB_BYTE2LEN(c);
11418 if (fl > 1)
10114 { 11419 {
10115 fl = mb_char2len(c2); 11420 /* There are following bytes for the same character.
10116 mch_memmove(p, p + n, fl); 11421 * We must find all bytes before trying
10117 mb_char2bytes(c, p + fl); 11422 * delete/insert/swap/etc. */
10118 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; 11423 sp->ts_tcharlen = fl;
10119 } 11424 sp->ts_tcharidx = 1;
10120 else 11425 sp->ts_isdiff = DIFF_INSERT;
10121 #endif
10122 {
10123 p[0] = c2;
10124 p[1] = c;
10125 stack[depth].ts_fidxtry = sp->ts_fidx + 2;
10126 } 11426 }
10127 } 11427 }
10128 else 11428 else
10129 /* If this swap doesn't work then SWAP3 won't either. */ 11429 fl = 1;
10130 sp->ts_state = STATE_REP_INI; 11430 if (fl == 1)
11431 #endif
11432 {
11433 /* If the previous character was the same, thus doubling a
11434 * character, give a bonus to the score. Also for
11435 * soundfold words (illogical but does give a better
11436 * score). */
11437 if (sp->ts_twordlen >= 2
11438 && tword[sp->ts_twordlen - 2] == c)
11439 sp->ts_score -= SCORE_INS - SCORE_INSDUP;
11440 }
11441 }
11442 break;
11443
11444 case STATE_SWAP:
11445 /*
11446 * Swap two bytes in the bad word: "12" -> "21".
11447 * We change "fword" here, it's changed back afterwards at
11448 * STATE_UNSWAP.
11449 */
11450 p = fword + sp->ts_fidx;
11451 c = *p;
11452 if (c == NUL)
11453 {
11454 /* End of word, can't swap or replace. */
11455 sp->ts_state = STATE_FINAL;
10131 break; 11456 break;
10132 11457 }
10133 case STATE_UNSWAP: 11458
10134 /* Undo the STATE_SWAP swap: "21" -> "12". */ 11459 /* Don't swap if the first character is not a word character.
10135 p = fword + sp->ts_fidx; 11460 * SWAP3 etc. also don't make sense then. */
11461 if (!soundfold && !spell_iswordp(p, curbuf))
11462 {
11463 sp->ts_state = STATE_REP_INI;
11464 break;
11465 }
11466
11467 #ifdef FEAT_MBYTE
11468 if (has_mbyte)
11469 {
11470 n = mb_cptr2len(p);
11471 c = mb_ptr2char(p);
11472 if (!soundfold && !spell_iswordp(p + n, curbuf))
11473 c2 = c; /* don't swap non-word char */
11474 else
11475 c2 = mb_ptr2char(p + n);
11476 }
11477 else
11478 #endif
11479 {
11480 if (!soundfold && !spell_iswordp(p + 1, curbuf))
11481 c2 = c; /* don't swap non-word char */
11482 else
11483 c2 = p[1];
11484 }
11485
11486 /* When characters are identical, swap won't do anything.
11487 * Also get here if the second char is not a word character. */
11488 if (c == c2)
11489 {
11490 sp->ts_state = STATE_SWAP3;
11491 break;
11492 }
11493 if (c2 != NUL && TRY_DEEPER(su, stack, depth, SCORE_SWAP))
11494 {
11495 go_deeper(stack, depth, SCORE_SWAP);
11496 #ifdef DEBUG_TRIEWALK
11497 sprintf(changename[depth], "%.*s-%s: swap %c and %c",
11498 sp->ts_twordlen, tword, fword + sp->ts_fidx,
11499 c, c2);
11500 #endif
11501 sp->ts_state = STATE_UNSWAP;
11502 ++depth;
10136 #ifdef FEAT_MBYTE 11503 #ifdef FEAT_MBYTE
10137 if (has_mbyte) 11504 if (has_mbyte)
10138 { 11505 {
10139 n = MB_BYTE2LEN(*p); 11506 fl = mb_char2len(c2);
10140 c = mb_ptr2char(p + n); 11507 mch_memmove(p, p + n, fl);
10141 mch_memmove(p + MB_BYTE2LEN(p[n]), p, n); 11508 mb_char2bytes(c, p + fl);
10142 mb_char2bytes(c, p); 11509 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
10143 } 11510 }
10144 else 11511 else
10145 #endif 11512 #endif
10146 { 11513 {
10147 c = *p; 11514 p[0] = c2;
10148 *p = p[1];
10149 p[1] = c; 11515 p[1] = c;
11516 stack[depth].ts_fidxtry = sp->ts_fidx + 2;
10150 } 11517 }
10151 /*FALLTHROUGH*/ 11518 }
10152 11519 else
10153 case STATE_SWAP3: 11520 /* If this swap doesn't work then SWAP3 won't either. */
10154 /* Swap two bytes, skipping one: "123" -> "321". We change 11521 sp->ts_state = STATE_REP_INI;
10155 * "fword" here, it's changed back afterwards. */ 11522 break;
11523
11524 case STATE_UNSWAP:
11525 /* Undo the STATE_SWAP swap: "21" -> "12". */
11526 p = fword + sp->ts_fidx;
11527 #ifdef FEAT_MBYTE
11528 if (has_mbyte)
11529 {
11530 n = MB_BYTE2LEN(*p);
11531 c = mb_ptr2char(p + n);
11532 mch_memmove(p + MB_BYTE2LEN(p[n]), p, n);
11533 mb_char2bytes(c, p);
11534 }
11535 else
11536 #endif
11537 {
11538 c = *p;
11539 *p = p[1];
11540 p[1] = c;
11541 }
11542 /*FALLTHROUGH*/
11543
11544 case STATE_SWAP3:
11545 /* Swap two bytes, skipping one: "123" -> "321". We change
11546 * "fword" here, it's changed back afterwards at STATE_UNSWAP3. */
11547 p = fword + sp->ts_fidx;
11548 #ifdef FEAT_MBYTE
11549 if (has_mbyte)
11550 {
11551 n = mb_cptr2len(p);
11552 c = mb_ptr2char(p);
11553 fl = mb_cptr2len(p + n);
11554 c2 = mb_ptr2char(p + n);
11555 if (!soundfold && !spell_iswordp(p + n + fl, curbuf))
11556 c3 = c; /* don't swap non-word char */
11557 else
11558 c3 = mb_ptr2char(p + n + fl);
11559 }
11560 else
11561 #endif
11562 {
11563 c = *p;
11564 c2 = p[1];
11565 if (!soundfold && !spell_iswordp(p + 2, curbuf))
11566 c3 = c; /* don't swap non-word char */
11567 else
11568 c3 = p[2];
11569 }
11570
11571 /* When characters are identical: "121" then SWAP3 result is
11572 * identical, ROT3L result is same as SWAP: "211", ROT3L result is
11573 * same as SWAP on next char: "112". Thus skip all swapping.
11574 * Also skip when c3 is NUL.
11575 * Also get here when the third character is not a word character.
11576 * Second character may any char: "a.b" -> "b.a" */
11577 if (c == c3 || c3 == NUL)
11578 {
11579 sp->ts_state = STATE_REP_INI;
11580 break;
11581 }
11582 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
11583 {
11584 go_deeper(stack, depth, SCORE_SWAP3);
11585 #ifdef DEBUG_TRIEWALK
11586 sprintf(changename[depth], "%.*s-%s: swap3 %c and %c",
11587 sp->ts_twordlen, tword, fword + sp->ts_fidx,
11588 c, c3);
11589 #endif
11590 sp->ts_state = STATE_UNSWAP3;
11591 ++depth;
11592 #ifdef FEAT_MBYTE
11593 if (has_mbyte)
11594 {
11595 tl = mb_char2len(c3);
11596 mch_memmove(p, p + n + fl, tl);
11597 mb_char2bytes(c2, p + tl);
11598 mb_char2bytes(c, p + fl + tl);
11599 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl;
11600 }
11601 else
11602 #endif
11603 {
11604 p[0] = p[2];
11605 p[2] = c;
11606 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
11607 }
11608 }
11609 else
11610 sp->ts_state = STATE_REP_INI;
11611 break;
11612
11613 case STATE_UNSWAP3:
11614 /* Undo STATE_SWAP3: "321" -> "123" */
11615 p = fword + sp->ts_fidx;
11616 #ifdef FEAT_MBYTE
11617 if (has_mbyte)
11618 {
11619 n = MB_BYTE2LEN(*p);
11620 c2 = mb_ptr2char(p + n);
11621 fl = MB_BYTE2LEN(p[n]);
11622 c = mb_ptr2char(p + n + fl);
11623 tl = MB_BYTE2LEN(p[n + fl]);
11624 mch_memmove(p + fl + tl, p, n);
11625 mb_char2bytes(c, p);
11626 mb_char2bytes(c2, p + tl);
11627 p = p + tl;
11628 }
11629 else
11630 #endif
11631 {
11632 c = *p;
11633 *p = p[2];
11634 p[2] = c;
11635 ++p;
11636 }
11637
11638 if (!soundfold && !spell_iswordp(p, curbuf))
11639 {
11640 /* Middle char is not a word char, skip the rotate. First and
11641 * third char were already checked at swap and swap3. */
11642 sp->ts_state = STATE_REP_INI;
11643 break;
11644 }
11645
11646 /* Rotate three characters left: "123" -> "231". We change
11647 * "fword" here, it's changed back afterwards at STATE_UNROT3L. */
11648 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
11649 {
11650 go_deeper(stack, depth, SCORE_SWAP3);
11651 #ifdef DEBUG_TRIEWALK
11652 p = fword + sp->ts_fidx;
11653 sprintf(changename[depth], "%.*s-%s: rotate left %c%c%c",
11654 sp->ts_twordlen, tword, fword + sp->ts_fidx,
11655 p[0], p[1], p[2]);
11656 #endif
11657 sp->ts_state = STATE_UNROT3L;
11658 ++depth;
10156 p = fword + sp->ts_fidx; 11659 p = fword + sp->ts_fidx;
10157 #ifdef FEAT_MBYTE 11660 #ifdef FEAT_MBYTE
10158 if (has_mbyte) 11661 if (has_mbyte)
10159 { 11662 {
10160 n = mb_cptr2len(p); 11663 n = mb_cptr2len(p);
10161 c = mb_ptr2char(p); 11664 c = mb_ptr2char(p);
10162 fl = mb_cptr2len(p + n); 11665 fl = mb_cptr2len(p + n);
10163 c2 = mb_ptr2char(p + n); 11666 fl += mb_cptr2len(p + n + fl);
10164 if (!spell_iswordp(p + n + fl, curbuf)) 11667 mch_memmove(p, p + n, fl);
10165 c3 = c; /* don't swap non-word char */ 11668 mb_char2bytes(c, p + fl);
10166 else 11669 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
10167 c3 = mb_ptr2char(p + n + fl);
10168 } 11670 }
10169 else 11671 else
10170 #endif 11672 #endif
10171 { 11673 {
10172 c = *p; 11674 c = *p;
10173 c2 = p[1]; 11675 *p = p[1];
10174 if (!spell_iswordp(p + 2, curbuf)) 11676 p[1] = p[2];
10175 c3 = c; /* don't swap non-word char */ 11677 p[2] = c;
10176 else 11678 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
10177 c3 = p[2];
10178 } 11679 }
10179 11680 }
10180 /* When characters are identical: "121" then SWAP3 result is 11681 else
10181 * identical, ROT3L result is same as SWAP: "211", ROT3L 11682 sp->ts_state = STATE_REP_INI;
10182 * result is same as SWAP on next char: "112". Thus skip all 11683 break;
10183 * swapping. Also skip when c3 is NUL. 11684
10184 * Also get here when the third character is not a word 11685 case STATE_UNROT3L:
10185 * character. Second character may any char: "a.b" -> "b.a" */ 11686 /* Undo ROT3L: "231" -> "123" */
10186 if (c == c3 || c3 == NUL) 11687 p = fword + sp->ts_fidx;
10187 {
10188 sp->ts_state = STATE_REP_INI;
10189 break;
10190 }
10191 if (try_deeper(su, stack, depth, SCORE_SWAP3))
10192 {
10193 sp->ts_state = STATE_UNSWAP3;
10194 ++depth;
10195 #ifdef FEAT_MBYTE 11688 #ifdef FEAT_MBYTE
10196 if (has_mbyte) 11689 if (has_mbyte)
10197 { 11690 {
10198 tl = mb_char2len(c3); 11691 n = MB_BYTE2LEN(*p);
10199 mch_memmove(p, p + n + fl, tl); 11692 n += MB_BYTE2LEN(p[n]);
10200 mb_char2bytes(c2, p + tl); 11693 c = mb_ptr2char(p + n);
10201 mb_char2bytes(c, p + fl + tl); 11694 tl = MB_BYTE2LEN(p[n]);
10202 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl; 11695 mch_memmove(p + tl, p, n);
10203 } 11696 mb_char2bytes(c, p);
10204 else 11697 }
11698 else
10205 #endif 11699 #endif
10206 { 11700 {
10207 p[0] = p[2]; 11701 c = p[2];
10208 p[2] = c; 11702 p[2] = p[1];
10209 stack[depth].ts_fidxtry = sp->ts_fidx + 3; 11703 p[1] = *p;
10210 } 11704 *p = c;
10211 } 11705 }
10212 else 11706
10213 sp->ts_state = STATE_REP_INI; 11707 /* Rotate three bytes right: "123" -> "312". We change "fword"
10214 break; 11708 * here, it's changed back afterwards at STATE_UNROT3R. */
10215 11709 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3))
10216 case STATE_UNSWAP3: 11710 {
10217 /* Undo STATE_SWAP3: "321" -> "123" */ 11711 go_deeper(stack, depth, SCORE_SWAP3);
11712 #ifdef DEBUG_TRIEWALK
11713 p = fword + sp->ts_fidx;
11714 sprintf(changename[depth], "%.*s-%s: rotate right %c%c%c",
11715 sp->ts_twordlen, tword, fword + sp->ts_fidx,
11716 p[0], p[1], p[2]);
11717 #endif
11718 sp->ts_state = STATE_UNROT3R;
11719 ++depth;
10218 p = fword + sp->ts_fidx; 11720 p = fword + sp->ts_fidx;
10219 #ifdef FEAT_MBYTE 11721 #ifdef FEAT_MBYTE
10220 if (has_mbyte) 11722 if (has_mbyte)
10221 { 11723 {
10222 n = MB_BYTE2LEN(*p); 11724 n = mb_cptr2len(p);
10223 c2 = mb_ptr2char(p + n); 11725 n += mb_cptr2len(p + n);
10224 fl = MB_BYTE2LEN(p[n]);
10225 c = mb_ptr2char(p + n + fl);
10226 tl = MB_BYTE2LEN(p[n + fl]);
10227 mch_memmove(p + fl + tl, p, n);
10228 mb_char2bytes(c, p);
10229 mb_char2bytes(c2, p + tl);
10230 p = p + tl;
10231 }
10232 else
10233 #endif
10234 {
10235 c = *p;
10236 *p = p[2];
10237 p[2] = c;
10238 ++p;
10239 }
10240
10241 if (!spell_iswordp(p, curbuf))
10242 {
10243 /* Middle char is not a word char, skip the rotate.
10244 * First and third char were already checked at swap
10245 * and swap3. */
10246 sp->ts_state = STATE_REP_INI;
10247 break;
10248 }
10249
10250 /* Rotate three characters left: "123" -> "231". We change
10251 * "fword" here, it's changed back afterwards. */
10252 if (try_deeper(su, stack, depth, SCORE_SWAP3))
10253 {
10254 sp->ts_state = STATE_UNROT3L;
10255 ++depth;
10256 p = fword + sp->ts_fidx;
10257 #ifdef FEAT_MBYTE
10258 if (has_mbyte)
10259 {
10260 n = mb_cptr2len(p);
10261 c = mb_ptr2char(p);
10262 fl = mb_cptr2len(p + n);
10263 fl += mb_cptr2len(p + n + fl);
10264 mch_memmove(p, p + n, fl);
10265 mb_char2bytes(c, p + fl);
10266 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
10267 }
10268 else
10269 #endif
10270 {
10271 c = *p;
10272 *p = p[1];
10273 p[1] = p[2];
10274 p[2] = c;
10275 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
10276 }
10277 }
10278 else
10279 sp->ts_state = STATE_REP_INI;
10280 break;
10281
10282 case STATE_UNROT3L:
10283 /* Undo ROT3L: "231" -> "123" */
10284 p = fword + sp->ts_fidx;
10285 #ifdef FEAT_MBYTE
10286 if (has_mbyte)
10287 {
10288 n = MB_BYTE2LEN(*p);
10289 n += MB_BYTE2LEN(p[n]);
10290 c = mb_ptr2char(p + n); 11726 c = mb_ptr2char(p + n);
10291 tl = MB_BYTE2LEN(p[n]); 11727 tl = mb_cptr2len(p + n);
10292 mch_memmove(p + tl, p, n); 11728 mch_memmove(p + tl, p, n);
10293 mb_char2bytes(c, p); 11729 mb_char2bytes(c, p);
11730 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl;
10294 } 11731 }
10295 else 11732 else
10296 #endif 11733 #endif
10297 { 11734 {
10298 c = p[2]; 11735 c = p[2];
10299 p[2] = p[1]; 11736 p[2] = p[1];
10300 p[1] = *p; 11737 p[1] = *p;
10301 *p = c; 11738 *p = c;
11739 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
10302 } 11740 }
10303 11741 }
10304 /* Rotate three bytes right: "123" -> "312". We change 11742 else
10305 * "fword" here, it's changed back afterwards. */ 11743 sp->ts_state = STATE_REP_INI;
10306 if (try_deeper(su, stack, depth, SCORE_SWAP3)) 11744 break;
11745
11746 case STATE_UNROT3R:
11747 /* Undo ROT3R: "312" -> "123" */
11748 p = fword + sp->ts_fidx;
11749 #ifdef FEAT_MBYTE
11750 if (has_mbyte)
11751 {
11752 c = mb_ptr2char(p);
11753 tl = MB_BYTE2LEN(*p);
11754 n = MB_BYTE2LEN(p[tl]);
11755 n += MB_BYTE2LEN(p[tl + n]);
11756 mch_memmove(p, p + tl, n);
11757 mb_char2bytes(c, p + n);
11758 }
11759 else
11760 #endif
11761 {
11762 c = *p;
11763 *p = p[1];
11764 p[1] = p[2];
11765 p[2] = c;
11766 }
11767 /*FALLTHROUGH*/
11768
11769 case STATE_REP_INI:
11770 /* Check if matching with REP items from the .aff file would work.
11771 * Quickly skip if:
11772 * - there are no REP items and we are not in the soundfold trie
11773 * - the score is going to be too high anyway
11774 * - already applied a REP item or swapped here */
11775 if ((lp->lp_replang == NULL && !soundfold)
11776 || sp->ts_score + SCORE_REP >= su->su_maxscore
11777 || sp->ts_fidx < sp->ts_fidxtry)
11778 {
11779 sp->ts_state = STATE_FINAL;
11780 break;
11781 }
11782
11783 /* Use the first byte to quickly find the first entry that may
11784 * match. If the index is -1 there is none. */
11785 if (soundfold)
11786 sp->ts_curi = slang->sl_repsal_first[fword[sp->ts_fidx]];
11787 else
11788 sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]];
11789
11790 if (sp->ts_curi < 0)
11791 {
11792 sp->ts_state = STATE_FINAL;
11793 break;
11794 }
11795
11796 sp->ts_state = STATE_REP;
11797 /*FALLTHROUGH*/
11798
11799 case STATE_REP:
11800 /* Try matching with REP items from the .aff file. For each match
11801 * replace the characters and check if the resulting word is
11802 * valid. */
11803 p = fword + sp->ts_fidx;
11804
11805 if (soundfold)
11806 gap = &slang->sl_repsal;
11807 else
11808 gap = &lp->lp_replang->sl_rep;
11809 while (sp->ts_curi < gap->ga_len)
11810 {
11811 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++;
11812 if (*ftp->ft_from != *p)
10307 { 11813 {
10308 sp->ts_state = STATE_UNROT3R; 11814 /* past possible matching entries */
10309 ++depth; 11815 sp->ts_curi = gap->ga_len;
10310 p = fword + sp->ts_fidx;
10311 #ifdef FEAT_MBYTE
10312 if (has_mbyte)
10313 {
10314 n = mb_cptr2len(p);
10315 n += mb_cptr2len(p + n);
10316 c = mb_ptr2char(p + n);
10317 tl = mb_cptr2len(p + n);
10318 mch_memmove(p + tl, p, n);
10319 mb_char2bytes(c, p);
10320 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl;
10321 }
10322 else
10323 #endif
10324 {
10325 c = p[2];
10326 p[2] = p[1];
10327 p[1] = *p;
10328 *p = c;
10329 stack[depth].ts_fidxtry = sp->ts_fidx + 3;
10330 }
10331 }
10332 else
10333 sp->ts_state = STATE_REP_INI;
10334 break;
10335
10336 case STATE_UNROT3R:
10337 /* Undo ROT3R: "312" -> "123" */
10338 p = fword + sp->ts_fidx;
10339 #ifdef FEAT_MBYTE
10340 if (has_mbyte)
10341 {
10342 c = mb_ptr2char(p);
10343 tl = MB_BYTE2LEN(*p);
10344 n = MB_BYTE2LEN(p[tl]);
10345 n += MB_BYTE2LEN(p[tl + n]);
10346 mch_memmove(p, p + tl, n);
10347 mb_char2bytes(c, p + n);
10348 }
10349 else
10350 #endif
10351 {
10352 c = *p;
10353 *p = p[1];
10354 p[1] = p[2];
10355 p[2] = c;
10356 }
10357 /*FALLTHROUGH*/
10358
10359 case STATE_REP_INI:
10360 /* Check if matching with REP items from the .aff file would
10361 * work. Quickly skip if:
10362 * - there are no REP items
10363 * - the score is going to be too high anyway
10364 * - already applied a REP item or swapped here */
10365 if (lp->lp_replang == NULL
10366 || sp->ts_score + SCORE_REP >= su->su_maxscore
10367 || sp->ts_fidx < sp->ts_fidxtry)
10368 {
10369 sp->ts_state = STATE_FINAL;
10370 break; 11816 break;
10371 } 11817 }
10372 gap = &lp->lp_replang->sl_rep; 11818 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0
10373 11819 && TRY_DEEPER(su, stack, depth, SCORE_REP))
10374 /* Use the first byte to quickly find the first entry that
10375 * may match. If the index is -1 there is none. */
10376 sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]];
10377 if (sp->ts_curi < 0)
10378 { 11820 {
10379 sp->ts_state = STATE_FINAL; 11821 go_deeper(stack, depth, SCORE_REP);
11822 #ifdef DEBUG_TRIEWALK
11823 sprintf(changename[depth], "%.*s-%s: replace %s with %s",
11824 sp->ts_twordlen, tword, fword + sp->ts_fidx,
11825 ftp->ft_from, ftp->ft_to);
11826 #endif
11827 /* Need to undo this afterwards. */
11828 sp->ts_state = STATE_REP_UNDO;
11829
11830 /* Change the "from" to the "to" string. */
11831 ++depth;
11832 fl = STRLEN(ftp->ft_from);
11833 tl = STRLEN(ftp->ft_to);
11834 if (fl != tl)
11835 {
11836 mch_memmove(p + tl, p + fl, STRLEN(p + fl) + 1);
11837 repextra += tl - fl;
11838 }
11839 mch_memmove(p, ftp->ft_to, tl);
11840 stack[depth].ts_fidxtry = sp->ts_fidx + tl;
11841 #ifdef FEAT_MBYTE
11842 stack[depth].ts_tcharlen = 0;
11843 #endif
10380 break; 11844 break;
10381 } 11845 }
10382 11846 }
10383 sp->ts_state = STATE_REP; 11847
10384 /*FALLTHROUGH*/ 11848 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP)
10385 11849 /* No (more) matches. */
10386 case STATE_REP: 11850 sp->ts_state = STATE_FINAL;
10387 /* Try matching with REP items from the .aff file. For each 11851
10388 * match replace the characters and check if the resulting 11852 break;
10389 * word is valid. */ 11853
10390 p = fword + sp->ts_fidx; 11854 case STATE_REP_UNDO:
10391 11855 /* Undo a REP replacement and continue with the next one. */
11856 if (soundfold)
11857 gap = &slang->sl_repsal;
11858 else
10392 gap = &lp->lp_replang->sl_rep; 11859 gap = &lp->lp_replang->sl_rep;
10393 while (sp->ts_curi < gap->ga_len) 11860 ftp = (fromto_T *)gap->ga_data + sp->ts_curi - 1;
10394 { 11861 fl = STRLEN(ftp->ft_from);
10395 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++; 11862 tl = STRLEN(ftp->ft_to);
10396 if (*ftp->ft_from != *p) 11863 p = fword + sp->ts_fidx;
10397 { 11864 if (fl != tl)
10398 /* past possible matching entries */ 11865 {
10399 sp->ts_curi = gap->ga_len; 11866 mch_memmove(p + fl, p + tl, STRLEN(p + tl) + 1);
10400 break; 11867 repextra -= tl - fl;
10401 } 11868 }
10402 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0 11869 mch_memmove(p, ftp->ft_from, fl);
10403 && try_deeper(su, stack, depth, SCORE_REP)) 11870 sp->ts_state = STATE_REP;
10404 { 11871 break;
10405 /* Need to undo this afterwards. */ 11872
10406 sp->ts_state = STATE_REP_UNDO; 11873 default:
10407 11874 /* Did all possible states at this level, go up one level. */
10408 /* Change the "from" to the "to" string. */ 11875 --depth;
10409 ++depth; 11876
10410 fl = STRLEN(ftp->ft_from); 11877 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE)
10411 tl = STRLEN(ftp->ft_to); 11878 {
10412 if (fl != tl) 11879 /* Continue in or go back to the prefix tree. */
10413 { 11880 byts = pbyts;
10414 mch_memmove(p + tl, p + fl, STRLEN(p + fl) + 1); 11881 idxs = pidxs;
10415 repextra += tl - fl; 11882 }
10416 } 11883
10417 mch_memmove(p, ftp->ft_to, tl); 11884 /* Don't check for CTRL-C too often, it takes time. */
10418 stack[depth].ts_fidxtry = sp->ts_fidx + tl; 11885 if (--breakcheckcount == 0)
10419 #ifdef FEAT_MBYTE 11886 {
10420 stack[depth].ts_tcharlen = 0; 11887 ui_breakcheck();
10421 #endif 11888 breakcheckcount = 1000;
10422 break; 11889 }
10423 } 11890 }
10424 } 11891 }
10425 11892 }
10426 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP) 11893
10427 /* No (more) matches. */ 11894
10428 sp->ts_state = STATE_FINAL; 11895 /*
10429 11896 * Go one level deeper in the tree.
10430 break; 11897 */
10431 11898 static void
10432 case STATE_REP_UNDO: 11899 go_deeper(stack, depth, score_add)
10433 /* Undo a REP replacement and continue with the next one. */
10434 ftp = (fromto_T *)lp->lp_replang->sl_rep.ga_data
10435 + sp->ts_curi - 1;
10436 fl = STRLEN(ftp->ft_from);
10437 tl = STRLEN(ftp->ft_to);
10438 p = fword + sp->ts_fidx;
10439 if (fl != tl)
10440 {
10441 mch_memmove(p + fl, p + tl, STRLEN(p + tl) + 1);
10442 repextra -= tl - fl;
10443 }
10444 mch_memmove(p, ftp->ft_from, fl);
10445 sp->ts_state = STATE_REP;
10446 break;
10447
10448 default:
10449 /* Did all possible states at this level, go up one level. */
10450 --depth;
10451
10452 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE)
10453 {
10454 /* Continue in or go back to the prefix tree. */
10455 byts = pbyts;
10456 idxs = pidxs;
10457 }
10458
10459 /* Don't check for CTRL-C too often, it takes time. */
10460 line_breakcheck();
10461 }
10462 }
10463 }
10464 }
10465
10466 /*
10467 * Try going one level deeper in the tree.
10468 */
10469 static int
10470 try_deeper(su, stack, depth, score_add)
10471 suginfo_T *su;
10472 trystate_T *stack; 11900 trystate_T *stack;
10473 int depth; 11901 int depth;
10474 int score_add; 11902 int score_add;
10475 { 11903 {
10476 int newscore;
10477
10478 /* Refuse to go deeper if the scrore is getting too big. */
10479 newscore = stack[depth].ts_score + score_add;
10480 if (newscore >= su->su_maxscore)
10481 return FALSE;
10482
10483 stack[depth + 1] = stack[depth]; 11904 stack[depth + 1] = stack[depth];
10484 stack[depth + 1].ts_state = STATE_START; 11905 stack[depth + 1].ts_state = STATE_START;
10485 stack[depth + 1].ts_score = newscore; 11906 stack[depth + 1].ts_score = stack[depth].ts_score + score_add;
10486 stack[depth + 1].ts_curi = 1; /* start just after length byte */ 11907 stack[depth + 1].ts_curi = 1; /* start just after length byte */
10487 stack[depth + 1].ts_flags = 0; 11908 stack[depth + 1].ts_flags = 0;
10488 return TRUE;
10489 } 11909 }
10490 11910
10491 #ifdef FEAT_MBYTE 11911 #ifdef FEAT_MBYTE
10492 /* 11912 /*
10493 * Case-folding may change the number of bytes: Count nr of chars in 11913 * Case-folding may change the number of bytes: Count nr of chars in
10711 /* Add the suggestion. */ 12131 /* Add the suggestion. */
10712 sstp = &SUG(su->su_sga, su->su_sga.ga_len); 12132 sstp = &SUG(su->su_sga, su->su_sga.ga_len);
10713 sstp->st_word = vim_strsave(stp->st_word); 12133 sstp->st_word = vim_strsave(stp->st_word);
10714 if (sstp->st_word != NULL) 12134 if (sstp->st_word != NULL)
10715 { 12135 {
12136 sstp->st_wordlen = stp->st_wordlen;
10716 sstp->st_score = score; 12137 sstp->st_score = score;
10717 sstp->st_altscore = 0; 12138 sstp->st_altscore = 0;
10718 sstp->st_orglen = stp->st_orglen; 12139 sstp->st_orglen = stp->st_orglen;
10719 ++su->su_sga.ga_len; 12140 ++su->su_sga.ga_len;
10720 } 12141 }
10741 suggest_T *stp; 12162 suggest_T *stp;
10742 char_u *p; 12163 char_u *p;
10743 char_u badsound[MAXWLEN]; 12164 char_u badsound[MAXWLEN];
10744 int round; 12165 int round;
10745 int lpi; 12166 int lpi;
12167 slang_T *slang = NULL;
10746 12168
10747 /* Add the alternate score to su_ga. */ 12169 /* Add the alternate score to su_ga. */
10748 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi) 12170 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi)
10749 { 12171 {
10750 lp = LANGP_ENTRY(curbuf->b_langp, lpi); 12172 lp = LANGP_ENTRY(curbuf->b_langp, lpi);
10751 if (lp->lp_slang->sl_sal.ga_len > 0) 12173 if (lp->lp_slang->sl_sal.ga_len > 0)
10752 { 12174 {
10753 /* soundfold the bad word */ 12175 /* soundfold the bad word */
10754 spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, badsound); 12176 slang = lp->lp_slang;
12177 spell_soundfold(slang, su->su_fbadword, TRUE, badsound);
10755 12178
10756 for (i = 0; i < su->su_ga.ga_len; ++i) 12179 for (i = 0; i < su->su_ga.ga_len; ++i)
10757 { 12180 {
10758 stp = &SUG(su->su_ga, i); 12181 stp = &SUG(su->su_ga, i);
10759 stp->st_altscore = stp_sal_score(stp, su, lp->lp_slang, 12182 stp->st_altscore = stp_sal_score(stp, su, slang, badsound);
10760 badsound);
10761 if (stp->st_altscore == SCORE_MAXMAX) 12183 if (stp->st_altscore == SCORE_MAXMAX)
10762 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4; 12184 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4;
10763 else 12185 else
10764 stp->st_score = (stp->st_score * 3 12186 stp->st_score = (stp->st_score * 3
10765 + stp->st_altscore) / 4; 12187 + stp->st_altscore) / 4;
10767 } 12189 }
10768 break; 12190 break;
10769 } 12191 }
10770 } 12192 }
10771 12193
12194 if (slang == NULL) /* just in case */
12195 return;
12196
10772 /* Add the alternate score to su_sga. */ 12197 /* Add the alternate score to su_sga. */
10773 for (i = 0; i < su->su_sga.ga_len; ++i) 12198 for (i = 0; i < su->su_sga.ga_len; ++i)
10774 { 12199 {
10775 stp = &SUG(su->su_sga, i); 12200 stp = &SUG(su->su_sga, i);
10776 stp->st_altscore = spell_edit_score(su->su_badword, stp->st_word); 12201 stp->st_altscore = spell_edit_score(slang,
12202 su->su_badword, stp->st_word);
10777 if (stp->st_score == SCORE_MAXMAX) 12203 if (stp->st_score == SCORE_MAXMAX)
10778 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8; 12204 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8;
10779 else 12205 else
10780 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8; 12206 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8;
10781 stp->st_salscore = TRUE; 12207 stp->st_salscore = TRUE;
10782 } 12208 }
10783 12209
10784 /* Sort the suggestions and truncate at "maxcount" for both lists. */ 12210 /* Remove bad suggestions, sort the suggestions and truncate at "maxcount"
12211 * for both lists. */
12212 check_suggestions(su, &su->su_ga);
10785 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); 12213 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
12214 check_suggestions(su, &su->su_sga);
10786 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount); 12215 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount);
10787 12216
10788 ga_init2(&ga, (int)sizeof(suginfo_T), 1); 12217 ga_init2(&ga, (int)sizeof(suginfo_T), 1);
10789 if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL) 12218 if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL)
10790 return; 12219 return;
10870 if (lendiff > 0) 12299 if (lendiff > 0)
10871 { 12300 {
10872 /* Add part of the bad word to the good word, so that we soundfold 12301 /* Add part of the bad word to the good word, so that we soundfold
10873 * what replaces the bad word. */ 12302 * what replaces the bad word. */
10874 STRCPY(goodword, stp->st_word); 12303 STRCPY(goodword, stp->st_word);
10875 STRNCAT(goodword, su->su_badptr + su->su_badlen - lendiff, lendiff); 12304 vim_strncpy(goodword + stp->st_wordlen,
12305 su->su_badptr + su->su_badlen - lendiff, lendiff);
10876 pgood = goodword; 12306 pgood = goodword;
10877 } 12307 }
10878 else 12308 else
10879 pgood = stp->st_word; 12309 pgood = stp->st_word;
10880 12310
10881 /* Sound-fold the word and compute the score for the difference. */ 12311 /* Sound-fold the word and compute the score for the difference. */
10882 spell_soundfold(slang, pgood, FALSE, goodsound); 12312 spell_soundfold(slang, pgood, FALSE, goodsound);
10883 12313
10884 return soundalike_score(goodsound, pbad); 12314 return soundalike_score(goodsound, pbad);
12315 }
12316
12317 /* structure used to store soundfolded words that add_sound_suggest() has
12318 * handled already. */
12319 typedef struct
12320 {
12321 short sft_score; /* lowest score used */
12322 char_u sft_word[1]; /* soundfolded word, actually longer */
12323 } sftword_T;
12324
12325 static sftword_T dumsft;
12326 #define HIKEY2SFT(p) ((sftword_T *)(p - (dumsft.sft_word - (char_u *)&dumsft)))
12327 #define HI2SFT(hi) HIKEY2SFT((hi)->hi_key)
12328
12329 /*
12330 * Prepare for calling suggest_try_soundalike().
12331 */
12332 static void
12333 suggest_try_soundalike_prep()
12334 {
12335 langp_T *lp;
12336 int lpi;
12337 slang_T *slang;
12338
12339 /* Do this for all languages that support sound folding and for which a
12340 * .sug file has been loaded. */
12341 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi)
12342 {
12343 lp = LANGP_ENTRY(curbuf->b_langp, lpi);
12344 slang = lp->lp_slang;
12345 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
12346 /* prepare the hashtable used by add_sound_suggest() */
12347 hash_init(&slang->sl_sounddone);
12348 }
10885 } 12349 }
10886 12350
10887 /* 12351 /*
10888 * Find suggestions by comparing the word in a sound-a-like form. 12352 * Find suggestions by comparing the word in a sound-a-like form.
10889 * Note: This doesn't support postponed prefixes. 12353 * Note: This doesn't support postponed prefixes.
10891 static void 12355 static void
10892 suggest_try_soundalike(su) 12356 suggest_try_soundalike(su)
10893 suginfo_T *su; 12357 suginfo_T *su;
10894 { 12358 {
10895 char_u salword[MAXWLEN]; 12359 char_u salword[MAXWLEN];
10896 char_u tword[MAXWLEN];
10897 char_u tsalword[MAXWLEN];
10898 idx_T arridx[MAXWLEN];
10899 int curi[MAXWLEN];
10900 langp_T *lp; 12360 langp_T *lp;
12361 int lpi;
12362 slang_T *slang;
12363
12364 /* Do this for all languages that support sound folding and for which a
12365 * .sug file has been loaded. */
12366 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi)
12367 {
12368 lp = LANGP_ENTRY(curbuf->b_langp, lpi);
12369 slang = lp->lp_slang;
12370 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
12371 {
12372 /* soundfold the bad word */
12373 spell_soundfold(slang, su->su_fbadword, TRUE, salword);
12374
12375 /* try all kinds of inserts/deletes/swaps/etc. */
12376 /* TODO: also soundfold the next words, so that we can try joining
12377 * and splitting */
12378 suggest_trie_walk(su, lp, salword, TRUE);
12379 }
12380 }
12381 }
12382
12383 /*
12384 * Finish up after calling suggest_try_soundalike().
12385 */
12386 static void
12387 suggest_try_soundalike_finish()
12388 {
12389 langp_T *lp;
12390 int lpi;
12391 slang_T *slang;
12392 int todo;
12393 hashitem_T *hi;
12394
12395 /* Do this for all languages that support sound folding and for which a
12396 * .sug file has been loaded. */
12397 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi)
12398 {
12399 lp = LANGP_ENTRY(curbuf->b_langp, lpi);
12400 slang = lp->lp_slang;
12401 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL)
12402 {
12403 /* Free the info about handled words. */
12404 todo = slang->sl_sounddone.ht_used;
12405 for (hi = slang->sl_sounddone.ht_array; todo > 0; ++hi)
12406 if (!HASHITEM_EMPTY(hi))
12407 {
12408 vim_free(HI2SFT(hi));
12409 --todo;
12410 }
12411 hash_clear(&slang->sl_sounddone);
12412 }
12413 }
12414 }
12415
12416 /*
12417 * A match with a soundfolded word is found. Add the good word(s) that
12418 * produce this soundfolded word.
12419 */
12420 static void
12421 add_sound_suggest(su, goodword, score, lp)
12422 suginfo_T *su;
12423 char_u *goodword;
12424 int score; /* soundfold score */
12425 langp_T *lp;
12426 {
12427 slang_T *slang = lp->lp_slang; /* language for sound folding */
12428 int sfwordnr;
12429 char_u *nrline;
12430 int orgnr;
12431 char_u theword[MAXWLEN];
12432 int i;
12433 int wlen;
10901 char_u *byts; 12434 char_u *byts;
10902 idx_T *idxs; 12435 idx_T *idxs;
10903 int depth; 12436 int n;
10904 int c; 12437 int wordcount;
10905 idx_T n; 12438 int wc;
10906 int round; 12439 int goodscore;
10907 int flags; 12440 hash_T hash;
10908 int sound_score; 12441 hashitem_T *hi;
10909 int local_score; 12442 sftword_T *sft;
10910 int lpi; 12443 int bc, gc;
10911 slang_T *slang; 12444 int limit;
10912 12445
10913 /* Do this for all languages that support sound folding. */ 12446 /*
10914 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi) 12447 * It's very well possible that the same soundfold word is found several
10915 { 12448 * times with different scores. Since the following is quite slow only do
10916 lp = LANGP_ENTRY(curbuf->b_langp, lpi); 12449 * the words that have a better score than before. Use a hashtable to
10917 slang = lp->lp_slang; 12450 * remember the words that have been done.
10918 if (slang->sl_sal.ga_len > 0) 12451 */
10919 { 12452 hash = hash_hash(goodword);
10920 /* soundfold the bad word */ 12453 hi = hash_lookup(&slang->sl_sounddone, goodword, hash);
10921 spell_soundfold(slang, su->su_fbadword, TRUE, salword); 12454 if (HASHITEM_EMPTY(hi))
10922 12455 {
10923 /* 12456 sft = (sftword_T *)alloc(sizeof(sftword_T) + STRLEN(goodword));
10924 * Go through the whole tree, soundfold each word and compare. 12457 if (sft != NULL)
10925 * round 1: use the case-folded tree. 12458 {
10926 * round 2: use the keep-case tree. 12459 sft->sft_score = score;
10927 */ 12460 STRCPY(sft->sft_word, goodword);
10928 for (round = 1; round <= 2; ++round) 12461 hash_add_item(&slang->sl_sounddone, hi, sft->sft_word, hash);
10929 { 12462 }
10930 if (round == 1) 12463 }
12464 else
12465 {
12466 sft = HI2SFT(hi);
12467 if (score >= sft->sft_score)
12468 return;
12469 sft->sft_score = score;
12470 }
12471
12472 /*
12473 * Find the word nr in the soundfold tree.
12474 */
12475 sfwordnr = soundfold_find(slang, goodword);
12476 if (sfwordnr < 0)
12477 {
12478 EMSG2(_(e_intern2), "add_sound_suggest()");
12479 return;
12480 }
12481
12482 /*
12483 * go over the list of good words that produce this soundfold word
12484 */
12485 nrline = ml_get_buf(slang->sl_sugbuf, (linenr_T)(sfwordnr + 1), FALSE);
12486 orgnr = 0;
12487 while (*nrline != NUL)
12488 {
12489 /* The wordnr was stored in a minimal nr of bytes as an offset to the
12490 * previous wordnr. */
12491 orgnr += bytes2offset(&nrline);
12492
12493 byts = slang->sl_fbyts;
12494 idxs = slang->sl_fidxs;
12495
12496 /* Lookup the word "orgnr" one of the two tries. */
12497 n = 0;
12498 wlen = 0;
12499 wordcount = 0;
12500 for (;;)
12501 {
12502 i = 1;
12503 if (wordcount == orgnr && byts[n + 1] == NUL)
12504 break; /* found end of word */
12505
12506 if (byts[n + 1] == NUL)
12507 ++wordcount;
12508
12509 /* skip over the NUL bytes */
12510 for ( ; byts[n + i] == NUL; ++i)
12511 if (i > byts[n]) /* safety check */
10931 { 12512 {
10932 byts = slang->sl_fbyts; 12513 STRCPY(theword + wlen, "BAD");
10933 idxs = slang->sl_fidxs; 12514 goto badword;
12515 }
12516
12517 /* One of the siblings must have the word. */
12518 for ( ; i < byts[n]; ++i)
12519 {
12520 wc = idxs[idxs[n + i]]; /* nr of words under this byte */
12521 if (wordcount + wc > orgnr)
12522 break;
12523 wordcount += wc;
12524 }
12525
12526 theword[wlen++] = byts[n + i];
12527 n = idxs[n + i];
12528 }
12529 badword:
12530 theword[wlen] = NUL;
12531
12532 /* Go over the possible flags and regions. */
12533 for (; i <= byts[n] && byts[n + i] == NUL; ++i)
12534 {
12535 char_u cword[MAXWLEN];
12536 char_u *p;
12537 int flags = (int)idxs[n + i];
12538
12539 if (flags & WF_KEEPCAP)
12540 {
12541 /* Must find the word in the keep-case tree. */
12542 find_keepcap_word(slang, theword, cword);
12543 p = cword;
12544 }
12545 else
12546 {
12547 flags |= su->su_badflags;
12548 if ((flags & WF_CAPMASK) != 0)
12549 {
12550 /* Need to fix case according to "flags". */
12551 make_case_word(theword, cword, flags);
12552 p = cword;
10934 } 12553 }
10935 else 12554 else
12555 p = theword;
12556 }
12557
12558 /* Add the suggestion. */
12559 if (sps_flags & SPS_DOUBLE)
12560 {
12561 /* Add the suggestion if the score isn't too bad. */
12562 if (score <= su->su_maxscore)
12563 add_suggestion(su, &su->su_sga, p, su->su_badlen,
12564 score, 0, FALSE, slang, FALSE);
12565 }
12566 else
12567 {
12568 /* Add a penalty for words in another region. */
12569 if ((flags & WF_REGION)
12570 && (((unsigned)flags >> 16) & lp->lp_region) == 0)
12571 goodscore = SCORE_REGION;
12572 else
12573 goodscore = 0;
12574
12575 /* Add a small penalty for changing the first letter from
12576 * lower to upper case. Helps for "tath" -> "Kath", which is
12577 * less common thatn "tath" -> "path". Don't do it when the
12578 * letter is the same, that has already been counted. */
12579 gc = PTR2CHAR(p);
12580 if (SPELL_ISUPPER(gc))
10936 { 12581 {
10937 byts = slang->sl_kbyts; 12582 bc = PTR2CHAR(su->su_badword);
10938 idxs = slang->sl_kidxs; 12583 if (!SPELL_ISUPPER(bc)
10939 if (byts == NULL) /* no keep-case words */ 12584 && SPELL_TOFOLD(bc) != SPELL_TOFOLD(gc))
10940 continue; 12585 goodscore += SCORE_ICASE / 2;
10941 } 12586 }
10942 12587
10943 depth = 0; 12588 /* Compute the score for the good word. This only does letter
10944 arridx[0] = 0; 12589 * insert/delete/swap/replace. REP items are not considered,
10945 curi[0] = 1; 12590 * which may make the score a bit higher.
10946 while (depth >= 0 && !got_int) 12591 * Use a limit for the score to make it work faster. Use
12592 * MAXSCORE(), because RESCORE() will change the score.
12593 * If the limit is very high then the iterative method is
12594 * inefficient, using an array is quicker. */
12595 limit = MAXSCORE(su->su_sfmaxscore - goodscore, score);
12596 if (limit > SCORE_LIMITMAX)
12597 goodscore += spell_edit_score(slang, su->su_badword, p);
12598 else
12599 goodscore += spell_edit_score_limit(slang, su->su_badword,
12600 p, limit);
12601
12602 /* When going over the limit don't bother to do the rest. */
12603 if (goodscore < SCORE_MAXMAX)
10947 { 12604 {
10948 if (curi[depth] > byts[arridx[depth]]) 12605 /* Give a bonus to words seen before. */
10949 { 12606 goodscore = score_wordcount_adj(slang, goodscore, p, FALSE);
10950 /* Done all bytes at this node, go up one level. */ 12607
10951 --depth; 12608 /* Add the suggestion if the score isn't too bad. */
10952 line_breakcheck(); 12609 goodscore = RESCORE(goodscore, score);
10953 } 12610 if (goodscore <= su->su_sfmaxscore)
10954 else 12611 add_suggestion(su, &su->su_ga, p, su->su_badlen,
10955 { 12612 goodscore, score, TRUE, slang, TRUE);
10956 /* Do one more byte at this node. */
10957 n = arridx[depth] + curi[depth];
10958 ++curi[depth];
10959 c = byts[n];
10960 if (c == 0)
10961 {
10962 /* End of word, deal with the word. */
10963 flags = (int)idxs[n];
10964 if (round == 2 || (flags & WF_KEEPCAP) == 0)
10965 {
10966 tword[depth] = NUL;
10967 /* Sound-fold. Only in keep-case tree need to
10968 * case-fold the word. */
10969 spell_soundfold(slang, tword,
10970 round == 1, tsalword);
10971
10972 /* Compute the edit distance between the
10973 * sound-a-like words. */
10974 sound_score = soundalike_score(salword,
10975 tsalword);
10976
10977 /* Add a penalty for words in another region. */
10978 if ((flags & WF_REGION) && (((unsigned)flags
10979 >> 16) & lp->lp_region) == 0)
10980 local_score = SCORE_REGION;
10981 else
10982 local_score = 0;
10983 sound_score += local_score;
10984
10985 if (sound_score < SCORE_MAXMAX)
10986 {
10987 char_u cword[MAXWLEN];
10988 char_u *p;
10989 int score;
10990
10991 flags |= su->su_badflags;
10992 if (round == 1 && (flags & WF_CAPMASK) != 0)
10993 {
10994 /* Need to fix case according to
10995 * "flags". */
10996 make_case_word(tword, cword, flags);
10997 p = cword;
10998 }
10999 else
11000 p = tword;
11001
11002 if (sps_flags & SPS_DOUBLE)
11003 add_suggestion(su, &su->su_sga, p,
11004 su->su_badlen,
11005 sound_score, 0, FALSE,
11006 lp->lp_sallang);
11007 else
11008 {
11009 /* Compute the score. */
11010 score = spell_edit_score(
11011 su->su_badword, p)
11012 + local_score;
11013 if (sps_flags & SPS_BEST)
11014 /* give a bonus for the good word
11015 * sounding the same as the bad
11016 * word */
11017 add_suggestion(su, &su->su_ga, p,
11018 su->su_badlen,
11019 RESCORE(score, sound_score),
11020 sound_score, TRUE,
11021 lp->lp_sallang);
11022 else
11023 add_suggestion(su, &su->su_ga, p,
11024 su->su_badlen,
11025 score + sound_score,
11026 0, FALSE,
11027 lp->lp_sallang);
11028 }
11029 }
11030 }
11031
11032 /* Skip over other NUL bytes. */
11033 while (byts[n + 1] == 0)
11034 {
11035 ++n;
11036 ++curi[depth];
11037 }
11038 }
11039 else
11040 {
11041 /* Normal char, go one level deeper. */
11042 tword[depth++] = c;
11043 arridx[depth] = idxs[n];
11044 curi[depth] = 1;
11045 }
11046 }
11047 } 12613 }
11048 } 12614 }
11049 } 12615 }
11050 } 12616 /* smsg("word %s (%d): %s (%d)", sftword, sftnr, theword, orgnr); */
12617 }
12618 }
12619
12620 /*
12621 * Find word "word" in the tree stored in slang->sl_sbyts / slang->sl_sidxs
 * and return the word number.
 * NOTE(review): this comment used to call it the fold-case tree; going by the
 * field and function names it is presumably the tree of sound-folded words
 * -- confirm.
 *
 * The word number is counted while walking down the tree: one is added at
 * each node passed where a shorter word ends, and the word count of every
 * sibling that comes before the matched byte is added.
 * A <Tab> in "word" is matched like a space, and after matching a space any
 * further spaces and tabs in "word" are skipped.
 *
 * Returns -1 when "word" is not found.
12622 */
12623 static int
12624 soundfold_find(slang, word)
12625 slang_T *slang;
12626 char_u *word;
12627 {
12628 idx_T arridx = 0; /* index in byts[] and idxs[] of the current node */
12629 int len; /* number of bytes of the current node still to be checked */
12630 int wlen = 0; /* index in "word" of the byte being matched */
12631 int c;
12632 char_u *ptr = word;
12633 char_u *byts;
12634 idx_T *idxs;
12635 int wordnr = 0; /* number of words counted before "word" so far */
12636
12637 byts = slang->sl_sbyts;
12638 idxs = slang->sl_sidxs;
12639
12640 for (;;)
12641 {
12642 /* First byte is the number of possible bytes. */
12643 len = byts[arridx++];
12644
12645 /* If the first possible byte is a zero the word could end here.
12646 * If the word ends we found the word. If not skip the NUL bytes. */
12647 c = ptr[wlen];
12648 if (byts[arridx] == NUL)
12649 {
12650 if (c == NUL)
12651 break;
12652
12653 /* Skip over the zeros, there can be several. */
12654 while (len > 0 && byts[arridx] == NUL)
12655 {
12656 ++arridx;
12657 --len;
12658 }
12659 if (len == 0)
12660 return -1; /* no children, word should have ended here */
 /* A word ends at this node but "word" continues: that shorter word
 * is counted as one word coming before "word". */
12661 ++wordnr;
12662 }
12663
12664 /* If the word ends we didn't find it. */
12665 if (c == NUL)
12666 return -1;
12667
12668 /* Linear scan through the list of accepted bytes, adding up the word
 * counts of the siblings that are skipped. The scan stops at the first
 * byte that is not smaller than "c", thus the bytes are expected to be
 * sorted. */
12669 if (c == TAB) /* <Tab> is handled like <Space> */
12670 c = ' ';
12671 while (byts[arridx] < c)
12672 {
12673 /* The word count is in the first idxs[] entry of the child. */
12674 wordnr += idxs[idxs[arridx]];
12675 ++arridx;
12676 if (--len == 0) /* end of the bytes, didn't find it */
12677 return -1;
12678 }
12679 if (byts[arridx] != c) /* didn't find the byte */
12680 return -1;
12681
12682 /* Continue at the child (if there is one). */
12683 arridx = idxs[arridx];
12684 ++wlen;
12685
12686 /* One space in the good word may stand for several spaces in the
12687 * checked word. */
12688 if (c == ' ')
12689 while (ptr[wlen] == ' ' || ptr[wlen] == TAB)
12690 ++wlen;
12691 }
12692
12693 return wordnr;
11051 } 12694 }
11052 12695
11053 /* 12696 /*
11054 * Copy "fword" to "cword", fixing case according to "flags". 12697 * Copy "fword" to "cword", fixing case according to "flags".
11055 */ 12698 */
11088 lp->sl_has_map = FALSE; 12731 lp->sl_has_map = FALSE;
11089 return; 12732 return;
11090 } 12733 }
11091 lp->sl_has_map = TRUE; 12734 lp->sl_has_map = TRUE;
11092 12735
11093 /* Init the array and hash table empty. */ 12736 /* Init the array and hash tables empty. */
11094 for (i = 0; i < 256; ++i) 12737 for (i = 0; i < 256; ++i)
11095 lp->sl_map_array[i] = 0; 12738 lp->sl_map_array[i] = 0;
11096 #ifdef FEAT_MBYTE 12739 #ifdef FEAT_MBYTE
11097 hash_init(&lp->sl_map_hash); 12740 hash_init(&lp->sl_map_hash);
11098 #endif 12741 #endif
11202 return m1 == m2; 12845 return m1 == m2;
11203 } 12846 }
11204 12847
11205 /* 12848 /*
11206 * Add a suggestion to the list of suggestions. 12849 * Add a suggestion to the list of suggestions.
11207 * Do not add a duplicate suggestion or suggestions with a bad score. 12850 * For a suggestion that is already in the list the lowest score is remembered.
11208 * When "use_score" is not zero it's used, otherwise the score is computed
11209 * with spell_edit_score().
11210 */ 12851 */
11211 static void 12852 static void
11212 add_suggestion(su, gap, goodword, badlenarg, score, altscore, had_bonus, slang) 12853 add_suggestion(su, gap, goodword, badlenarg, score, altscore, had_bonus,
12854 slang, maxsf)
11213 suginfo_T *su; 12855 suginfo_T *su;
11214 garray_T *gap; 12856 garray_T *gap; /* either su_ga or su_sga */
11215 char_u *goodword; 12857 char_u *goodword;
11216 int badlenarg; /* len of bad word replaced with "goodword" */ 12858 int badlenarg; /* len of bad word replaced with "goodword" */
11217 int score; 12859 int score;
11218 int altscore; 12860 int altscore;
11219 int had_bonus; /* value for st_had_bonus */ 12861 int had_bonus; /* value for st_had_bonus */
11220 slang_T *slang; /* language for sound folding */ 12862 slang_T *slang; /* language for sound folding */
11221 { 12863 int maxsf; /* su_maxscore applies to soundfold score,
11222 int goodlen = STRLEN(goodword); /* len of goodword changed */ 12864 su_sfmaxscore to the total score. */
11223 int badlen = badlenarg; /* len of bad word changed */ 12865 {
12866 int goodlen; /* len of goodword changed */
12867 int badlen; /* len of bad word changed */
11224 suggest_T *stp; 12868 suggest_T *stp;
11225 suggest_T new_sug; 12869 suggest_T new_sug;
11226 int i; 12870 int i;
11227 hlf_T attr = HLF_COUNT;
11228 char_u longword[MAXWLEN + 1];
11229 char_u *pgood, *pbad; 12871 char_u *pgood, *pbad;
11230
11231 /* Check that the word really is valid. Esp. for banned words and for
11232 * split words, such as "the the". Need to append what follows to check
11233 * for that. */
11234 STRCPY(longword, goodword);
11235 vim_strncpy(longword + goodlen, su->su_badptr + badlen, MAXWLEN - goodlen);
11236 (void)spell_check(curwin, longword, &attr, NULL);
11237 if (attr != HLF_COUNT)
11238 return;
11239 12872
11240 /* Minimize "badlen" for consistency. Avoids that changing "the the" to 12873 /* Minimize "badlen" for consistency. Avoids that changing "the the" to
11241 * "thee the" is added next to changing the first "the" the "thee". */ 12874 * "thee the" is added next to changing the first "the" the "thee". */
11242 pgood = goodword + STRLEN(goodword); 12875 pgood = goodword + STRLEN(goodword);
11243 pbad = su->su_badptr + badlen; 12876 pbad = su->su_badptr + badlenarg;
11244 while (pgood > goodword && pbad > su->su_badptr) 12877 for (;;)
11245 { 12878 {
12879 goodlen = pgood - goodword;
12880 badlen = pbad - su->su_badptr;
12881 if (goodlen <= 0 || badlen <= 0)
12882 break;
11246 mb_ptr_back(goodword, pgood); 12883 mb_ptr_back(goodword, pgood);
11247 mb_ptr_back(su->su_badptr, pbad); 12884 mb_ptr_back(su->su_badptr, pbad);
11248 #ifdef FEAT_MBYTE 12885 #ifdef FEAT_MBYTE
11249 if (has_mbyte) 12886 if (has_mbyte)
11250 { 12887 {
11253 } 12890 }
11254 else 12891 else
11255 #endif 12892 #endif
11256 if (*pgood != *pbad) 12893 if (*pgood != *pbad)
11257 break; 12894 break;
11258 badlen = pbad - su->su_badptr; 12895 }
11259 goodlen = pgood - goodword; 12896
11260 }
11261 if (badlen == 0 && goodlen == 0) 12897 if (badlen == 0 && goodlen == 0)
11262 /* goodword doesn't change anything; may happen for "the the" changing 12898 /* goodword doesn't change anything; may happen for "the the" changing
11263 * the first "the" to itself. */ 12899 * the first "the" to itself. */
11264 return; 12900 return;
11265 12901
11266 if (score <= su->su_maxscore) 12902 /* Check if the word is already there. Also check the length that is
11267 { 12903 * being replaced "thes," -> "these" is a different suggestion from
11268 /* Check if the word is already there. Also check the length that is 12904 * "thes" -> "these". */
11269 * being replaced "thes," -> "these" is a different suggestion from 12905 stp = &SUG(*gap, 0);
11270 * "thes" -> "these". */ 12906 for (i = gap->ga_len; --i >= 0; ++stp)
11271 stp = &SUG(*gap, 0); 12907 if (stp->st_wordlen == goodlen
11272 for (i = gap->ga_len - 1; i >= 0; --i) 12908 && stp->st_orglen == badlen
11273 if ((int)STRLEN(stp[i].st_word) == goodlen 12909 && STRNCMP(stp->st_word, goodword, goodlen) == 0)
11274 && STRNCMP(stp[i].st_word, goodword, goodlen) == 0 12910 {
11275 && stp[i].st_orglen == badlen) 12911 /*
11276 { 12912 * Found it. Remember the word with the lowest score.
11277 /* 12913 */
11278 * Found it. Remember the word with the lowest score. 12914 if (stp->st_slang == NULL)
11279 */ 12915 stp->st_slang = slang;
11280 if (stp[i].st_slang == NULL) 12916
11281 stp[i].st_slang = slang; 12917 new_sug.st_score = score;
11282 12918 new_sug.st_altscore = altscore;
11283 new_sug.st_score = score; 12919 new_sug.st_had_bonus = had_bonus;
11284 new_sug.st_altscore = altscore; 12920
11285 new_sug.st_had_bonus = had_bonus; 12921 if (stp->st_had_bonus != had_bonus)
11286 12922 {
11287 if (stp[i].st_had_bonus != had_bonus) 12923 /* Only one of the two had the soundalike score computed.
12924 * Need to do that for the other one now, otherwise the
12925 * scores can't be compared. This happens because
12926 * suggest_try_change() doesn't compute the soundalike
12927 * word to keep it fast, while some special methods set
12928 * the soundalike score to zero. */
12929 if (had_bonus)
12930 rescore_one(su, stp);
12931 else
11288 { 12932 {
11289 /* Only one of the two had the soundalike score computed. 12933 new_sug.st_word = stp->st_word;
11290 * Need to do that for the other one now, otherwise the 12934 new_sug.st_wordlen = stp->st_wordlen;
11291 * scores can't be compared. This happens because 12935 new_sug.st_slang = stp->st_slang;
11292 * suggest_try_change() doesn't compute the soundalike 12936 new_sug.st_orglen = badlen;
11293 * word to keep it fast, while some special methods set 12937 rescore_one(su, &new_sug);
11294 * the soundalike score to zero. */
11295 if (had_bonus)
11296 rescore_one(su, &stp[i]);
11297 else
11298 {
11299 new_sug.st_word = goodword;
11300 new_sug.st_slang = stp[i].st_slang;
11301 new_sug.st_orglen = badlen;
11302 rescore_one(su, &new_sug);
11303 }
11304 } 12938 }
11305 12939 }
11306 if (stp[i].st_score > new_sug.st_score) 12940
12941 if (stp->st_score > new_sug.st_score)
12942 {
12943 stp->st_score = new_sug.st_score;
12944 stp->st_altscore = new_sug.st_altscore;
12945 stp->st_had_bonus = new_sug.st_had_bonus;
12946 }
12947 break;
12948 }
12949
12950 if (i < 0 && ga_grow(gap, 1) == OK)
12951 {
12952 /* Add a suggestion. */
12953 stp = &SUG(*gap, gap->ga_len);
12954 stp->st_word = vim_strnsave(goodword, goodlen);
12955 if (stp->st_word != NULL)
12956 {
12957 stp->st_wordlen = goodlen;
12958 stp->st_score = score;
12959 stp->st_altscore = altscore;
12960 stp->st_had_bonus = had_bonus;
12961 stp->st_orglen = badlen;
12962 stp->st_slang = slang;
12963 ++gap->ga_len;
12964
12965 /* If we have too many suggestions now, sort the list and keep
12966 * the best suggestions. */
12967 if (gap->ga_len > SUG_MAX_COUNT(su))
12968 {
12969 if (maxsf)
12970 su->su_sfmaxscore = cleanup_suggestions(gap,
12971 su->su_sfmaxscore, SUG_CLEAN_COUNT(su));
12972 else
11307 { 12973 {
11308 stp[i].st_score = new_sug.st_score; 12974 i = su->su_maxscore;
11309 stp[i].st_altscore = new_sug.st_altscore; 12975 su->su_maxscore = cleanup_suggestions(gap,
11310 stp[i].st_had_bonus = new_sug.st_had_bonus; 12976 su->su_maxscore, SUG_CLEAN_COUNT(su));
11311 } 12977 }
11312 break; 12978 }
11313 } 12979 }
11314 12980 }
11315 if (i < 0 && ga_grow(gap, 1) == OK) 12981 }
11316 { 12982
11317 /* Add a suggestion. */ 12983 /*
11318 stp = &SUG(*gap, gap->ga_len); 12984 * Suggestions may in fact be flagged as errors. Esp. for banned words and
11319 stp->st_word = vim_strnsave(goodword, goodlen); 12985 * for split words, such as "the the". Remove these from the list here.
11320 if (stp->st_word != NULL) 12986 */
11321 { 12987 static void
11322 stp->st_score = score; 12988 check_suggestions(su, gap)
11323 stp->st_altscore = altscore; 12989 suginfo_T *su;
11324 stp->st_had_bonus = had_bonus; 12990 garray_T *gap; /* either su_ga or su_sga */
11325 stp->st_orglen = badlen; 12991 {
11326 stp->st_slang = slang; 12992 suggest_T *stp;
11327 ++gap->ga_len; 12993 int i;
11328 12994 char_u longword[MAXWLEN + 1];
11329 /* If we have too many suggestions now, sort the list and keep 12995 int len;
11330 * the best suggestions. */ 12996 hlf_T attr;
11331 if (gap->ga_len > SUG_MAX_COUNT(su)) 12997
11332 su->su_maxscore = cleanup_suggestions(gap, su->su_maxscore, 12998 stp = &SUG(*gap, 0);
11333 SUG_CLEAN_COUNT(su)); 12999 for (i = gap->ga_len - 1; i >= 0; --i)
11334 } 13000 {
11335 } 13001 /* Need to append what follows to check for "the the". */
11336 } 13002 STRCPY(longword, stp[i].st_word);
11337 } 13003 len = stp[i].st_wordlen;
13004 vim_strncpy(longword + len, su->su_badptr + stp[i].st_orglen,
13005 MAXWLEN - len);
13006 attr = HLF_COUNT;
13007 (void)spell_check(curwin, longword, &attr, NULL, FALSE);
13008 if (attr != HLF_COUNT)
13009 {
13010 /* Remove this entry. */
13011 vim_free(stp[i].st_word);
13012 --gap->ga_len;
13013 if (i < gap->ga_len)
13014 mch_memmove(stp + i, stp + i + 1,
13015 sizeof(suggest_T) * (gap->ga_len - i));
13016 }
13017 }
13018 }
13019
11338 13020
11339 /* 13021 /*
11340 * Add a word to be banned. 13022 * Add a word to be banned.
11341 */ 13023 */
11342 static void 13024 static void
11346 { 13028 {
11347 char_u *s = vim_strsave(word); 13029 char_u *s = vim_strsave(word);
11348 hash_T hash; 13030 hash_T hash;
11349 hashitem_T *hi; 13031 hashitem_T *hi;
11350 13032
11351 if (s != NULL) 13033 hash = hash_hash(word);
11352 { 13034 hi = hash_lookup(&su->su_banned, word, hash);
11353 hash = hash_hash(s); 13035 if (HASHITEM_EMPTY(hi))
11354 hi = hash_lookup(&su->su_banned, s, hash); 13036 {
11355 if (HASHITEM_EMPTY(hi)) 13037 s = vim_strsave(word);
13038 if (s != NULL)
11356 hash_add_item(&su->su_banned, hi, s, hash); 13039 hash_add_item(&su->su_banned, hi, s, hash);
11357 else 13040 }
11358 vim_free(s);
11359 }
11360 }
11361
11362 /*
11363 * Return TRUE if a word appears in the list of banned words.
11364 */
11365 static int
11366 was_banned(su, word)
11367 suginfo_T *su;
11368 char_u *word;
11369 {
11370 hashitem_T *hi = hash_find(&su->su_banned, word);
11371
11372 return !HASHITEM_EMPTY(hi);
11373 }
11374
11375 /*
11376 * Free the banned words in "su".
11377 */
11378 static void
11379 free_banned(su)
11380 suginfo_T *su;
11381 {
11382 int todo;
11383 hashitem_T *hi;
11384
11385 todo = su->su_banned.ht_used;
11386 for (hi = su->su_banned.ht_array; todo > 0; ++hi)
11387 {
11388 if (!HASHITEM_EMPTY(hi))
11389 {
11390 vim_free(hi->hi_key);
11391 --todo;
11392 }
11393 }
11394 hash_clear(&su->su_banned);
11395 } 13041 }
11396 13042
11397 /* 13043 /*
11398 * Recompute the score for all suggestions if sound-folding is possible. This 13044 * Recompute the score for all suggestions if sound-folding is possible. This
11399 * is slow, thus only done for the final results. 13045 * is slow, thus only done for the final results.
12268 13914
12269 /* adding/inserting "*" at the start (word starts with vowel) shouldn't be 13915 /* adding/inserting "*" at the start (word starts with vowel) shouldn't be
12270 * counted so much, vowels halfway the word aren't counted at all. */ 13916 * counted so much, vowels halfway the word aren't counted at all. */
12271 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound) 13917 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound)
12272 { 13918 {
12273 score = SCORE_DEL / 2; 13919 if (badsound[1] == goodsound[1]
12274 if (*badsound == '*') 13920 || (badsound[1] != NUL
12275 ++badsound; 13921 && goodsound[1] != NUL
13922 && badsound[2] == goodsound[2]))
13923 {
13924 /* handle like a substitute */
13925 }
12276 else 13926 else
12277 ++goodsound; 13927 {
13928 score = 2 * SCORE_DEL / 3;
13929 if (*badsound == '*')
13930 ++badsound;
13931 else
13932 ++goodsound;
13933 }
12278 } 13934 }
12279 13935
12280 goodlen = STRLEN(goodsound); 13936 goodlen = STRLEN(goodsound);
12281 badlen = STRLEN(badsound); 13937 badlen = STRLEN(badsound);
12282 13938
12468 * The implementation of the algorithm comes from Aspell editdist.cpp, 14124 * The implementation of the algorithm comes from Aspell editdist.cpp,
12469 * edit_distance(). It has been converted from C++ to C and modified to 14125 * edit_distance(). It has been converted from C++ to C and modified to
12470 * support multi-byte characters. 14126 * support multi-byte characters.
12471 */ 14127 */
12472 static int 14128 static int
12473 spell_edit_score(badword, goodword) 14129 spell_edit_score(slang, badword, goodword)
14130 slang_T *slang;
12474 char_u *badword; 14131 char_u *badword;
12475 char_u *goodword; 14132 char_u *goodword;
12476 { 14133 {
12477 int *cnt; 14134 int *cnt;
12478 int badlen, goodlen; /* lenghts including NUL */ 14135 int badlen, goodlen; /* lenghts including NUL */
12510 if (cnt == NULL) 14167 if (cnt == NULL)
12511 return 0; /* out of memory */ 14168 return 0; /* out of memory */
12512 14169
12513 CNT(0, 0) = 0; 14170 CNT(0, 0) = 0;
12514 for (j = 1; j <= goodlen; ++j) 14171 for (j = 1; j <= goodlen; ++j)
12515 CNT(0, j) = CNT(0, j - 1) + SCORE_DEL; 14172 CNT(0, j) = CNT(0, j - 1) + SCORE_INS;
12516 14173
12517 for (i = 1; i <= badlen; ++i) 14174 for (i = 1; i <= badlen; ++i)
12518 { 14175 {
12519 CNT(i, 0) = CNT(i - 1, 0) + SCORE_INS; 14176 CNT(i, 0) = CNT(i - 1, 0) + SCORE_DEL;
12520 for (j = 1; j <= goodlen; ++j) 14177 for (j = 1; j <= goodlen; ++j)
12521 { 14178 {
12522 #ifdef FEAT_MBYTE 14179 #ifdef FEAT_MBYTE
12523 if (has_mbyte) 14180 if (has_mbyte)
12524 { 14181 {
12537 { 14194 {
12538 /* Use a better score when there is only a case difference. */ 14195 /* Use a better score when there is only a case difference. */
12539 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) 14196 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
12540 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1); 14197 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1);
12541 else 14198 else
12542 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1); 14199 {
14200 /* For a similar character use SCORE_SIMILAR. */
14201 if (slang != NULL
14202 && slang->sl_has_map
14203 && similar_chars(slang, gc, bc))
14204 CNT(i, j) = SCORE_SIMILAR + CNT(i - 1, j - 1);
14205 else
14206 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1);
14207 }
12543 14208
12544 if (i > 1 && j > 1) 14209 if (i > 1 && j > 1)
12545 { 14210 {
12546 #ifdef FEAT_MBYTE 14211 #ifdef FEAT_MBYTE
12547 if (has_mbyte) 14212 if (has_mbyte)
12575 i = CNT(badlen - 1, goodlen - 1); 14240 i = CNT(badlen - 1, goodlen - 1);
12576 vim_free(cnt); 14241 vim_free(cnt);
12577 return i; 14242 return i;
12578 } 14243 }
12579 14244
/* One alternative that spell_edit_score_limit() pushed on its stack to be
 * tried later. */
14245 typedef struct
14246 {
14247 int badi; /* index in the bad word where to continue */
14248 int goodi; /* index in the good word where to continue */
14249 int score; /* score of the edits done before getting here */
14250 } limitscore_T;
14251
14252 /*
14253 * Like spell_edit_score(), but with a limit on the score to make it faster.
14254 * Returns the lowest score that was found for changing "badword" into
 * "goodword". Returns SCORE_MAXMAX when no sequence of edits with a score
 * of at most "limit" was found.
14255 *
14256 * This uses a stack for the edits still to be tried.
14257 * The idea comes from Aspell leditdist.cpp. Rewritten in C and added support
14258 * for multi-byte characters.
 * When FEAT_MBYTE is defined and "has_mbyte" is set all the work is done by
 * spell_edit_score_limit_w().
14259 */
14260 static int
14261 spell_edit_score_limit(slang, badword, goodword, limit)
14262 slang_T *slang; /* when not NULL and sl_has_map is set similar_chars() is used */
14263 char_u *badword;
14264 char_u *goodword;
14265 int limit; /* highest score that is still accepted */
14266 {
14267 limitscore_T stack[10]; /* allow for over 3 * 2 edits */
 /* NOTE(review): "stackidx" is never checked against the size of
 * "stack"; this appears to rely on "limit" being low enough to keep
 * the number of pending alternatives small -- confirm. */
14268 int stackidx;
14269 int bi, gi; /* current index in badword and goodword */
14270 int bi2, gi2; /* indexes used when checking that the rest matches */
14271 int bc, gc; /* current character of badword and goodword */
14272 int score; /* score of the alternative being tried */
14273 int score_off; /* "score" plus the cost of a delete or insert */
14274 int minscore; /* lowest score found so far, starts at limit + 1 */
14275 int round;
14276
14277 #ifdef FEAT_MBYTE
14278 /* Multi-byte characters require a bit more work, use a different function
14279 * to avoid testing "has_mbyte" quite often. */
14280 if (has_mbyte)
14281 return spell_edit_score_limit_w(slang, badword, goodword, limit);
14282 #endif
14283
14284 /*
14285 * The idea is to go from start to end over the words. So long as
14286 * characters are equal just continue, this always gives the lowest score.
14287 * When there is a difference try several alternatives. Each alternative
14288 * increases "score" for the edit distance. Some of the alternatives are
14289 * pushed onto a stack and tried later, some are tried right away. At the
14290 * end of the word the score for one alternative is known. The lowest
14291 * possible score is stored in "minscore".
14292 */
14293 stackidx = 0;
14294 bi = 0;
14295 gi = 0;
14296 score = 0;
14297 minscore = limit + 1;
14298
14299 for (;;)
14300 {
14301 /* Skip over an equal part, score remains the same. */
14302 for (;;)
14303 {
14304 bc = badword[bi];
14305 gc = goodword[gi];
14306 if (bc != gc) /* stop at a char that's different */
14307 break;
14308 if (bc == NUL) /* both words end */
14309 {
14310 if (score < minscore)
14311 minscore = score;
14312 goto pop; /* do next alternative */
14313 }
14314 ++bi;
14315 ++gi;
14316 }
14317
14318 if (gc == NUL) /* goodword ends, delete badword chars */
14319 {
14320 do
14321 {
14322 if ((score += SCORE_DEL) >= minscore)
14323 goto pop; /* do next alternative */
14324 } while (badword[++bi] != NUL);
14325 minscore = score;
14326 }
14327 else if (bc == NUL) /* badword ends, insert the remaining goodword chars */
14328 {
14329 do
14330 {
14331 if ((score += SCORE_INS) >= minscore)
14332 goto pop; /* do next alternative */
14333 } while (goodword[++gi] != NUL);
14334 minscore = score;
14335 }
14336 else /* both words continue */
14337 {
14338 /* If not close to the limit, perform a change. Only try changes
14339 * that may lead to a lower score than "minscore".
14340 * round 0: try deleting a char from badword
14341 * round 1: try inserting a char in badword */
14342 for (round = 0; round <= 1; ++round)
14343 {
14344 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
14345 if (score_off < minscore)
14346 {
14347 if (score_off + SCORE_EDIT_MIN >= minscore)
14348 {
14349 /* Near the limit, rest of the words must match. We
14350 * can check that right now, no need to push an item
14351 * onto the stack. */
14352 bi2 = bi + 1 - round;
14353 gi2 = gi + round;
14354 while (goodword[gi2] == badword[bi2])
14355 {
14356 if (goodword[gi2] == NUL)
14357 {
14358 minscore = score_off;
14359 break;
14360 }
14361 ++bi2;
14362 ++gi2;
14363 }
14364 }
14365 else
14366 {
14367 /* try deleting/inserting a character later */
14368 stack[stackidx].badi = bi + 1 - round;
14369 stack[stackidx].goodi = gi + round;
14370 stack[stackidx].score = score_off;
14371 ++stackidx;
14372 }
14373 }
14374 }
14375
14376 if (score + SCORE_SWAP < minscore)
14377 {
14378 /* If swapping two characters makes a match then the
14379 * substitution is more expensive, thus there is no need to
14380 * try both. */
14381 if (gc == badword[bi + 1] && bc == goodword[gi + 1])
14382 {
14383 /* Swap two characters, that is: skip them. */
14384 gi += 2;
14385 bi += 2;
14386 score += SCORE_SWAP;
14387 continue;
14388 }
14389 }
14390
14391 /* Substitute one character for another which is the same
14392 * thing as deleting a character from both goodword and badword.
14393 * Use a better score when there is only a case difference. */
14394 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
14395 score += SCORE_ICASE;
14396 else
14397 {
14398 /* For a similar character use SCORE_SIMILAR. */
14399 if (slang != NULL
14400 && slang->sl_has_map
14401 && similar_chars(slang, gc, bc))
14402 score += SCORE_SIMILAR;
14403 else
14404 score += SCORE_SUBST;
14405 }
14406
14407 if (score < minscore)
14408 {
14409 /* Do the substitution. */
14410 ++gi;
14411 ++bi;
14412 continue;
14413 }
 /* Otherwise the substitution can't lead to a lower score than
 * "minscore": give up on this alternative and try the next one. */
14414 }
14415 pop:
14416 /*
14417 * Get here to try the next alternative, pop it from the stack.
14418 */
14419 if (stackidx == 0) /* stack is empty, finished */
14420 break;
14421
14422 /* pop an item from the stack */
14423 --stackidx;
14424 gi = stack[stackidx].goodi;
14425 bi = stack[stackidx].badi;
14426 score = stack[stackidx].score;
14427 }
14428
14429 /* When the score goes over "limit" it may actually be much higher.
14430 * Return a very large number to avoid going below the limit when giving a
14431 * bonus. */
14432 if (minscore > limit)
14433 return SCORE_MAXMAX;
14434 return minscore;
14435 }
14436
14437 #ifdef FEAT_MBYTE
14438 /*
14439 * Multi-byte version of spell_edit_score_limit().
14440 * Keep it in sync with the above!
 *
 * Both words are first converted to arrays with one int per character, after
 * that the same steps are done as in the single-byte version.
 * Returns the lowest score that was found, or SCORE_MAXMAX when no sequence
 * of edits with a score of at most "limit" was found.
14441 */
14442 static int
14443 spell_edit_score_limit_w(slang, badword, goodword, limit)
14444 slang_T *slang; /* when not NULL and sl_has_map is set similar_chars() is used */
14445 char_u *badword;
14446 char_u *goodword;
14447 int limit; /* highest score that is still accepted */
14448 {
14449 limitscore_T stack[10]; /* allow for over 3 * 2 edits */
 /* NOTE(review): "stackidx" is never checked against the size of
 * "stack"; this appears to rely on "limit" being low enough to keep
 * the number of pending alternatives small -- confirm. */
14450 int stackidx;
14451 int bi, gi; /* current index in wbadword and wgoodword */
14452 int bi2, gi2; /* indexes used when checking that the rest matches */
14453 int bc, gc; /* current character of badword and goodword */
14454 int score; /* score of the alternative being tried */
14455 int score_off; /* "score" plus the cost of a delete or insert */
14456 int minscore; /* lowest score found so far, starts at limit + 1 */
14457 int round;
14458 char_u *p;
14459 int wbadword[MAXWLEN];
14460 int wgoodword[MAXWLEN];
14461
14462 /* Get the characters from the multi-byte strings and put them in an
14463 * int array for easy access.
 * NOTE(review): there is no check here that a word has fewer than
 * MAXWLEN characters, presumably the callers guarantee that -- confirm. */
14464 bi = 0;
14465 for (p = badword; *p != NUL; )
14466 wbadword[bi++] = mb_cptr2char_adv(&p);
14467 wbadword[bi++] = 0;
14468 gi = 0;
14469 for (p = goodword; *p != NUL; )
14470 wgoodword[gi++] = mb_cptr2char_adv(&p);
14471 wgoodword[gi++] = 0;
14472
14473 /*
14474 * The idea is to go from start to end over the words. So long as
14475 * characters are equal just continue, this always gives the lowest score.
14476 * When there is a difference try several alternatives. Each alternative
14477 * increases "score" for the edit distance. Some of the alternatives are
14478 * pushed onto a stack and tried later, some are tried right away. At the
14479 * end of the word the score for one alternative is known. The lowest
14480 * possible score is stored in "minscore".
14481 */
14482 stackidx = 0;
14483 bi = 0;
14484 gi = 0;
14485 score = 0;
14486 minscore = limit + 1;
14487
14488 for (;;)
14489 {
14490 /* Skip over an equal part, score remains the same. */
14491 for (;;)
14492 {
14493 bc = wbadword[bi];
14494 gc = wgoodword[gi];
14495
14496 if (bc != gc) /* stop at a char that's different */
14497 break;
14498 if (bc == NUL) /* both words end */
14499 {
14500 if (score < minscore)
14501 minscore = score;
14502 goto pop; /* do next alternative */
14503 }
14504 ++bi;
14505 ++gi;
14506 }
14507
14508 if (gc == NUL) /* goodword ends, delete badword chars */
14509 {
14510 do
14511 {
14512 if ((score += SCORE_DEL) >= minscore)
14513 goto pop; /* do next alternative */
14514 } while (wbadword[++bi] != NUL);
14515 minscore = score;
14516 }
14517 else if (bc == NUL) /* badword ends, insert the remaining goodword chars */
14518 {
14519 do
14520 {
14521 if ((score += SCORE_INS) >= minscore)
14522 goto pop; /* do next alternative */
14523 } while (wgoodword[++gi] != NUL);
14524 minscore = score;
14525 }
14526 else /* both words continue */
14527 {
14528 /* If not close to the limit, perform a change. Only try changes
14529 * that may lead to a lower score than "minscore".
14530 * round 0: try deleting a char from badword
14531 * round 1: try inserting a char in badword */
14532 for (round = 0; round <= 1; ++round)
14533 {
14534 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
14535 if (score_off < minscore)
14536 {
14537 if (score_off + SCORE_EDIT_MIN >= minscore)
14538 {
14539 /* Near the limit, rest of the words must match. We
14540 * can check that right now, no need to push an item
14541 * onto the stack. */
14542 bi2 = bi + 1 - round;
14543 gi2 = gi + round;
14544 while (wgoodword[gi2] == wbadword[bi2])
14545 {
14546 if (wgoodword[gi2] == NUL)
14547 {
14548 minscore = score_off;
14549 break;
14550 }
14551 ++bi2;
14552 ++gi2;
14553 }
14554 }
14555 else
14556 {
14557 /* try deleting/inserting a character later */
14558 stack[stackidx].badi = bi + 1 - round;
14559 stack[stackidx].goodi = gi + round;
14560 stack[stackidx].score = score_off;
14561 ++stackidx;
14562 }
14563 }
14564 }
14565
14566 if (score + SCORE_SWAP < minscore)
14567 {
14568 /* If swapping two characters makes a match then the
14569 * substitution is more expensive, thus there is no need to
14570 * try both. */
14571 if (gc == wbadword[bi + 1] && bc == wgoodword[gi + 1])
14572 {
14573 /* Swap two characters, that is: skip them. */
14574 gi += 2;
14575 bi += 2;
14576 score += SCORE_SWAP;
14577 continue;
14578 }
14579 }
14580
14581 /* Substitute one character for another which is the same
14582 * thing as deleting a character from both goodword and badword.
14583 * Use a better score when there is only a case difference. */
14584 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
14585 score += SCORE_ICASE;
14586 else
14587 {
14588 /* For a similar character use SCORE_SIMILAR. */
14589 if (slang != NULL
14590 && slang->sl_has_map
14591 && similar_chars(slang, gc, bc))
14592 score += SCORE_SIMILAR;
14593 else
14594 score += SCORE_SUBST;
14595 }
14596
14597 if (score < minscore)
14598 {
14599 /* Do the substitution. */
14600 ++gi;
14601 ++bi;
14602 continue;
14603 }
 /* Otherwise the substitution can't lead to a lower score than
 * "minscore": give up on this alternative and try the next one. */
14604 }
14605 pop:
14606 /*
14607 * Get here to try the next alternative, pop it from the stack.
14608 */
14609 if (stackidx == 0) /* stack is empty, finished */
14610 break;
14611
14612 /* pop an item from the stack */
14613 --stackidx;
14614 gi = stack[stackidx].goodi;
14615 bi = stack[stackidx].badi;
14616 score = stack[stackidx].score;
14617 }
14618
14619 /* When the score goes over "limit" it may actually be much higher.
14620 * Return a very large number to avoid going below the limit when giving a
14621 * bonus. */
14622 if (minscore > limit)
14623 return SCORE_MAXMAX;
14624 return minscore;
14625 }
14626 #endif
14627
14628 #define DUMPFLAG_KEEPCASE 1 /* round 2: word comes from the keep-case tree, don't apply case flags */
14629 #define DUMPFLAG_COUNT 2 /* ":spelldump!": append the word count when one is known */
14630
12580 /* 14631 /*
12581 * ":spelldump" 14632 * ":spelldump"
12582 */ 14633 */
12583 /*ARGSUSED*/ 14634 /*ARGSUSED*/
12584 void 14635 void
12601 int flags; 14652 int flags;
12602 char_u *region_names = NULL; /* region names being used */ 14653 char_u *region_names = NULL; /* region names being used */
12603 int do_region = TRUE; /* dump region names and numbers */ 14654 int do_region = TRUE; /* dump region names and numbers */
12604 char_u *p; 14655 char_u *p;
12605 int lpi; 14656 int lpi;
14657 int dumpflags;
12606 14658
12607 if (no_spell_checking(curwin)) 14659 if (no_spell_checking(curwin))
12608 return; 14660 return;
12609 14661
12610 /* Create a new empty buffer by splitting the window. */ 14662 /* Create a new empty buffer by splitting the window. */
12655 * round 2: keep-case tree */ 14707 * round 2: keep-case tree */
12656 for (round = 1; round <= 2; ++round) 14708 for (round = 1; round <= 2; ++round)
12657 { 14709 {
12658 if (round == 1) 14710 if (round == 1)
12659 { 14711 {
14712 dumpflags = 0;
12660 byts = slang->sl_fbyts; 14713 byts = slang->sl_fbyts;
12661 idxs = slang->sl_fidxs; 14714 idxs = slang->sl_fidxs;
12662 } 14715 }
12663 else 14716 else
12664 { 14717 {
14718 dumpflags = DUMPFLAG_KEEPCASE;
12665 byts = slang->sl_kbyts; 14719 byts = slang->sl_kbyts;
12666 idxs = slang->sl_kidxs; 14720 idxs = slang->sl_kidxs;
12667 } 14721 }
12668 if (byts == NULL) 14722 if (byts == NULL)
12669 continue; /* array is empty */ 14723 continue; /* array is empty */
14724
14725 if (eap->forceit)
14726 dumpflags |= DUMPFLAG_COUNT;
12670 14727
12671 depth = 0; 14728 depth = 0;
12672 arridx[0] = 0; 14729 arridx[0] = 0;
12673 curi[0] = 1; 14730 curi[0] = 1;
12674 while (depth >= 0 && !got_int) 14731 while (depth >= 0 && !got_int)
12705 14762
12706 /* Dump the basic word if there is no prefix or 14763 /* Dump the basic word if there is no prefix or
12707 * when it's the first one. */ 14764 * when it's the first one. */
12708 c = (unsigned)flags >> 24; 14765 c = (unsigned)flags >> 24;
12709 if (c == 0 || curi[depth] == 2) 14766 if (c == 0 || curi[depth] == 2)
12710 dump_word(word, round, flags, lnum++); 14767 dump_word(slang, word, dumpflags,
14768 flags, lnum++);
12711 14769
12712 /* Apply the prefix, if there is one. */ 14770 /* Apply the prefix, if there is one. */
12713 if (c != 0) 14771 if (c != 0)
12714 lnum = dump_prefixes(slang, word, round, 14772 lnum = dump_prefixes(slang, word, dumpflags,
12715 flags, lnum); 14773 flags, lnum);
12716 } 14774 }
12717 } 14775 }
12718 else 14776 else
12719 { 14777 {
12736 14794
12737 /* 14795 /*
12738 * Dump one word: apply case modifications and append a line to the buffer. 14796 * Dump one word: apply case modifications and append a line to the buffer.
12739 */ 14797 */
12740 static void 14798 static void
12741 dump_word(word, round, flags, lnum) 14799 dump_word(slang, word, dumpflags, flags, lnum)
14800 slang_T *slang;
12742 char_u *word; 14801 char_u *word;
12743 int round; 14802 int dumpflags;
12744 int flags; 14803 int flags;
12745 linenr_T lnum; 14804 linenr_T lnum;
12746 { 14805 {
12747 int keepcap = FALSE; 14806 int keepcap = FALSE;
12748 char_u *p; 14807 char_u *p;
14808 char_u *tw;
12749 char_u cword[MAXWLEN]; 14809 char_u cword[MAXWLEN];
12750 char_u badword[MAXWLEN + 10]; 14810 char_u badword[MAXWLEN + 10];
12751 int i; 14811 int i;
12752 14812
12753 if (round == 1 && (flags & WF_CAPMASK) != 0) 14813 if ((dumpflags & DUMPFLAG_KEEPCASE) == 0 && (flags & WF_CAPMASK) != 0)
12754 { 14814 {
12755 /* Need to fix case according to "flags". */ 14815 /* Need to fix case according to "flags". */
12756 make_case_word(word, cword, flags); 14816 make_case_word(word, cword, flags);
12757 p = cword; 14817 p = cword;
12758 } 14818 }
12759 else 14819 else
12760 { 14820 {
12761 p = word; 14821 p = word;
12762 if (round == 2 && ((captype(word, NULL) & WF_KEEPCAP) == 0 14822 if ((dumpflags & DUMPFLAG_KEEPCASE)
14823 && ((captype(word, NULL) & WF_KEEPCAP) == 0
12763 || (flags & WF_FIXCAP) != 0)) 14824 || (flags & WF_FIXCAP) != 0))
12764 keepcap = TRUE; 14825 keepcap = TRUE;
12765 } 14826 }
14827 tw = p;
12766 14828
12767 /* Add flags and regions after a slash. */ 14829 /* Add flags and regions after a slash. */
12768 if ((flags & (WF_BANNED | WF_RARE | WF_REGION)) || keepcap) 14830 if ((flags & (WF_BANNED | WF_RARE | WF_REGION)) || keepcap)
12769 { 14831 {
12770 STRCPY(badword, p); 14832 STRCPY(badword, p);
12780 if (flags & (0x10000 << i)) 14842 if (flags & (0x10000 << i))
12781 sprintf((char *)badword + STRLEN(badword), "%d", i + 1); 14843 sprintf((char *)badword + STRLEN(badword), "%d", i + 1);
12782 p = badword; 14844 p = badword;
12783 } 14845 }
12784 14846
14847 if (dumpflags & DUMPFLAG_COUNT)
14848 {
14849 hashitem_T *hi;
14850
14851 /* Include the word count for ":spelldump!". */
14852 hi = hash_find(&slang->sl_wordcount, tw);
14853 if (!HASHITEM_EMPTY(hi))
14854 {
14855 vim_snprintf((char *)IObuff, IOSIZE, "%s\t%d",
14856 tw, HI2WC(hi)->wc_count);
14857 p = IObuff;
14858 }
14859 }
14860
12785 ml_append(lnum, p, (colnr_T)0, FALSE); 14861 ml_append(lnum, p, (colnr_T)0, FALSE);
12786 } 14862 }
12787 14863
12788 /* 14864 /*
12789 * For ":spelldump": Find matching prefixes for "word". Prepend each to 14865 * For ":spelldump": Find matching prefixes for "word". Prepend each to
12790 * "word" and append a line to the buffer. 14866 * "word" and append a line to the buffer.
12791 * Return the updated line number. 14867 * Return the updated line number.
12792 */ 14868 */
12793 static linenr_T 14869 static linenr_T
12794 dump_prefixes(slang, word, round, flags, startlnum) 14870 dump_prefixes(slang, word, dumpflags, flags, startlnum)
12795 slang_T *slang; 14871 slang_T *slang;
12796 char_u *word; /* case-folded word */ 14872 char_u *word; /* case-folded word */
12797 int round; 14873 int dumpflags;
12798 int flags; /* flags with prefix ID */ 14874 int flags; /* flags with prefix ID */
12799 linenr_T startlnum; 14875 linenr_T startlnum;
12800 { 14876 {
12801 idx_T arridx[MAXWLEN]; 14877 idx_T arridx[MAXWLEN];
12802 int curi[MAXWLEN]; 14878 int curi[MAXWLEN];
12858 14934
12859 c = valid_word_prefix(i, n, flags, word, slang, FALSE); 14935 c = valid_word_prefix(i, n, flags, word, slang, FALSE);
12860 if (c != 0) 14936 if (c != 0)
12861 { 14937 {
12862 vim_strncpy(prefix + depth, word, MAXWLEN - depth - 1); 14938 vim_strncpy(prefix + depth, word, MAXWLEN - depth - 1);
12863 dump_word(prefix, round, 14939 dump_word(slang, prefix, dumpflags,
12864 (c & WF_RAREPFX) ? (flags | WF_RARE) 14940 (c & WF_RAREPFX) ? (flags | WF_RARE)
12865 : flags, lnum++); 14941 : flags, lnum++);
12866 } 14942 }
12867 14943
12868 /* Check for prefix that matches the word when the 14944 /* Check for prefix that matches the word when the
12874 TRUE); 14950 TRUE);
12875 if (c != 0) 14951 if (c != 0)
12876 { 14952 {
12877 vim_strncpy(prefix + depth, word_up, 14953 vim_strncpy(prefix + depth, word_up,
12878 MAXWLEN - depth - 1); 14954 MAXWLEN - depth - 1);
12879 dump_word(prefix, round, 14955 dump_word(slang, prefix, dumpflags,
12880 (c & WF_RAREPFX) ? (flags | WF_RARE) 14956 (c & WF_RAREPFX) ? (flags | WF_RARE)
12881 : flags, lnum++); 14957 : flags, lnum++);
12882 } 14958 }
12883 } 14959 }
12884 } 14960 }
12979 char_u *pat; 15055 char_u *pat;
12980 char_u ***matchp; 15056 char_u ***matchp;
12981 { 15057 {
12982 garray_T ga; 15058 garray_T ga;
12983 15059
12984 spell_suggest_list(&ga, pat, 100, spell_expand_need_cap); 15060 spell_suggest_list(&ga, pat, 100, spell_expand_need_cap, TRUE);
12985 *matchp = ga.ga_data; 15061 *matchp = ga.ga_data;
12986 return ga.ga_len; 15062 return ga.ga_len;
12987 } 15063 }
12988 #endif 15064 #endif
12989 15065