Mercurial > vim
comparison src/spell.c @ 625:81fe2ccc1207 v7.0179
updated for version 7.0179
author | vimboss |
---|---|
date | Thu, 12 Jan 2006 23:22:24 +0000 |
parents | c5688885c414 |
children | 732c7ae5743e |
comparison
equal
deleted
inserted
replaced
624:91e7d4a7b3b0 | 625:81fe2ccc1207 |
---|---|
41 * following word must support this prefix nr. And the condition nr is | 41 * following word must support this prefix nr. And the condition nr is |
42 * stored, used to lookup the condition that the word must match with. | 42 * stored, used to lookup the condition that the word must match with. |
43 * | 43 * |
44 * Thanks to Olaf Seibert for providing an example implementation of this tree | 44 * Thanks to Olaf Seibert for providing an example implementation of this tree |
45 * and the compression mechanism. | 45 * and the compression mechanism. |
46 * LZ trie ideas: | |
47 * http://www.irb.hr/hr/home/ristov/papers/RistovLZtrieRevision1.pdf | |
48 * More papers: http://www-igm.univ-mlv.fr/~laporte/publi_en.html | |
46 * | 49 * |
47 * Matching involves checking the caps type: Onecap ALLCAP KeepCap. | 50 * Matching involves checking the caps type: Onecap ALLCAP KeepCap. |
48 * | 51 * |
49 * Why doesn't Vim use aspell/ispell/myspell/etc.? | 52 * Why doesn't Vim use aspell/ispell/myspell/etc.? |
50 * See ":help develop-spell". | 53 * See ":help develop-spell". |
54 * Only use it for small word lists! */ | 57 * Only use it for small word lists! */ |
55 #if 0 | 58 #if 0 |
56 # define SPELL_PRINTTREE | 59 # define SPELL_PRINTTREE |
57 #endif | 60 #endif |
58 | 61 |
62 /* Use DEBUG_TRIEWALK to print the changes made in suggest_trie_walk(). */ | |
63 #if 0 | |
64 # define DEBUG_TRIEWALK | |
65 #endif | |
66 | |
59 /* | 67 /* |
60 * Use this to adjust the score after finding suggestions, based on the | 68 * Use this to adjust the score after finding suggestions, based on the |
61 * suggested word sounding like the bad word. This is much faster than doing | 69 * suggested word sounding like the bad word. This is much faster than doing |
62 * it for every possible suggestion. | 70 * it for every possible suggestion. |
63 * Disadvantage: When "the" is typed as "hte" it sounds different and goes | 71 * Disadvantage: When "the" is typed as "hte" it sounds quite different ("@" |
64 * down in the list. | 72 * vs "ht") and goes down in the list. |
65 * Used when 'spellsuggest' is set to "best". | 73 * Used when 'spellsuggest' is set to "best". |
66 */ | 74 */ |
67 #define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4) | 75 #define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4) |
76 | |
77 /* | |
78 * Do the opposite: based on a maximum end score and a known sound score, | |
79 * compute the maximum word score that can be used. | |
80 */ | |
81 #define MAXSCORE(word_score, sound_score) ((4 * word_score - sound_score) / 3) | |
68 | 82 |
69 /* | 83 /* |
70 * Vim spell file format: <HEADER> | 84 * Vim spell file format: <HEADER> |
71 * <SECTIONS> | 85 * <SECTIONS> |
72 * <LWORDTREE> | 86 * <LWORDTREE> |
131 * <repfromlen> 1 byte length of <repfrom> | 145 * <repfromlen> 1 byte length of <repfrom> |
132 * <repfrom> N bytes "from" part of replacement | 146 * <repfrom> N bytes "from" part of replacement |
133 * <reptolen> 1 byte length of <repto> | 147 * <reptolen> 1 byte length of <repto> |
134 * <repto> N bytes "to" part of replacement | 148 * <repto> N bytes "to" part of replacement |
135 * | 149 * |
150 * sectionID == SN_REPSAL: <repcount> <rep> ... | |
151 * just like SN_REP but for soundfolded words | |
152 * | |
136 * sectionID == SN_SAL: <salflags> <salcount> <sal> ... | 153 * sectionID == SN_SAL: <salflags> <salcount> <sal> ... |
137 * <salflags> 1 byte flags for soundsalike conversion: | 154 * <salflags> 1 byte flags for soundsalike conversion: |
138 * SAL_F0LLOWUP | 155 * SAL_F0LLOWUP |
139 * SAL_COLLAPSE | 156 * SAL_COLLAPSE |
140 * SAL_REM_ACCENTS | 157 * SAL_REM_ACCENTS |
148 * sectionID == SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto> | 165 * sectionID == SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto> |
149 * <sofofromlen> 2 bytes length of <sofofrom> | 166 * <sofofromlen> 2 bytes length of <sofofrom> |
150 * <sofofrom> N bytes "from" part of soundfold | 167 * <sofofrom> N bytes "from" part of soundfold |
151 * <sofotolen> 2 bytes length of <sofoto> | 168 * <sofotolen> 2 bytes length of <sofoto> |
152 * <sofoto> N bytes "to" part of soundfold | 169 * <sofoto> N bytes "to" part of soundfold |
170 * | |
171 * sectionID == SN_SUGFILE: <timestamp> | |
172 * <timestamp> 8 bytes time in seconds that must match with .sug file | |
173 * | |
174 * sectionID == SN_WORDS: <word> ... | |
175 * <word> N bytes NUL terminated common word | |
153 * | 176 * |
154 * sectionID == SN_MAP: <mapstr> | 177 * sectionID == SN_MAP: <mapstr> |
155 * <mapstr> N bytes String with sequences of similar characters, | 178 * <mapstr> N bytes String with sequences of similar characters, |
156 * separated by slashes. | 179 * separated by slashes. |
157 * | 180 * |
234 * from HEADER. | 257 * from HEADER. |
235 * | 258 * |
236 * All text characters are in 'encoding', but stored as single bytes. | 259 * All text characters are in 'encoding', but stored as single bytes. |
237 */ | 260 */ |
238 | 261 |
262 /* | |
263 * Vim .sug file format: <SUGHEADER> | |
264 * <SUGWORDTREE> | |
265 * <SUGTABLE> | |
266 * | |
267 * <SUGHEADER>: <fileID> <versionnr> <timestamp> | |
268 * | |
269 * <fileID> 6 bytes "VIMsug" | |
270 * <versionnr> 1 byte VIMSUGVERSION | |
271 * <timestamp> 8 bytes timestamp that must match with .spl file | |
272 * | |
273 * | |
274 * <SUGWORDTREE>: <wordtree> (see above, no flags or region used) | |
275 * | |
276 * | |
277 * <SUGTABLE>: <sugwcount> <sugline> ... | |
278 * | |
279 * <sugwcount> 4 bytes number of <sugline> following | |
280 * | |
281 * <sugline>: <sugnr> ... NUL | |
282 * | |
283 * <sugnr>: X bytes word number that results in this soundfolded word, | |
284 * stored as an offset to the previous number in as | |
285 * few bytes as possible, see offset2bytes() | |
286 */ | |
287 | |
239 #if defined(MSDOS) || defined(WIN16) || defined(WIN32) || defined(_WIN64) | 288 #if defined(MSDOS) || defined(WIN16) || defined(WIN32) || defined(_WIN64) |
240 # include <io.h> /* for lseek(), must be before vim.h */ | 289 # include <io.h> /* for lseek(), must be before vim.h */ |
241 #endif | 290 #endif |
242 | 291 |
243 #include "vim.h" | 292 #include "vim.h" |
244 | 293 |
245 #if defined(FEAT_SYN_HL) || defined(PROTO) | 294 #if defined(FEAT_SYN_HL) || defined(PROTO) |
246 | 295 |
247 #ifdef HAVE_FCNTL_H | 296 #ifdef HAVE_FCNTL_H |
248 # include <fcntl.h> | 297 # include <fcntl.h> |
298 #endif | |
299 | |
300 #ifndef UNIX /* it's in os_unix.h for Unix */ | |
301 # include <time.h> /* for time_t */ | |
249 #endif | 302 #endif |
250 | 303 |
251 #define MAXWLEN 250 /* Assume max. word len is this many bytes. | 304 #define MAXWLEN 250 /* Assume max. word len is this many bytes. |
252 Some places assume a word length fits in a | 305 Some places assume a word length fits in a |
253 byte, thus it can't be above 255. */ | 306 byte, thus it can't be above 255. */ |
300 * postponed prefix: <pflags> follows */ | 353 * postponed prefix: <pflags> follows */ |
301 #define BY_FLAGS2 3 /* end of word, <flags> and <flags2> bytes | 354 #define BY_FLAGS2 3 /* end of word, <flags> and <flags2> bytes |
302 * follow; never used in prefix tree */ | 355 * follow; never used in prefix tree */ |
303 #define BY_SPECIAL BY_FLAGS2 /* highest special byte value */ | 356 #define BY_SPECIAL BY_FLAGS2 /* highest special byte value */ |
304 | 357 |
305 /* Info from "REP" and "SAL" entries in ".aff" file used in si_rep, sl_rep, | 358 /* Info from "REP", "REPSAL" and "SAL" entries in ".aff" file used in si_rep, |
306 * and si_sal. Not for sl_sal! | 359 * si_repsal, sl_rep, and si_sal. Not for sl_sal! |
307 * One replacement: from "ft_from" to "ft_to". */ | 360 * One replacement: from "ft_from" to "ft_to". */ |
308 typedef struct fromto_S | 361 typedef struct fromto_S |
309 { | 362 { |
310 char_u *ft_from; | 363 char_u *ft_from; |
311 char_u *ft_to; | 364 char_u *ft_to; |
372 | 425 |
373 char_u sl_regions[17]; /* table with up to 8 region names plus NUL */ | 426 char_u sl_regions[17]; /* table with up to 8 region names plus NUL */ |
374 | 427 |
375 char_u *sl_midword; /* MIDWORD string or NULL */ | 428 char_u *sl_midword; /* MIDWORD string or NULL */ |
376 | 429 |
430 hashtab_T sl_wordcount; /* hashtable with word count, wordcount_T */ | |
431 | |
377 int sl_compmax; /* COMPOUNDMAX (default: MAXWLEN) */ | 432 int sl_compmax; /* COMPOUNDMAX (default: MAXWLEN) */ |
378 int sl_compminlen; /* COMPOUNDMIN (default: 0) */ | 433 int sl_compminlen; /* COMPOUNDMIN (default: 0) */ |
379 int sl_compsylmax; /* COMPOUNDSYLMAX (default: MAXWLEN) */ | 434 int sl_compsylmax; /* COMPOUNDSYLMAX (default: MAXWLEN) */ |
380 regprog_T *sl_compprog; /* COMPOUNDFLAGS turned into a regexp program | 435 regprog_T *sl_compprog; /* COMPOUNDFLAGS turned into a regexp program |
381 * (NULL when no compounding) */ | 436 * (NULL when no compounding) */ |
392 short sl_rep_first[256]; /* indexes where byte first appears, -1 if | 447 short sl_rep_first[256]; /* indexes where byte first appears, -1 if |
393 there is none */ | 448 there is none */ |
394 garray_T sl_sal; /* list of salitem_T entries from SAL lines */ | 449 garray_T sl_sal; /* list of salitem_T entries from SAL lines */ |
395 salfirst_T sl_sal_first[256]; /* indexes where byte first appears, -1 if | 450 salfirst_T sl_sal_first[256]; /* indexes where byte first appears, -1 if |
396 there is none */ | 451 there is none */ |
452 int sl_followup; /* SAL followup */ | |
453 int sl_collapse; /* SAL collapse_result */ | |
454 int sl_rem_accents; /* SAL remove_accents */ | |
397 int sl_sofo; /* SOFOFROM and SOFOTO instead of SAL items: | 455 int sl_sofo; /* SOFOFROM and SOFOTO instead of SAL items: |
398 * "sl_sal_first" maps chars, when has_mbyte | 456 * "sl_sal_first" maps chars, when has_mbyte |
399 * "sl_sal" is a list of wide char lists. */ | 457 * "sl_sal" is a list of wide char lists. */ |
400 int sl_followup; /* SAL followup */ | 458 garray_T sl_repsal; /* list of fromto_T entries from REPSAL lines */ |
401 int sl_collapse; /* SAL collapse_result */ | 459 short sl_repsal_first[256]; /* sl_rep_first for REPSAL lines */ |
402 int sl_rem_accents; /* SAL remove_accents */ | 460 |
461 /* Info from the .sug file. Loaded on demand. */ | |
462 time_t sl_sugtime; /* timestamp for .sug file */ | |
463 char_u *sl_sbyts; /* soundfolded word bytes */ | |
464 idx_T *sl_sidxs; /* soundfolded word indexes */ | |
465 buf_T *sl_sugbuf; /* buffer with word number table */ | |
466 int sl_sugloaded; /* TRUE when .sug file was loaded or failed to | |
467 load */ | |
468 | |
403 int sl_has_map; /* TRUE if there is a MAP line */ | 469 int sl_has_map; /* TRUE if there is a MAP line */ |
404 #ifdef FEAT_MBYTE | 470 #ifdef FEAT_MBYTE |
405 hashtab_T sl_map_hash; /* MAP for multi-byte chars */ | 471 hashtab_T sl_map_hash; /* MAP for multi-byte chars */ |
406 int sl_map_array[256]; /* MAP for first 256 chars */ | 472 int sl_map_array[256]; /* MAP for first 256 chars */ |
407 #else | 473 #else |
408 char_u sl_map_array[256]; /* MAP for first 256 chars */ | 474 char_u sl_map_array[256]; /* MAP for first 256 chars */ |
409 #endif | 475 #endif |
476 hashtab_T sl_sounddone; /* table with soundfolded words that have | |
477 been handled, see add_sound_suggest() */ | |
410 }; | 478 }; |
411 | 479 |
412 /* First language that is loaded, start of the linked list of loaded | 480 /* First language that is loaded, start of the linked list of loaded |
413 * languages. */ | 481 * languages. */ |
414 static slang_T *first_lang = NULL; | 482 static slang_T *first_lang = NULL; |
434 #define REGION_ALL 0xff /* word valid in all regions */ | 502 #define REGION_ALL 0xff /* word valid in all regions */ |
435 | 503 |
436 #define VIMSPELLMAGIC "VIMspell" /* string at start of Vim spell file */ | 504 #define VIMSPELLMAGIC "VIMspell" /* string at start of Vim spell file */ |
437 #define VIMSPELLMAGICL 8 | 505 #define VIMSPELLMAGICL 8 |
438 #define VIMSPELLVERSION 50 | 506 #define VIMSPELLVERSION 50 |
507 | |
508 #define VIMSUGMAGIC "VIMsug" /* string at start of Vim .sug file */ | |
509 #define VIMSUGMAGICL 6 | |
510 #define VIMSUGVERSION 1 | |
439 | 511 |
440 /* Section IDs. Only renumber them when VIMSPELLVERSION changes! */ | 512 /* Section IDs. Only renumber them when VIMSPELLVERSION changes! */ |
441 #define SN_REGION 0 /* <regionname> section */ | 513 #define SN_REGION 0 /* <regionname> section */ |
442 #define SN_CHARFLAGS 1 /* charflags section */ | 514 #define SN_CHARFLAGS 1 /* charflags section */ |
443 #define SN_MIDWORD 2 /* <midword> section */ | 515 #define SN_MIDWORD 2 /* <midword> section */ |
447 #define SN_SOFO 6 /* soundfolding section */ | 519 #define SN_SOFO 6 /* soundfolding section */ |
448 #define SN_MAP 7 /* MAP items section */ | 520 #define SN_MAP 7 /* MAP items section */ |
449 #define SN_COMPOUND 8 /* compound words section */ | 521 #define SN_COMPOUND 8 /* compound words section */ |
450 #define SN_SYLLABLE 9 /* syllable section */ | 522 #define SN_SYLLABLE 9 /* syllable section */ |
451 #define SN_NOBREAK 10 /* NOBREAK section */ | 523 #define SN_NOBREAK 10 /* NOBREAK section */ |
524 #define SN_SUGFILE 11 /* timestamp for .sug file */ | |
525 #define SN_REPSAL 12 /* REPSAL items section */ | |
526 #define SN_WORDS 13 /* common words */ | |
452 #define SN_END 255 /* end of sections */ | 527 #define SN_END 255 /* end of sections */ |
453 | 528 |
454 #define SNF_REQUIRED 1 /* <sectionflags>: required section */ | 529 #define SNF_REQUIRED 1 /* <sectionflags>: required section */ |
455 | 530 |
456 /* Result values. Lower number is accepted over higher one. */ | 531 /* Result values. Lower number is accepted over higher one. */ |
461 #define SP_BAD 3 | 536 #define SP_BAD 3 |
462 | 537 |
463 /* file used for "zG" and "zW" */ | 538 /* file used for "zG" and "zW" */ |
464 static char_u *int_wordlist = NULL; | 539 static char_u *int_wordlist = NULL; |
465 | 540 |
541 typedef struct wordcount_S | |
542 { | |
543 short_u wc_count; /* nr of times word was seen */ | |
544 char_u wc_word[1]; /* word, actually longer */ | |
545 } wordcount_T; | |
546 | |
547 static wordcount_T dumwc; | |
548 #define WC_KEY_OFF (dumwc.wc_word - (char_u *)&dumwc) | |
549 #define HI2WC(hi) ((wordcount_T *)((hi)->hi_key - WC_KEY_OFF)) | |
550 #define MAXWORDCOUNT 0xffff | |
551 | |
466 /* | 552 /* |
467 * Information used when looking for suggestions. | 553 * Information used when looking for suggestions. |
468 */ | 554 */ |
469 typedef struct suginfo_S | 555 typedef struct suginfo_S |
470 { | 556 { |
471 garray_T su_ga; /* suggestions, contains "suggest_T" */ | 557 garray_T su_ga; /* suggestions, contains "suggest_T" */ |
472 int su_maxcount; /* max. number of suggestions displayed */ | 558 int su_maxcount; /* max. number of suggestions displayed */ |
473 int su_maxscore; /* maximum score for adding to su_ga */ | 559 int su_maxscore; /* maximum score for adding to su_ga */ |
560 int su_sfmaxscore; /* idem, for when doing soundfold words */ | |
474 garray_T su_sga; /* like su_ga, sound-folded scoring */ | 561 garray_T su_sga; /* like su_ga, sound-folded scoring */ |
475 char_u *su_badptr; /* start of bad word in line */ | 562 char_u *su_badptr; /* start of bad word in line */ |
476 int su_badlen; /* length of detected bad word in line */ | 563 int su_badlen; /* length of detected bad word in line */ |
477 int su_badflags; /* caps flags for bad word */ | 564 int su_badflags; /* caps flags for bad word */ |
478 char_u su_badword[MAXWLEN]; /* bad word truncated at su_badlen */ | 565 char_u su_badword[MAXWLEN]; /* bad word truncated at su_badlen */ |
479 char_u su_fbadword[MAXWLEN]; /* su_badword case-folded */ | 566 char_u su_fbadword[MAXWLEN]; /* su_badword case-folded */ |
480 char_u su_sal_badword[MAXWLEN]; /* su_badword soundfolded */ | 567 char_u su_sal_badword[MAXWLEN]; /* su_badword soundfolded */ |
481 slang_T *su_slang_first; /* slang_T used for su_sal_badword */ | |
482 hashtab_T su_banned; /* table with banned words */ | 568 hashtab_T su_banned; /* table with banned words */ |
483 slang_T *su_sallang; /* default language for sound folding */ | 569 slang_T *su_sallang; /* default language for sound folding */ |
484 } suginfo_T; | 570 } suginfo_T; |
485 | 571 |
486 /* One word suggestion. Used in "su_ga". */ | 572 /* One word suggestion. Used in "su_ga". */ |
487 typedef struct suggest_S | 573 typedef struct suggest_S |
488 { | 574 { |
489 char_u *st_word; /* suggested word, allocated string */ | 575 char_u *st_word; /* suggested word, allocated string */ |
576 int st_wordlen; /* STRLEN(st_word) */ | |
490 int st_orglen; /* length of replaced text */ | 577 int st_orglen; /* length of replaced text */ |
491 int st_score; /* lower is better */ | 578 int st_score; /* lower is better */ |
492 int st_altscore; /* used when st_score compares equal */ | 579 int st_altscore; /* used when st_score compares equal */ |
493 int st_salscore; /* st_score is for soundalike */ | 580 int st_salscore; /* st_score is for soundalike */ |
494 int st_had_bonus; /* bonus already included in score */ | 581 int st_had_bonus; /* bonus already included in score */ |
495 slang_T *st_slang; /* language used for sound folding */ | 582 slang_T *st_slang; /* language used for sound folding */ |
496 } suggest_T; | 583 } suggest_T; |
497 | 584 |
498 #define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i]) | 585 #define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i]) |
499 | 586 |
500 /* Number of suggestions kept when cleaning up. When rescore_suggestions() is | 587 /* TRUE if a word appears in the list of banned words. */ |
501 * called the score may change, thus we need to keep more than what is | 588 #define WAS_BANNED(su, word) (!HASHITEM_EMPTY(hash_find(&su->su_banned, word))) |
502 * displayed. */ | 589 |
503 #define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 50 ? 50 : (su)->su_maxcount) | 590 /* Number of suggestions kept when cleaning up. We need to keep more than |
591 * what is displayed, because when rescore_suggestions() is called the score | |
592 * may change and wrong suggestions may be removed later. */ | |
593 #define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < 130 ? 150 : (su)->su_maxcount + 20) | |
504 | 594 |
505 /* Threshold for sorting and cleaning up suggestions. Don't want to keep lots | 595 /* Threshold for sorting and cleaning up suggestions. Don't want to keep lots |
506 * of suggestions that are not going to be displayed. */ | 596 * of suggestions that are not going to be displayed. */ |
507 #define SUG_MAX_COUNT(su) ((su)->su_maxcount + 50) | 597 #define SUG_MAX_COUNT(su) (SUG_CLEAN_COUNT(su) + 50) |
508 | 598 |
509 /* score for various changes */ | 599 /* score for various changes */ |
510 #define SCORE_SPLIT 149 /* split bad word */ | 600 #define SCORE_SPLIT 149 /* split bad word */ |
511 #define SCORE_ICASE 52 /* slightly different case */ | 601 #define SCORE_ICASE 52 /* slightly different case */ |
512 #define SCORE_REGION 200 /* word is for different region */ | 602 #define SCORE_REGION 200 /* word is for different region */ |
513 #define SCORE_RARE 180 /* rare word */ | 603 #define SCORE_RARE 180 /* rare word */ |
514 #define SCORE_SWAP 90 /* swap two characters */ | 604 #define SCORE_SWAP 75 /* swap two characters */ |
515 #define SCORE_SWAP3 110 /* swap two characters in three */ | 605 #define SCORE_SWAP3 110 /* swap two characters in three */ |
516 #define SCORE_REP 65 /* REP replacement */ | 606 #define SCORE_REP 65 /* REP replacement */ |
517 #define SCORE_SUBST 93 /* substitute a character */ | 607 #define SCORE_SUBST 93 /* substitute a character */ |
518 #define SCORE_SIMILAR 33 /* substitute a similar character */ | 608 #define SCORE_SIMILAR 33 /* substitute a similar character */ |
519 #define SCORE_SUBCOMP 33 /* substitute a composing character */ | 609 #define SCORE_SUBCOMP 33 /* substitute a composing character */ |
527 | 617 |
528 #define SCORE_FILE 30 /* suggestion from a file */ | 618 #define SCORE_FILE 30 /* suggestion from a file */ |
529 #define SCORE_MAXINIT 350 /* Initial maximum score: higher == slower. | 619 #define SCORE_MAXINIT 350 /* Initial maximum score: higher == slower. |
530 * 350 allows for about three changes. */ | 620 * 350 allows for about three changes. */ |
531 | 621 |
622 #define SCORE_COMMON1 30 /* subtracted for words seen before */ | |
623 #define SCORE_COMMON2 40 /* subtracted for words often seen */ | |
624 #define SCORE_COMMON3 50 /* subtracted for words very often seen */ | |
625 #define SCORE_THRES2 10 /* word count threshold for COMMON2 */ | |
626 #define SCORE_THRES3 100 /* word count threshold for COMMON3 */ | |
627 | |
628 /* When trying changed soundfold words it becomes slow when trying more than | |
629 * two changes. With less than two changes it's slightly faster but we miss a | |
630 * few good suggestions. In rare cases we need to try three or four changes. | |
631 */ | |
632 #define SCORE_SFMAX1 200 /* maximum score for first try */ | |
633 #define SCORE_SFMAX2 300 /* maximum score for second try */ | |
634 #define SCORE_SFMAX3 400 /* maximum score for third try */ | |
635 | |
532 #define SCORE_BIG SCORE_INS * 3 /* big difference */ | 636 #define SCORE_BIG SCORE_INS * 3 /* big difference */ |
533 #define SCORE_MAXMAX 999999 /* accept any score */ | 637 #define SCORE_MAXMAX 999999 /* accept any score */ |
638 #define SCORE_LIMITMAX 350 /* for spell_edit_score_limit() */ | |
639 | |
640 /* for spell_edit_score_limit() we need to know the minimum value of | |
641 * SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS */ | |
642 #define SCORE_EDIT_MIN SCORE_SIMILAR | |
534 | 643 |
535 /* | 644 /* |
536 * Structure to store info for word matching. | 645 * Structure to store info for word matching. |
537 */ | 646 */ |
538 typedef struct matchinf_S | 647 typedef struct matchinf_S |
615 STATE_NOPREFIX, /* try without prefix */ | 724 STATE_NOPREFIX, /* try without prefix */ |
616 STATE_SPLITUNDO, /* Undo splitting. */ | 725 STATE_SPLITUNDO, /* Undo splitting. */ |
617 STATE_ENDNUL, /* Past NUL bytes at start of the node. */ | 726 STATE_ENDNUL, /* Past NUL bytes at start of the node. */ |
618 STATE_PLAIN, /* Use each byte of the node. */ | 727 STATE_PLAIN, /* Use each byte of the node. */ |
619 STATE_DEL, /* Delete a byte from the bad word. */ | 728 STATE_DEL, /* Delete a byte from the bad word. */ |
729 STATE_INS_PREP, /* Prepare for inserting bytes. */ | |
620 STATE_INS, /* Insert a byte in the bad word. */ | 730 STATE_INS, /* Insert a byte in the bad word. */ |
621 STATE_SWAP, /* Swap two bytes. */ | 731 STATE_SWAP, /* Swap two bytes. */ |
622 STATE_UNSWAP, /* Undo swap two characters. */ | 732 STATE_UNSWAP, /* Undo swap two characters. */ |
623 STATE_SWAP3, /* Swap two characters over three. */ | 733 STATE_SWAP3, /* Swap two characters over three. */ |
624 STATE_UNSWAP3, /* Undo Swap two characters over three. */ | 734 STATE_UNSWAP3, /* Undo Swap two characters over three. */ |
655 char_u ts_splitoff; /* index in "tword" after last split */ | 765 char_u ts_splitoff; /* index in "tword" after last split */ |
656 char_u ts_splitfidx; /* "ts_fidx" at word split */ | 766 char_u ts_splitfidx; /* "ts_fidx" at word split */ |
657 char_u ts_complen; /* nr of compound words used */ | 767 char_u ts_complen; /* nr of compound words used */ |
658 char_u ts_compsplit; /* index for "compflags" where word was split */ | 768 char_u ts_compsplit; /* index for "compflags" where word was split */ |
659 char_u ts_save_badflags; /* su_badflags saved here */ | 769 char_u ts_save_badflags; /* su_badflags saved here */ |
770 char_u ts_delidx; /* index in fword for char that was deleted, | |
771 valid when "ts_flags" has TSF_DIDDEL */ | |
660 } trystate_T; | 772 } trystate_T; |
661 | 773 |
662 /* values for ts_isdiff */ | 774 /* values for ts_isdiff */ |
663 #define DIFF_NONE 0 /* no different byte (yet) */ | 775 #define DIFF_NONE 0 /* no different byte (yet) */ |
664 #define DIFF_YES 1 /* different byte found */ | 776 #define DIFF_YES 1 /* different byte found */ |
665 #define DIFF_INSERT 2 /* inserting character */ | 777 #define DIFF_INSERT 2 /* inserting character */ |
666 | 778 |
667 /* values for ts_flags */ | 779 /* values for ts_flags */ |
668 #define TSF_PREFIXOK 1 /* already checked that prefix is OK */ | 780 #define TSF_PREFIXOK 1 /* already checked that prefix is OK */ |
669 #define TSF_DIDSPLIT 2 /* tried split at this point */ | 781 #define TSF_DIDSPLIT 2 /* tried split at this point */ |
782 #define TSF_DIDDEL 4 /* did a delete, "ts_delidx" has index */ | |
670 | 783 |
671 /* special values ts_prefixdepth */ | 784 /* special values ts_prefixdepth */ |
672 #define PFD_NOPREFIX 0xff /* not using prefixes */ | 785 #define PFD_NOPREFIX 0xff /* not using prefixes */ |
673 #define PFD_PREFIXTREE 0xfe /* walking through the prefix tree */ | 786 #define PFD_PREFIXTREE 0xfe /* walking through the prefix tree */ |
674 #define PFD_NOTSPECIAL 0xfd /* first value that's not special */ | 787 #define PFD_NOTSPECIAL 0xfd /* highest value that's not special */ |
675 | 788 |
676 /* mode values for find_word */ | 789 /* mode values for find_word */ |
677 #define FIND_FOLDWORD 0 /* find word case-folded */ | 790 #define FIND_FOLDWORD 0 /* find word case-folded */ |
678 #define FIND_KEEPWORD 1 /* find keep-case word */ | 791 #define FIND_KEEPWORD 1 /* find keep-case word */ |
679 #define FIND_PREFIX 2 /* find word after prefix */ | 792 #define FIND_PREFIX 2 /* find word after prefix */ |
681 #define FIND_KEEPCOMPOUND 4 /* find keep-case compound word */ | 794 #define FIND_KEEPCOMPOUND 4 /* find keep-case compound word */ |
682 | 795 |
683 static slang_T *slang_alloc __ARGS((char_u *lang)); | 796 static slang_T *slang_alloc __ARGS((char_u *lang)); |
684 static void slang_free __ARGS((slang_T *lp)); | 797 static void slang_free __ARGS((slang_T *lp)); |
685 static void slang_clear __ARGS((slang_T *lp)); | 798 static void slang_clear __ARGS((slang_T *lp)); |
799 static void slang_clear_sug __ARGS((slang_T *lp)); | |
686 static void find_word __ARGS((matchinf_T *mip, int mode)); | 800 static void find_word __ARGS((matchinf_T *mip, int mode)); |
687 static int can_compound __ARGS((slang_T *slang, char_u *word, char_u *flags)); | 801 static int can_compound __ARGS((slang_T *slang, char_u *word, char_u *flags)); |
688 static int valid_word_prefix __ARGS((int totprefcnt, int arridx, int flags, char_u *word, slang_T *slang, int cond_req)); | 802 static int valid_word_prefix __ARGS((int totprefcnt, int arridx, int flags, char_u *word, slang_T *slang, int cond_req)); |
689 static void find_prefix __ARGS((matchinf_T *mip, int mode)); | 803 static void find_prefix __ARGS((matchinf_T *mip, int mode)); |
690 static int fold_more __ARGS((matchinf_T *mip)); | 804 static int fold_more __ARGS((matchinf_T *mip)); |
698 static char_u *read_cnt_string __ARGS((FILE *fd, int cnt_bytes, int *lenp)); | 812 static char_u *read_cnt_string __ARGS((FILE *fd, int cnt_bytes, int *lenp)); |
699 static char_u *read_string __ARGS((FILE *fd, int cnt)); | 813 static char_u *read_string __ARGS((FILE *fd, int cnt)); |
700 static int read_region_section __ARGS((FILE *fd, slang_T *slang, int len)); | 814 static int read_region_section __ARGS((FILE *fd, slang_T *slang, int len)); |
701 static int read_charflags_section __ARGS((FILE *fd)); | 815 static int read_charflags_section __ARGS((FILE *fd)); |
702 static int read_prefcond_section __ARGS((FILE *fd, slang_T *lp)); | 816 static int read_prefcond_section __ARGS((FILE *fd, slang_T *lp)); |
703 static int read_rep_section __ARGS((FILE *fd, slang_T *slang)); | 817 static int read_rep_section __ARGS((FILE *fd, garray_T *gap, short *first)); |
704 static int read_sal_section __ARGS((FILE *fd, slang_T *slang)); | 818 static int read_sal_section __ARGS((FILE *fd, slang_T *slang)); |
819 static int read_words_section __ARGS((FILE *fd, slang_T *lp, int len)); | |
820 static void count_common_word __ARGS((slang_T *lp, char_u *word, int len, int count)); | |
821 static int score_wordcount_adj __ARGS((slang_T *slang, int score, char_u *word, int split)); | |
705 static int read_sofo_section __ARGS((FILE *fd, slang_T *slang)); | 822 static int read_sofo_section __ARGS((FILE *fd, slang_T *slang)); |
706 static int read_compound __ARGS((FILE *fd, slang_T *slang, int len)); | 823 static int read_compound __ARGS((FILE *fd, slang_T *slang, int len)); |
707 static int byte_in_str __ARGS((char_u *str, int byte)); | 824 static int byte_in_str __ARGS((char_u *str, int byte)); |
708 static int init_syl_tab __ARGS((slang_T *slang)); | 825 static int init_syl_tab __ARGS((slang_T *slang)); |
709 static int count_syllables __ARGS((slang_T *slang, char_u *word)); | 826 static int count_syllables __ARGS((slang_T *slang, char_u *word)); |
710 static int set_sofo __ARGS((slang_T *lp, char_u *from, char_u *to)); | 827 static int set_sofo __ARGS((slang_T *lp, char_u *from, char_u *to)); |
711 static void set_sal_first __ARGS((slang_T *lp)); | 828 static void set_sal_first __ARGS((slang_T *lp)); |
712 #ifdef FEAT_MBYTE | 829 #ifdef FEAT_MBYTE |
713 static int *mb_str2wide __ARGS((char_u *s)); | 830 static int *mb_str2wide __ARGS((char_u *s)); |
714 #endif | 831 #endif |
715 static idx_T read_tree __ARGS((FILE *fd, char_u *byts, idx_T *idxs, int maxidx, int startidx, int prefixtree, int maxprefcondnr)); | 832 static int spell_read_tree __ARGS((FILE *fd, char_u **bytsp, idx_T **idxsp, int prefixtree, int prefixcnt)); |
833 static idx_T read_tree_node __ARGS((FILE *fd, char_u *byts, idx_T *idxs, int maxidx, int startidx, int prefixtree, int maxprefcondnr)); | |
716 static void clear_midword __ARGS((buf_T *buf)); | 834 static void clear_midword __ARGS((buf_T *buf)); |
717 static void use_midword __ARGS((slang_T *lp, buf_T *buf)); | 835 static void use_midword __ARGS((slang_T *lp, buf_T *buf)); |
718 static int find_region __ARGS((char_u *rp, char_u *region)); | 836 static int find_region __ARGS((char_u *rp, char_u *region)); |
719 static int captype __ARGS((char_u *word, char_u *end)); | 837 static int captype __ARGS((char_u *word, char_u *end)); |
720 static int badword_captype __ARGS((char_u *word, char_u *end)); | 838 static int badword_captype __ARGS((char_u *word, char_u *end)); |
721 static void spell_reload_one __ARGS((char_u *fname, int added_word)); | 839 static void spell_reload_one __ARGS((char_u *fname, int added_word)); |
722 static void set_spell_charflags __ARGS((char_u *flags, int cnt, char_u *upp)); | 840 static void set_spell_charflags __ARGS((char_u *flags, int cnt, char_u *upp)); |
723 static int set_spell_chartab __ARGS((char_u *fol, char_u *low, char_u *upp)); | 841 static int set_spell_chartab __ARGS((char_u *fol, char_u *low, char_u *upp)); |
724 static int spell_casefold __ARGS((char_u *p, int len, char_u *buf, int buflen)); | 842 static int spell_casefold __ARGS((char_u *p, int len, char_u *buf, int buflen)); |
725 static int check_need_cap __ARGS((linenr_T lnum, colnr_T col)); | 843 static int check_need_cap __ARGS((linenr_T lnum, colnr_T col)); |
726 static void spell_find_suggest __ARGS((char_u *badptr, suginfo_T *su, int maxcount, int banbadword, int need_cap)); | 844 static void spell_find_suggest __ARGS((char_u *badptr, suginfo_T *su, int maxcount, int banbadword, int need_cap, int interactive)); |
727 #ifdef FEAT_EVAL | 845 #ifdef FEAT_EVAL |
728 static void spell_suggest_expr __ARGS((suginfo_T *su, char_u *expr)); | 846 static void spell_suggest_expr __ARGS((suginfo_T *su, char_u *expr)); |
729 #endif | 847 #endif |
730 static void spell_suggest_file __ARGS((suginfo_T *su, char_u *fname)); | 848 static void spell_suggest_file __ARGS((suginfo_T *su, char_u *fname)); |
731 static void spell_suggest_intern __ARGS((suginfo_T *su)); | 849 static void spell_suggest_intern __ARGS((suginfo_T *su, int interactive)); |
850 static void suggest_load_files __ARGS((void)); | |
851 static void tree_count_words __ARGS((char_u *byts, idx_T *idxs)); | |
732 static void spell_find_cleanup __ARGS((suginfo_T *su)); | 852 static void spell_find_cleanup __ARGS((suginfo_T *su)); |
733 static void onecap_copy __ARGS((char_u *word, char_u *wcopy, int upper)); | 853 static void onecap_copy __ARGS((char_u *word, char_u *wcopy, int upper)); |
734 static void allcap_copy __ARGS((char_u *word, char_u *wcopy)); | 854 static void allcap_copy __ARGS((char_u *word, char_u *wcopy)); |
735 static void suggest_try_special __ARGS((suginfo_T *su)); | 855 static void suggest_try_special __ARGS((suginfo_T *su)); |
736 static void suggest_try_change __ARGS((suginfo_T *su)); | 856 static void suggest_try_change __ARGS((suginfo_T *su)); |
737 static int try_deeper __ARGS((suginfo_T *su, trystate_T *stack, int depth, int score_add)); | 857 static void suggest_trie_walk __ARGS((suginfo_T *su, langp_T *lp, char_u *fword, int soundfold)); |
858 static void go_deeper __ARGS((trystate_T *stack, int depth, int score_add)); | |
738 #ifdef FEAT_MBYTE | 859 #ifdef FEAT_MBYTE |
739 static int nofold_len __ARGS((char_u *fword, int flen, char_u *word)); | 860 static int nofold_len __ARGS((char_u *fword, int flen, char_u *word)); |
740 #endif | 861 #endif |
741 static void find_keepcap_word __ARGS((slang_T *slang, char_u *fword, char_u *kword)); | 862 static void find_keepcap_word __ARGS((slang_T *slang, char_u *fword, char_u *kword)); |
742 static void score_comp_sal __ARGS((suginfo_T *su)); | 863 static void score_comp_sal __ARGS((suginfo_T *su)); |
743 static void score_combine __ARGS((suginfo_T *su)); | 864 static void score_combine __ARGS((suginfo_T *su)); |
744 static int stp_sal_score __ARGS((suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound)); | 865 static int stp_sal_score __ARGS((suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound)); |
866 static void suggest_try_soundalike_prep __ARGS((void)); | |
745 static void suggest_try_soundalike __ARGS((suginfo_T *su)); | 867 static void suggest_try_soundalike __ARGS((suginfo_T *su)); |
868 static void suggest_try_soundalike_finish __ARGS((void)); | |
869 static void add_sound_suggest __ARGS((suginfo_T *su, char_u *goodword, int score, langp_T *lp)); | |
870 static int soundfold_find __ARGS((slang_T *slang, char_u *word)); | |
746 static void make_case_word __ARGS((char_u *fword, char_u *cword, int flags)); | 871 static void make_case_word __ARGS((char_u *fword, char_u *cword, int flags)); |
747 static void set_map_str __ARGS((slang_T *lp, char_u *map)); | 872 static void set_map_str __ARGS((slang_T *lp, char_u *map)); |
748 static int similar_chars __ARGS((slang_T *slang, int c1, int c2)); | 873 static int similar_chars __ARGS((slang_T *slang, int c1, int c2)); |
749 static void add_suggestion __ARGS((suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus, slang_T *slang)); | 874 static void add_suggestion __ARGS((suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus, slang_T *slang, int maxsf)); |
875 static void check_suggestions __ARGS((suginfo_T *su, garray_T *gap)); | |
750 static void add_banned __ARGS((suginfo_T *su, char_u *word)); | 876 static void add_banned __ARGS((suginfo_T *su, char_u *word)); |
751 static int was_banned __ARGS((suginfo_T *su, char_u *word)); | |
752 static void free_banned __ARGS((suginfo_T *su)); | |
753 static void rescore_suggestions __ARGS((suginfo_T *su)); | 877 static void rescore_suggestions __ARGS((suginfo_T *su)); |
754 static void rescore_one __ARGS((suginfo_T *su, suggest_T *stp)); | 878 static void rescore_one __ARGS((suginfo_T *su, suggest_T *stp)); |
755 static int cleanup_suggestions __ARGS((garray_T *gap, int maxscore, int keep)); | 879 static int cleanup_suggestions __ARGS((garray_T *gap, int maxscore, int keep)); |
756 static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, int folded, char_u *res)); | 880 static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, int folded, char_u *res)); |
757 static void spell_soundfold_sofo __ARGS((slang_T *slang, char_u *inword, char_u *res)); | 881 static void spell_soundfold_sofo __ARGS((slang_T *slang, char_u *inword, char_u *res)); |
758 static void spell_soundfold_sal __ARGS((slang_T *slang, char_u *inword, char_u *res)); | 882 static void spell_soundfold_sal __ARGS((slang_T *slang, char_u *inword, char_u *res)); |
759 #ifdef FEAT_MBYTE | 883 #ifdef FEAT_MBYTE |
760 static void spell_soundfold_wsal __ARGS((slang_T *slang, char_u *inword, char_u *res)); | 884 static void spell_soundfold_wsal __ARGS((slang_T *slang, char_u *inword, char_u *res)); |
761 #endif | 885 #endif |
762 static int soundalike_score __ARGS((char_u *goodsound, char_u *badsound)); | 886 static int soundalike_score __ARGS((char_u *goodsound, char_u *badsound)); |
763 static int spell_edit_score __ARGS((char_u *badword, char_u *goodword)); | 887 static int spell_edit_score __ARGS((slang_T *slang, char_u *badword, char_u *goodword)); |
764 static void dump_word __ARGS((char_u *word, int round, int flags, linenr_T lnum)); | 888 static int spell_edit_score_limit __ARGS((slang_T *slang, char_u *badword, char_u *goodword, int limit)); |
889 #ifdef FEAT_MBYTE | |
890 static int spell_edit_score_limit_w __ARGS((slang_T *slang, char_u *badword, char_u *goodword, int limit)); | |
891 #endif | |
892 static void dump_word __ARGS((slang_T *slang, char_u *word, int round, int flags, linenr_T lnum)); | |
765 static linenr_T dump_prefixes __ARGS((slang_T *slang, char_u *word, int round, int flags, linenr_T startlnum)); | 893 static linenr_T dump_prefixes __ARGS((slang_T *slang, char_u *word, int round, int flags, linenr_T startlnum)); |
894 static buf_T *open_spellbuf __ARGS((void)); | |
895 static void close_spellbuf __ARGS((buf_T *buf)); | |
766 | 896 |
767 /* | 897 /* |
768 * Use our own character-case definitions, because the current locale may | 898 * Use our own character-case definitions, because the current locale may |
769 * differ from what the .spl file uses. | 899 * differ from what the .spl file uses. |
770 * These must not be called with negative number! | 900 * These must not be called with negative number! |
829 * | 959 * |
830 * Returns the length of the word in bytes, also when it's OK, so that the | 960 * Returns the length of the word in bytes, also when it's OK, so that the |
831 * caller can skip over the word. | 961 * caller can skip over the word. |
832 */ | 962 */ |
833 int | 963 int |
834 spell_check(wp, ptr, attrp, capcol) | 964 spell_check(wp, ptr, attrp, capcol, docount) |
835 win_T *wp; /* current window */ | 965 win_T *wp; /* current window */ |
836 char_u *ptr; | 966 char_u *ptr; |
837 hlf_T *attrp; | 967 hlf_T *attrp; |
838 int *capcol; /* column to check for Capital */ | 968 int *capcol; /* column to check for Capital */ |
969 int docount; /* count good words */ | |
839 { | 970 { |
840 matchinf_T mi; /* Most things are put in "mi" so that it can | 971 matchinf_T mi; /* Most things are put in "mi" so that it can |
841 be passed to functions quickly. */ | 972 be passed to functions quickly. */ |
842 int nrlen = 0; /* found a number first */ | 973 int nrlen = 0; /* found a number first */ |
843 int c; | 974 int c; |
844 int wrongcaplen = 0; | 975 int wrongcaplen = 0; |
845 int lpi; | 976 int lpi; |
977 int count_word = docount; | |
846 | 978 |
847 /* A word never starts at a space or a control character. Return quickly | 979 /* A word never starts at a space or a control character. Return quickly |
848 * then, skipping over the character. */ | 980 * then, skipping over the character. */ |
849 if (*ptr <= ' ') | 981 if (*ptr <= ' ') |
850 return 1; | 982 return 1; |
903 mi.mi_result = SP_BAD; | 1035 mi.mi_result = SP_BAD; |
904 mi.mi_result2 = SP_BAD; | 1036 mi.mi_result2 = SP_BAD; |
905 | 1037 |
906 /* | 1038 /* |
907 * Loop over the languages specified in 'spelllang'. | 1039 * Loop over the languages specified in 'spelllang'. |
908 * We check them all, because a matching word may be longer than an | 1040 * We check them all, because a word may be matched longer in another |
909 * already found matching word. | 1041 * language. |
910 */ | 1042 */ |
911 for (lpi = 0; lpi < wp->w_buffer->b_langp.ga_len; ++lpi) | 1043 for (lpi = 0; lpi < wp->w_buffer->b_langp.ga_len; ++lpi) |
912 { | 1044 { |
913 mi.mi_lp = LANGP_ENTRY(wp->w_buffer->b_langp, lpi); | 1045 mi.mi_lp = LANGP_ENTRY(wp->w_buffer->b_langp, lpi); |
914 | 1046 |
931 if (mi.mi_lp->lp_slang->sl_nobreak && mi.mi_result == SP_BAD | 1063 if (mi.mi_lp->lp_slang->sl_nobreak && mi.mi_result == SP_BAD |
932 && mi.mi_result2 != SP_BAD) | 1064 && mi.mi_result2 != SP_BAD) |
933 { | 1065 { |
934 mi.mi_result = mi.mi_result2; | 1066 mi.mi_result = mi.mi_result2; |
935 mi.mi_end = mi.mi_end2; | 1067 mi.mi_end = mi.mi_end2; |
1068 } | |
1069 | |
1070 /* Count the word in the first language where it's found to be OK. */ | |
1071 if (count_word && mi.mi_result == SP_OK) | |
1072 { | |
1073 count_common_word(mi.mi_lp->lp_slang, ptr, | |
1074 (int)(mi.mi_end - ptr), 1); | |
1075 count_word = FALSE; | |
936 } | 1076 } |
937 } | 1077 } |
938 | 1078 |
939 if (mi.mi_result != SP_OK) | 1079 if (mi.mi_result != SP_OK) |
940 { | 1080 { |
1895 && (colnr_T)(p - buf) >= wp->w_cursor.col) | 2035 && (colnr_T)(p - buf) >= wp->w_cursor.col) |
1896 break; | 2036 break; |
1897 | 2037 |
1898 /* start of word */ | 2038 /* start of word */ |
1899 attr = HLF_COUNT; | 2039 attr = HLF_COUNT; |
1900 len = spell_check(wp, p, &attr, &capcol); | 2040 len = spell_check(wp, p, &attr, &capcol, FALSE); |
1901 | 2041 |
1902 if (attr != HLF_COUNT) | 2042 if (attr != HLF_COUNT) |
1903 { | 2043 { |
1904 /* We found a bad word. Check the attribute. */ | 2044 /* We found a bad word. Check the attribute. */ |
1905 if (allwords || attr == HLF_SPB) | 2045 if (allwords || attr == HLF_SPB) |
2138 vim_snprintf((char *)fname, MAXPATHL, "%s.%s.spl", | 2278 vim_snprintf((char *)fname, MAXPATHL, "%s.%s.spl", |
2139 int_wordlist, spell_enc()); | 2279 int_wordlist, spell_enc()); |
2140 } | 2280 } |
2141 | 2281 |
2142 /* | 2282 /* |
2143 * Allocate a new slang_T. | 2283 * Allocate a new slang_T for language "lang". "lang" can be NULL. |
2144 * Caller must fill "sl_next". | 2284 * Caller must fill "sl_next". |
2145 */ | 2285 */ |
2146 static slang_T * | 2286 static slang_T * |
2147 slang_alloc(lang) | 2287 slang_alloc(lang) |
2148 char_u *lang; | 2288 char_u *lang; |
2150 slang_T *lp; | 2290 slang_T *lp; |
2151 | 2291 |
2152 lp = (slang_T *)alloc_clear(sizeof(slang_T)); | 2292 lp = (slang_T *)alloc_clear(sizeof(slang_T)); |
2153 if (lp != NULL) | 2293 if (lp != NULL) |
2154 { | 2294 { |
2155 lp->sl_name = vim_strsave(lang); | 2295 if (lang != NULL) |
2296 lp->sl_name = vim_strsave(lang); | |
2156 ga_init2(&lp->sl_rep, sizeof(fromto_T), 10); | 2297 ga_init2(&lp->sl_rep, sizeof(fromto_T), 10); |
2298 ga_init2(&lp->sl_repsal, sizeof(fromto_T), 10); | |
2157 lp->sl_compmax = MAXWLEN; | 2299 lp->sl_compmax = MAXWLEN; |
2158 lp->sl_compsylmax = MAXWLEN; | 2300 lp->sl_compsylmax = MAXWLEN; |
2159 } | 2301 hash_init(&lp->sl_wordcount); |
2302 } | |
2303 | |
2160 return lp; | 2304 return lp; |
2161 } | 2305 } |
2162 | 2306 |
2163 /* | 2307 /* |
2164 * Free the contents of an slang_T and the structure itself. | 2308 * Free the contents of an slang_T and the structure itself. |
2182 { | 2326 { |
2183 garray_T *gap; | 2327 garray_T *gap; |
2184 fromto_T *ftp; | 2328 fromto_T *ftp; |
2185 salitem_T *smp; | 2329 salitem_T *smp; |
2186 int i; | 2330 int i; |
2331 int round; | |
2187 | 2332 |
2188 vim_free(lp->sl_fbyts); | 2333 vim_free(lp->sl_fbyts); |
2189 lp->sl_fbyts = NULL; | 2334 lp->sl_fbyts = NULL; |
2190 vim_free(lp->sl_kbyts); | 2335 vim_free(lp->sl_kbyts); |
2191 lp->sl_kbyts = NULL; | 2336 lp->sl_kbyts = NULL; |
2197 vim_free(lp->sl_kidxs); | 2342 vim_free(lp->sl_kidxs); |
2198 lp->sl_kidxs = NULL; | 2343 lp->sl_kidxs = NULL; |
2199 vim_free(lp->sl_pidxs); | 2344 vim_free(lp->sl_pidxs); |
2200 lp->sl_pidxs = NULL; | 2345 lp->sl_pidxs = NULL; |
2201 | 2346 |
2202 gap = &lp->sl_rep; | 2347 for (round = 1; round <= 2; ++round) |
2203 while (gap->ga_len > 0) | 2348 { |
2204 { | 2349 gap = round == 1 ? &lp->sl_rep : &lp->sl_repsal; |
2205 ftp = &((fromto_T *)gap->ga_data)[--gap->ga_len]; | 2350 while (gap->ga_len > 0) |
2206 vim_free(ftp->ft_from); | 2351 { |
2207 vim_free(ftp->ft_to); | 2352 ftp = &((fromto_T *)gap->ga_data)[--gap->ga_len]; |
2208 } | 2353 vim_free(ftp->ft_from); |
2209 ga_clear(gap); | 2354 vim_free(ftp->ft_to); |
2355 } | |
2356 ga_clear(gap); | |
2357 } | |
2210 | 2358 |
2211 gap = &lp->sl_sal; | 2359 gap = &lp->sl_sal; |
2212 if (lp->sl_sofo) | 2360 if (lp->sl_sofo) |
2213 { | 2361 { |
2214 /* "ga_len" is set to 1 without adding an item for latin1 */ | 2362 /* "ga_len" is set to 1 without adding an item for latin1 */ |
2251 | 2399 |
2252 vim_free(lp->sl_syllable); | 2400 vim_free(lp->sl_syllable); |
2253 lp->sl_syllable = NULL; | 2401 lp->sl_syllable = NULL; |
2254 ga_clear(&lp->sl_syl_items); | 2402 ga_clear(&lp->sl_syl_items); |
2255 | 2403 |
2404 hash_clear_all(&lp->sl_wordcount, WC_KEY_OFF); | |
2405 hash_init(&lp->sl_wordcount); | |
2406 | |
2256 #ifdef FEAT_MBYTE | 2407 #ifdef FEAT_MBYTE |
2257 { | 2408 hash_clear_all(&lp->sl_map_hash, 0); |
2258 int todo = lp->sl_map_hash.ht_used; | |
2259 hashitem_T *hi; | |
2260 | |
2261 for (hi = lp->sl_map_hash.ht_array; todo > 0; ++hi) | |
2262 if (!HASHITEM_EMPTY(hi)) | |
2263 { | |
2264 --todo; | |
2265 vim_free(hi->hi_key); | |
2266 } | |
2267 } | |
2268 hash_clear(&lp->sl_map_hash); | |
2269 #endif | 2409 #endif |
2410 | |
2411 /* Clear info from .sug file. */ | |
2412 slang_clear_sug(lp); | |
2270 | 2413 |
2271 lp->sl_compmax = MAXWLEN; | 2414 lp->sl_compmax = MAXWLEN; |
2272 lp->sl_compminlen = 0; | 2415 lp->sl_compminlen = 0; |
2273 lp->sl_compsylmax = MAXWLEN; | 2416 lp->sl_compsylmax = MAXWLEN; |
2274 lp->sl_regions[0] = NUL; | 2417 lp->sl_regions[0] = NUL; |
2418 } | |
2419 | |
2420 /* | |
2421 * Clear the info from the .sug file in "lp". | |
2422 */ | |
2423 static void | |
2424 slang_clear_sug(lp) | |
2425 slang_T *lp; | |
2426 { | |
2427 vim_free(lp->sl_sbyts); | |
2428 lp->sl_sbyts = NULL; | |
2429 vim_free(lp->sl_sidxs); | |
2430 lp->sl_sidxs = NULL; | |
2431 close_spellbuf(lp->sl_sugbuf); | |
2432 lp->sl_sugbuf = NULL; | |
2433 lp->sl_sugloaded = FALSE; | |
2434 lp->sl_sugtime = 0; | |
2275 } | 2435 } |
2276 | 2436 |
2277 /* | 2437 /* |
2278 * Load one spell file and store the info into a slang_T. | 2438 * Load one spell file and store the info into a slang_T. |
2279 * Invoked through do_in_runtimepath(). | 2439 * Invoked through do_in_runtimepath(). |
2301 } | 2461 } |
2302 | 2462 |
2303 /* | 2463 /* |
2304 * Load one spell file and store the info into a slang_T. | 2464 * Load one spell file and store the info into a slang_T. |
2305 * | 2465 * |
2306 * This is invoked in two ways: | 2466 * This is invoked in three ways: |
2307 * - From spell_load_cb() to load a spell file for the first time. "lang" is | 2467 * - From spell_load_cb() to load a spell file for the first time. "lang" is |
2308 * the language name, "old_lp" is NULL. Will allocate an slang_T. | 2468 * the language name, "old_lp" is NULL. Will allocate an slang_T. |
2309 * - To reload a spell file that was changed. "lang" is NULL and "old_lp" | 2469 * - To reload a spell file that was changed. "lang" is NULL and "old_lp" |
2310 * points to the existing slang_T. | 2470 * points to the existing slang_T. |
2471 * - Just after writing a .spl file; it's read back to produce the .sug file. | |
2472 * "old_lp" is NULL and "lang" is a dummy name. Will allocate an slang_T. | |
2311 * Returns the slang_T the spell file was loaded into. NULL for error. | 2473 * Returns the slang_T the spell file was loaded into. NULL for error. |
2312 */ | 2474 */ |
2313 static slang_T * | 2475 static slang_T * |
2314 spell_load_file(fname, lang, old_lp, silent) | 2476 spell_load_file(fname, lang, old_lp, silent) |
2315 char_u *fname; | 2477 char_u *fname; |
2318 int silent; /* no error if file doesn't exist */ | 2480 int silent; /* no error if file doesn't exist */ |
2319 { | 2481 { |
2320 FILE *fd; | 2482 FILE *fd; |
2321 char_u buf[VIMSPELLMAGICL]; | 2483 char_u buf[VIMSPELLMAGICL]; |
2322 char_u *p; | 2484 char_u *p; |
2323 char_u *bp; | |
2324 idx_T *ip; | |
2325 int i; | 2485 int i; |
2326 int n; | 2486 int n; |
2327 int len; | 2487 int len; |
2328 int round; | |
2329 char_u *save_sourcing_name = sourcing_name; | 2488 char_u *save_sourcing_name = sourcing_name; |
2330 linenr_T save_sourcing_lnum = sourcing_lnum; | 2489 linenr_T save_sourcing_lnum = sourcing_lnum; |
2331 slang_T *lp = NULL; | 2490 slang_T *lp = NULL; |
2332 idx_T idx; | |
2333 int c = 0; | 2491 int c = 0; |
2334 int res; | 2492 int res; |
2335 | 2493 |
2336 fd = mch_fopen((char *)fname, "r"); | 2494 fd = mch_fopen((char *)fname, "r"); |
2337 if (fd == NULL) | 2495 if (fd == NULL) |
2372 | 2530 |
2373 /* Set sourcing_name, so that error messages mention the file name. */ | 2531 /* Set sourcing_name, so that error messages mention the file name. */ |
2374 sourcing_name = fname; | 2532 sourcing_name = fname; |
2375 sourcing_lnum = 0; | 2533 sourcing_lnum = 0; |
2376 | 2534 |
2377 /* <HEADER>: <fileID> | 2535 /* |
2536 * <HEADER>: <fileID> | |
2378 */ | 2537 */ |
2379 for (i = 0; i < VIMSPELLMAGICL; ++i) | 2538 for (i = 0; i < VIMSPELLMAGICL; ++i) |
2380 buf[i] = getc(fd); /* <fileID> */ | 2539 buf[i] = getc(fd); /* <fileID> */ |
2381 if (STRNCMP(buf, VIMSPELLMAGIC, VIMSPELLMAGICL) != 0) | 2540 if (STRNCMP(buf, VIMSPELLMAGIC, VIMSPELLMAGICL) != 0) |
2382 { | 2541 { |
2431 case SN_PREFCOND: | 2590 case SN_PREFCOND: |
2432 res = read_prefcond_section(fd, lp); | 2591 res = read_prefcond_section(fd, lp); |
2433 break; | 2592 break; |
2434 | 2593 |
2435 case SN_REP: | 2594 case SN_REP: |
2436 res = read_rep_section(fd, lp); | 2595 res = read_rep_section(fd, &lp->sl_rep, lp->sl_rep_first); |
2596 break; | |
2597 | |
2598 case SN_REPSAL: | |
2599 res = read_rep_section(fd, &lp->sl_repsal, lp->sl_repsal_first); | |
2437 break; | 2600 break; |
2438 | 2601 |
2439 case SN_SAL: | 2602 case SN_SAL: |
2440 res = read_sal_section(fd, lp); | 2603 res = read_sal_section(fd, lp); |
2441 break; | 2604 break; |
2448 p = read_string(fd, len); /* <mapstr> */ | 2611 p = read_string(fd, len); /* <mapstr> */ |
2449 if (p == NULL) | 2612 if (p == NULL) |
2450 goto endFAIL; | 2613 goto endFAIL; |
2451 set_map_str(lp, p); | 2614 set_map_str(lp, p); |
2452 vim_free(p); | 2615 vim_free(p); |
2616 break; | |
2617 | |
2618 case SN_WORDS: | |
2619 res = read_words_section(fd, lp, len); | |
2620 break; | |
2621 | |
2622 case SN_SUGFILE: | |
2623 for (i = 7; i >= 0; --i) /* <timestamp> */ | |
2624 lp->sl_sugtime += getc(fd) << (i * 8); | |
2453 break; | 2625 break; |
2454 | 2626 |
2455 case SN_COMPOUND: | 2627 case SN_COMPOUND: |
2456 res = read_compound(fd, lp, len); | 2628 res = read_compound(fd, lp, len); |
2457 break; | 2629 break; |
2479 while (--len >= 0) | 2651 while (--len >= 0) |
2480 if (getc(fd) < 0) | 2652 if (getc(fd) < 0) |
2481 goto truncerr; | 2653 goto truncerr; |
2482 break; | 2654 break; |
2483 } | 2655 } |
2656 someerror: | |
2484 if (res == SP_FORMERROR) | 2657 if (res == SP_FORMERROR) |
2485 { | 2658 { |
2486 formerr: | |
2487 EMSG(_(e_format)); | 2659 EMSG(_(e_format)); |
2488 goto endFAIL; | 2660 goto endFAIL; |
2489 } | 2661 } |
2490 if (res == SP_TRUNCERROR) | 2662 if (res == SP_TRUNCERROR) |
2491 { | 2663 { |
2495 } | 2667 } |
2496 if (res == SP_OTHERERROR) | 2668 if (res == SP_OTHERERROR) |
2497 goto endFAIL; | 2669 goto endFAIL; |
2498 } | 2670 } |
2499 | 2671 |
2500 /* round 1: <LWORDTREE> | 2672 /* <LWORDTREE> */ |
2501 * round 2: <KWORDTREE> | 2673 res = spell_read_tree(fd, &lp->sl_fbyts, &lp->sl_fidxs, FALSE, 0); |
2502 * round 3: <PREFIXTREE> */ | 2674 if (res != 0) |
2503 for (round = 1; round <= 3; ++round) | 2675 goto someerror; |
2504 { | 2676 |
2505 /* The tree size was computed when writing the file, so that we can | 2677 /* <KWORDTREE> */ |
2506 * allocate it as one long block. <nodecount> */ | 2678 res = spell_read_tree(fd, &lp->sl_kbyts, &lp->sl_kidxs, FALSE, 0); |
2507 len = (getc(fd) << 24) + (getc(fd) << 16) + (getc(fd) << 8) + getc(fd); | 2679 if (res != 0) |
2508 if (len < 0) | 2680 goto someerror; |
2509 goto truncerr; | 2681 |
2510 if (len > 0) | 2682 /* <PREFIXTREE> */ |
2511 { | 2683 res = spell_read_tree(fd, &lp->sl_pbyts, &lp->sl_pidxs, TRUE, |
2512 /* Allocate the byte array. */ | 2684 lp->sl_prefixcnt); |
2513 bp = lalloc((long_u)len, TRUE); | 2685 if (res != 0) |
2514 if (bp == NULL) | 2686 goto someerror; |
2515 goto endFAIL; | |
2516 if (round == 1) | |
2517 lp->sl_fbyts = bp; | |
2518 else if (round == 2) | |
2519 lp->sl_kbyts = bp; | |
2520 else | |
2521 lp->sl_pbyts = bp; | |
2522 | |
2523 /* Allocate the index array. */ | |
2524 ip = (idx_T *)lalloc_clear((long_u)(len * sizeof(int)), TRUE); | |
2525 if (ip == NULL) | |
2526 goto endFAIL; | |
2527 if (round == 1) | |
2528 lp->sl_fidxs = ip; | |
2529 else if (round == 2) | |
2530 lp->sl_kidxs = ip; | |
2531 else | |
2532 lp->sl_pidxs = ip; | |
2533 | |
2534 /* Read the tree and store it in the array. */ | |
2535 idx = read_tree(fd, bp, ip, len, 0, round == 3, lp->sl_prefixcnt); | |
2536 if (idx == -1) | |
2537 goto truncerr; | |
2538 if (idx < 0) | |
2539 goto formerr; | |
2540 } | |
2541 } | |
2542 | 2687 |
2543 /* For a new file link it in the list of spell files. */ | 2688 /* For a new file link it in the list of spell files. */ |
2544 if (old_lp == NULL) | 2689 if (old_lp == NULL) |
2545 { | 2690 { |
2546 lp->sl_next = first_lang; | 2691 lp->sl_next = first_lang; |
2731 } | 2876 } |
2732 return 0; | 2877 return 0; |
2733 } | 2878 } |
2734 | 2879 |
2735 /* | 2880 /* |
2736 * Read REP items section from "fd": <repcount> <rep> ... | 2881 * Read REP or REPSAL items section from "fd": <repcount> <rep> ... |
2737 * Return SP_*ERROR flags. | 2882 * Return SP_*ERROR flags. |
2738 */ | 2883 */ |
2739 static int | 2884 static int |
2740 read_rep_section(fd, slang) | 2885 read_rep_section(fd, gap, first) |
2741 FILE *fd; | 2886 FILE *fd; |
2742 slang_T *slang; | 2887 garray_T *gap; |
2888 short *first; | |
2743 { | 2889 { |
2744 int cnt; | 2890 int cnt; |
2745 garray_T *gap; | |
2746 fromto_T *ftp; | 2891 fromto_T *ftp; |
2747 short *first; | |
2748 int i; | 2892 int i; |
2749 | 2893 |
2750 cnt = (getc(fd) << 8) + getc(fd); /* <repcount> */ | 2894 cnt = (getc(fd) << 8) + getc(fd); /* <repcount> */ |
2751 if (cnt < 0) | 2895 if (cnt < 0) |
2752 return SP_TRUNCERROR; | 2896 return SP_TRUNCERROR; |
2753 | 2897 |
2754 gap = &slang->sl_rep; | |
2755 if (ga_grow(gap, cnt) == FAIL) | 2898 if (ga_grow(gap, cnt) == FAIL) |
2756 return SP_OTHERERROR; | 2899 return SP_OTHERERROR; |
2757 | 2900 |
2758 /* <rep> : <repfromlen> <repfrom> <reptolen> <repto> */ | 2901 /* <rep> : <repfromlen> <repfrom> <reptolen> <repto> */ |
2759 for (; gap->ga_len < cnt; ++gap->ga_len) | 2902 for (; gap->ga_len < cnt; ++gap->ga_len) |
2773 return SP_FORMERROR; | 2916 return SP_FORMERROR; |
2774 } | 2917 } |
2775 } | 2918 } |
2776 | 2919 |
2777 /* Fill the first-index table. */ | 2920 /* Fill the first-index table. */ |
2778 first = slang->sl_rep_first; | |
2779 for (i = 0; i < 256; ++i) | 2921 for (i = 0; i < 256; ++i) |
2780 first[i] = -1; | 2922 first[i] = -1; |
2781 for (i = 0; i < gap->ga_len; ++i) | 2923 for (i = 0; i < gap->ga_len; ++i) |
2782 { | 2924 { |
2783 ftp = &((fromto_T *)gap->ga_data)[i]; | 2925 ftp = &((fromto_T *)gap->ga_data)[i]; |
2936 | 3078 |
2937 /* Fill the first-index table. */ | 3079 /* Fill the first-index table. */ |
2938 set_sal_first(slang); | 3080 set_sal_first(slang); |
2939 | 3081 |
2940 return 0; | 3082 return 0; |
3083 } | |
3084 | |
3085 /* | |
3086 * Read SN_WORDS: <word> ... | |
3087 * Return SP_*ERROR flags. | |
3088 */ | |
3089 static int | |
3090 read_words_section(fd, lp, len) | |
3091 FILE *fd; | |
3092 slang_T *lp; | |
3093 int len; | |
3094 { | |
3095 int done = 0; | |
3096 int i; | |
3097 char_u word[MAXWLEN]; | |
3098 | |
3099 while (done < len) | |
3100 { | |
3101 /* Read one word at a time. */ | |
3102 for (i = 0; ; ++i) | |
3103 { | |
3104 word[i] = getc(fd); | |
3105 if (word[i] == NUL) | |
3106 break; | |
3107 if (i == MAXWLEN - 1) | |
3108 return SP_FORMERROR; | |
3109 } | |
3110 | |
3111 /* Init the count to 10. */ | |
3112 count_common_word(lp, word, -1, 10); | |
3113 done += i + 1; | |
3114 } | |
3115 return 0; | |
3116 } | |
3117 | |
3118 /* | |
3119 * Add a word to the hashtable of common words. | |
3120 * If it's already there then the counter is increased. | |
3121 */ | |
3122 static void | |
3123 count_common_word(lp, word, len, count) | |
3124 slang_T *lp; | |
3125 char_u *word; | |
3126 int len; /* word length, -1 for upto NUL */ | |
3127 int count; /* 1 to count once, 10 to init */ | |
3128 { | |
3129 hash_T hash; | |
3130 hashitem_T *hi; | |
3131 wordcount_T *wc; | |
3132 char_u buf[MAXWLEN]; | |
3133 char_u *p; | |
3134 | |
3135 if (len == -1) | |
3136 p = word; | |
3137 else | |
3138 { | |
3139 vim_strncpy(buf, word, len); | |
3140 p = buf; | |
3141 } | |
3142 | |
3143 hash = hash_hash(p); | |
3144 hi = hash_lookup(&lp->sl_wordcount, p, hash); | |
3145 if (HASHITEM_EMPTY(hi)) | |
3146 { | |
3147 wc = (wordcount_T *)alloc(sizeof(wordcount_T) + STRLEN(p)); | |
3148 if (wc == NULL) | |
3149 return; | |
3150 STRCPY(wc->wc_word, p); | |
3151 wc->wc_count = count; | |
3152 hash_add_item(&lp->sl_wordcount, hi, wc->wc_word, hash); | |
3153 } | |
3154 else | |
3155 { | |
3156 wc = HI2WC(hi); | |
3157 if ((wc->wc_count += count) < (unsigned)count) /* check for overflow */ | |
3158 wc->wc_count = MAXWORDCOUNT; | |
3159 } | |
3160 } | |
3161 | |
3162 /* | |
3163 * Adjust the score of common words. | |
3164 */ | |
3165 static int | |
3166 score_wordcount_adj(slang, score, word, split) | |
3167 slang_T *slang; | |
3168 int score; | |
3169 char_u *word; | |
3170 int split; /* word was split, less bonus */ | |
3171 { | |
3172 hashitem_T *hi; | |
3173 wordcount_T *wc; | |
3174 int bonus; | |
3175 int newscore; | |
3176 | |
3177 hi = hash_find(&slang->sl_wordcount, word); | |
3178 if (!HASHITEM_EMPTY(hi)) | |
3179 { | |
3180 wc = HI2WC(hi); | |
3181 if (wc->wc_count < SCORE_THRES2) | |
3182 bonus = SCORE_COMMON1; | |
3183 else if (wc->wc_count < SCORE_THRES3) | |
3184 bonus = SCORE_COMMON2; | |
3185 else | |
3186 bonus = SCORE_COMMON3; | |
3187 if (split) | |
3188 newscore = score - bonus / 2; | |
3189 else | |
3190 newscore = score - bonus; | |
3191 if (newscore < 0) | |
3192 return 0; | |
3193 return newscore; | |
3194 } | |
3195 return score; | |
2941 } | 3196 } |
2942 | 3197 |
2943 /* | 3198 /* |
2944 * SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto> | 3199 * SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto> |
2945 * Return SP_*ERROR flags. | 3200 * Return SP_*ERROR flags. |
3432 return res; | 3687 return res; |
3433 } | 3688 } |
3434 #endif | 3689 #endif |
3435 | 3690 |
3436 /* | 3691 /* |
3692 * Read a tree from the .spl or .sug file. | |
3693 * Allocates the memory and stores pointers in "bytsp" and "idxsp". | |
3694 * This is skipped when the tree has zero length. | |
3695 * Returns zero when OK, SP_ value for an error. | |
3696 */ | |
3697 static int | |
3698 spell_read_tree(fd, bytsp, idxsp, prefixtree, prefixcnt) | |
3699 FILE *fd; | |
3700 char_u **bytsp; | |
3701 idx_T **idxsp; | |
3702 int prefixtree; /* TRUE for the prefix tree */ | |
3703 int prefixcnt; /* when "prefixtree" is TRUE: prefix count */ | |
3704 { | |
3705 int len; | |
3706 int idx; | |
3707 char_u *bp; | |
3708 idx_T *ip; | |
3709 | |
3710 /* The tree size was computed when writing the file, so that we can | |
3711 * allocate it as one long block. <nodecount> */ | |
3712 len = (getc(fd) << 24) + (getc(fd) << 16) + (getc(fd) << 8) + getc(fd); | |
3713 if (len < 0) | |
3714 return SP_TRUNCERROR; | |
3715 if (len > 0) | |
3716 { | |
3717 /* Allocate the byte array. */ | |
3718 bp = lalloc((long_u)len, TRUE); | |
3719 if (bp == NULL) | |
3720 return SP_OTHERERROR; | |
3721 *bytsp = bp; | |
3722 | |
3723 /* Allocate the index array. */ | |
3724 ip = (idx_T *)lalloc_clear((long_u)(len * sizeof(int)), TRUE); | |
3725 if (ip == NULL) | |
3726 return SP_OTHERERROR; | |
3727 *idxsp = ip; | |
3728 | |
3729 /* Recursively read the tree and store it in the array. */ | |
3730 idx = read_tree_node(fd, bp, ip, len, 0, prefixtree, prefixcnt); | |
3731 if (idx < 0) | |
3732 return idx; | |
3733 } | |
3734 return 0; | |
3735 } | |
3736 | |
3737 /* | |
3437 * Read one row of siblings from the spell file and store it in the byte array | 3738 * Read one row of siblings from the spell file and store it in the byte array |
3438 * "byts" and index array "idxs". Recursively read the children. | 3739 * "byts" and index array "idxs". Recursively read the children. |
3439 * | 3740 * |
3440 * NOTE: The code here must match put_node(). | 3741 * NOTE: The code here must match put_node()! |
3441 * | 3742 * |
3442 * Returns the index follosing the siblings. | 3743 * Returns the index (>= 0) following the siblings. |
3443 * Returns -1 if the file is shorter than expected. | 3744 * Returns SP_TRUNCERROR if the file is shorter than expected. |
3444 * Returns -2 if there is a format error. | 3745 * Returns SP_FORMERROR if there is a format error. |
3445 */ | 3746 */ |
3446 static idx_T | 3747 static idx_T |
3447 read_tree(fd, byts, idxs, maxidx, startidx, prefixtree, maxprefcondnr) | 3748 read_tree_node(fd, byts, idxs, maxidx, startidx, prefixtree, maxprefcondnr) |
3448 FILE *fd; | 3749 FILE *fd; |
3449 char_u *byts; | 3750 char_u *byts; |
3450 idx_T *idxs; | 3751 idx_T *idxs; |
3451 int maxidx; /* size of arrays */ | 3752 int maxidx; /* size of arrays */ |
3452 idx_T startidx; /* current index in "byts" and "idxs" */ | 3753 idx_T startidx; /* current index in "byts" and "idxs" */ |
3461 int c2; | 3762 int c2; |
3462 #define SHARED_MASK 0x8000000 | 3763 #define SHARED_MASK 0x8000000 |
3463 | 3764 |
3464 len = getc(fd); /* <siblingcount> */ | 3765 len = getc(fd); /* <siblingcount> */ |
3465 if (len <= 0) | 3766 if (len <= 0) |
3466 return -1; | 3767 return SP_TRUNCERROR; |
3467 | 3768 |
3468 if (startidx + len >= maxidx) | 3769 if (startidx + len >= maxidx) |
3469 return -2; | 3770 return SP_FORMERROR; |
3470 byts[idx++] = len; | 3771 byts[idx++] = len; |
3471 | 3772 |
3472 /* Read the byte values, flag/region bytes and shared indexes. */ | 3773 /* Read the byte values, flag/region bytes and shared indexes. */ |
3473 for (i = 1; i <= len; ++i) | 3774 for (i = 1; i <= len; ++i) |
3474 { | 3775 { |
3475 c = getc(fd); /* <byte> */ | 3776 c = getc(fd); /* <byte> */ |
3476 if (c < 0) | 3777 if (c < 0) |
3477 return -1; | 3778 return SP_TRUNCERROR; |
3478 if (c <= BY_SPECIAL) | 3779 if (c <= BY_SPECIAL) |
3479 { | 3780 { |
3480 if (c == BY_NOFLAGS && !prefixtree) | 3781 if (c == BY_NOFLAGS && !prefixtree) |
3481 { | 3782 { |
3482 /* No flags, all regions. */ | 3783 /* No flags, all regions. */ |
3498 | 3799 |
3499 c |= getc(fd); /* <affixID> */ | 3800 c |= getc(fd); /* <affixID> */ |
3500 | 3801 |
3501 n = (getc(fd) << 8) + getc(fd); /* <prefcondnr> */ | 3802 n = (getc(fd) << 8) + getc(fd); /* <prefcondnr> */ |
3502 if (n >= maxprefcondnr) | 3803 if (n >= maxprefcondnr) |
3503 return -2; | 3804 return SP_FORMERROR; |
3504 c |= (n << 8); | 3805 c |= (n << 8); |
3505 } | 3806 } |
3506 else /* c must be BY_FLAGS or BY_FLAGS2 */ | 3807 else /* c must be BY_FLAGS or BY_FLAGS2 */ |
3507 { | 3808 { |
3508 /* Read flags and optional region and prefix ID. In | 3809 /* Read flags and optional region and prefix ID. In |
3524 else /* c == BY_INDEX */ | 3825 else /* c == BY_INDEX */ |
3525 { | 3826 { |
3526 /* <nodeidx> */ | 3827 /* <nodeidx> */ |
3527 n = (getc(fd) << 16) + (getc(fd) << 8) + getc(fd); | 3828 n = (getc(fd) << 16) + (getc(fd) << 8) + getc(fd); |
3528 if (n < 0 || n >= maxidx) | 3829 if (n < 0 || n >= maxidx) |
3529 return -2; | 3830 return SP_FORMERROR; |
3530 idxs[idx] = n + SHARED_MASK; | 3831 idxs[idx] = n + SHARED_MASK; |
3531 c = getc(fd); /* <xbyte> */ | 3832 c = getc(fd); /* <xbyte> */ |
3532 } | 3833 } |
3533 } | 3834 } |
3534 byts[idx++] = c; | 3835 byts[idx++] = c; |
3543 if (idxs[startidx + i] & SHARED_MASK) | 3844 if (idxs[startidx + i] & SHARED_MASK) |
3544 idxs[startidx + i] &= ~SHARED_MASK; | 3845 idxs[startidx + i] &= ~SHARED_MASK; |
3545 else | 3846 else |
3546 { | 3847 { |
3547 idxs[startidx + i] = idx; | 3848 idxs[startidx + i] = idx; |
3548 idx = read_tree(fd, byts, idxs, maxidx, idx, | 3849 idx = read_tree_node(fd, byts, idxs, maxidx, idx, |
3549 prefixtree, maxprefcondnr); | 3850 prefixtree, maxprefcondnr); |
3550 if (idx < 0) | 3851 if (idx < 0) |
3551 break; | 3852 break; |
3552 } | 3853 } |
3553 } | 3854 } |
3818 /* REP items */ | 4119 /* REP items */ |
3819 if (lp->lp_slang->sl_rep.ga_len > 0) | 4120 if (lp->lp_slang->sl_rep.ga_len > 0) |
3820 /* language has REP items itself */ | 4121 /* language has REP items itself */ |
3821 lp->lp_replang = lp->lp_slang; | 4122 lp->lp_replang = lp->lp_slang; |
3822 else | 4123 else |
3823 /* find first similar language that does sound folding */ | 4124 /* find first similar language that has REP items */ |
3824 for (j = 0; j < ga.ga_len; ++j) | 4125 for (j = 0; j < ga.ga_len; ++j) |
3825 { | 4126 { |
3826 lp2 = LANGP_ENTRY(ga, j); | 4127 lp2 = LANGP_ENTRY(ga, j); |
3827 if (lp2->lp_slang->sl_rep.ga_len > 0 | 4128 if (lp2->lp_slang->sl_rep.ga_len > 0 |
3828 && STRNCMP(lp->lp_slang->sl_name, | 4129 && STRNCMP(lp->lp_slang->sl_name, |
4237 int wn_refs; /* Nr. of references to this node. Only | 4538 int wn_refs; /* Nr. of references to this node. Only |
4238 relevant for first node in a list of | 4539 relevant for first node in a list of |
4239 siblings, in following siblings it is | 4540 siblings, in following siblings it is |
4240 always one. */ | 4541 always one. */ |
4241 char_u wn_byte; /* Byte for this node. NUL for word end */ | 4542 char_u wn_byte; /* Byte for this node. NUL for word end */ |
4242 char_u wn_affixID; /* when "wn_byte" is NUL: supported/required | 4543 |
4243 prefix ID or 0 */ | 4544 /* Info for when "wn_byte" is NUL. |
4244 short_u wn_flags; /* when "wn_byte" is NUL: WF_ flags */ | 4545 * In PREFIXTREE "wn_region" is used for the prefcondnr. |
4245 short wn_region; /* when "wn_byte" is NUL: region mask; for | 4546 * In the soundfolded word tree "wn_flags" has the MSW of the wordnr and |
4246 PREFIXTREE it's the prefcondnr */ | 4547 * "wn_region" the LSW of the wordnr. */ |
4548 char_u wn_affixID; /* supported/required prefix ID or 0 */ | |
4549 short_u wn_flags; /* WF_ flags */ | |
4550 short wn_region; /* region mask */ | |
4551 | |
4247 #ifdef SPELL_PRINTTREE | 4552 #ifdef SPELL_PRINTTREE |
4248 int wn_nr; /* sequence nr for printing */ | 4553 int wn_nr; /* sequence nr for printing */ |
4249 #endif | 4554 #endif |
4250 }; | 4555 }; |
4251 | 4556 |
4263 | 4568 |
4264 wordnode_T *si_keeproot; /* tree with keep-case words */ | 4569 wordnode_T *si_keeproot; /* tree with keep-case words */ |
4265 long si_keepwcount; /* nr of words in si_keeproot */ | 4570 long si_keepwcount; /* nr of words in si_keeproot */ |
4266 | 4571 |
4267 wordnode_T *si_prefroot; /* tree with postponed prefixes */ | 4572 wordnode_T *si_prefroot; /* tree with postponed prefixes */ |
4573 | |
4574 long si_sugtree; /* creating the soundfolding trie */ | |
4268 | 4575 |
4269 sblock_T *si_blocks; /* memory blocks used */ | 4576 sblock_T *si_blocks; /* memory blocks used */ |
4270 long si_blocks_cnt; /* memory blocks allocated */ | 4577 long si_blocks_cnt; /* memory blocks allocated */ |
4271 long si_compress_cnt; /* words to add before lowering | 4578 long si_compress_cnt; /* words to add before lowering |
4272 compression limit */ | 4579 compression limit */ |
4274 compression, linked by "wn_child" field. */ | 4581 compression, linked by "wn_child" field. */ |
4275 long si_free_count; /* number of nodes in si_first_free */ | 4582 long si_free_count; /* number of nodes in si_first_free */ |
4276 #ifdef SPELL_PRINTTREE | 4583 #ifdef SPELL_PRINTTREE |
4277 int si_wordnode_nr; /* sequence nr for nodes */ | 4584 int si_wordnode_nr; /* sequence nr for nodes */ |
4278 #endif | 4585 #endif |
4279 | 4586 buf_T *si_spellbuf; /* buffer used to store soundfold word table */ |
4280 | 4587 |
4281 int si_ascii; /* handling only ASCII words */ | 4588 int si_ascii; /* handling only ASCII words */ |
4282 int si_add; /* addition file */ | 4589 int si_add; /* addition file */ |
4283 int si_clear_chartab; /* when TRUE clear char tables */ | 4590 int si_clear_chartab; /* when TRUE clear char tables */ |
4284 int si_region; /* region mask */ | 4591 int si_region; /* region mask */ |
4290 are no regions) */ | 4597 are no regions) */ |
4291 char_u si_region_name[16]; /* region names; used only if | 4598 char_u si_region_name[16]; /* region names; used only if |
4292 * si_region_count > 1) */ | 4599 * si_region_count > 1) */ |
4293 | 4600 |
4294 garray_T si_rep; /* list of fromto_T entries from REP lines */ | 4601 garray_T si_rep; /* list of fromto_T entries from REP lines */ |
4602 garray_T si_repsal; /* list of fromto_T entries from REPSAL lines */ | |
4295 garray_T si_sal; /* list of fromto_T entries from SAL lines */ | 4603 garray_T si_sal; /* list of fromto_T entries from SAL lines */ |
4296 char_u *si_sofofr; /* SOFOFROM text */ | 4604 char_u *si_sofofr; /* SOFOFROM text */ |
4297 char_u *si_sofoto; /* SOFOTO text */ | 4605 char_u *si_sofoto; /* SOFOTO text */ |
4606 int si_nosugfile; /* NOSUGFILE item found */ | |
4298 int si_followup; /* soundsalike: ? */ | 4607 int si_followup; /* soundsalike: ? */ |
4299 int si_collapse; /* soundsalike: ? */ | 4608 int si_collapse; /* soundsalike: ? */ |
4609 hashtab_T si_commonwords; /* hashtable for common words */ | |
4610 time_t si_sugtime; /* timestamp for .sug file */ | |
4300 int si_rem_accents; /* soundsalike: remove accents */ | 4611 int si_rem_accents; /* soundsalike: remove accents */ |
4301 garray_T si_map; /* MAP info concatenated */ | 4612 garray_T si_map; /* MAP info concatenated */ |
4302 char_u *si_midword; /* MIDWORD chars or NULL */ | 4613 char_u *si_midword; /* MIDWORD chars or NULL */ |
4303 int si_compmax; /* max nr of words for compounding */ | 4614 int si_compmax; /* max nr of words for compounding */ |
4304 int si_compminlen; /* minimal length for compounding */ | 4615 int si_compminlen; /* minimal length for compounding */ |
4335 static void free_blocks __ARGS((sblock_T *bl)); | 4646 static void free_blocks __ARGS((sblock_T *bl)); |
4336 static wordnode_T *wordtree_alloc __ARGS((spellinfo_T *spin)); | 4647 static wordnode_T *wordtree_alloc __ARGS((spellinfo_T *spin)); |
4337 static int store_word __ARGS((spellinfo_T *spin, char_u *word, int flags, int region, char_u *pfxlist, int need_affix)); | 4648 static int store_word __ARGS((spellinfo_T *spin, char_u *word, int flags, int region, char_u *pfxlist, int need_affix)); |
4338 static int tree_add_word __ARGS((spellinfo_T *spin, char_u *word, wordnode_T *tree, int flags, int region, int affixID)); | 4649 static int tree_add_word __ARGS((spellinfo_T *spin, char_u *word, wordnode_T *tree, int flags, int region, int affixID)); |
4339 static wordnode_T *get_wordnode __ARGS((spellinfo_T *spin)); | 4650 static wordnode_T *get_wordnode __ARGS((spellinfo_T *spin)); |
4340 static void deref_wordnode __ARGS((spellinfo_T *spin, wordnode_T *node)); | 4651 static int deref_wordnode __ARGS((spellinfo_T *spin, wordnode_T *node)); |
4341 static void free_wordnode __ARGS((spellinfo_T *spin, wordnode_T *n)); | 4652 static void free_wordnode __ARGS((spellinfo_T *spin, wordnode_T *n)); |
4342 static void wordtree_compress __ARGS((spellinfo_T *spin, wordnode_T *root)); | 4653 static void wordtree_compress __ARGS((spellinfo_T *spin, wordnode_T *root)); |
4343 static int node_compress __ARGS((spellinfo_T *spin, wordnode_T *node, hashtab_T *ht, int *tot)); | 4654 static int node_compress __ARGS((spellinfo_T *spin, wordnode_T *node, hashtab_T *ht, int *tot)); |
4344 static int node_equal __ARGS((wordnode_T *n1, wordnode_T *n2)); | 4655 static int node_equal __ARGS((wordnode_T *n1, wordnode_T *n2)); |
4656 static void put_sugtime __ARGS((spellinfo_T *spin, FILE *fd)); | |
4345 static int write_vim_spell __ARGS((spellinfo_T *spin, char_u *fname)); | 4657 static int write_vim_spell __ARGS((spellinfo_T *spin, char_u *fname)); |
4346 static void clear_node __ARGS((wordnode_T *node)); | 4658 static void clear_node __ARGS((wordnode_T *node)); |
4347 static int put_node __ARGS((FILE *fd, wordnode_T *node, int index, int regionmask, int prefixtree)); | 4659 static int put_node __ARGS((FILE *fd, wordnode_T *node, int index, int regionmask, int prefixtree)); |
4660 static void spell_make_sugfile __ARGS((spellinfo_T *spin, char_u *wfname)); | |
4661 static int sug_filltree __ARGS((spellinfo_T *spin, slang_T *slang)); | |
4662 static int sug_maketable __ARGS((spellinfo_T *spin)); | |
4663 static int sug_filltable __ARGS((spellinfo_T *spin, wordnode_T *node, int startwordnr, garray_T *gap)); | |
4664 static int offset2bytes __ARGS((int nr, char_u *buf)); | |
4665 static int bytes2offset __ARGS((char_u **pp)); | |
4666 static void sug_write __ARGS((spellinfo_T *spin, char_u *fname)); | |
4348 static void mkspell __ARGS((int fcount, char_u **fnames, int ascii, int overwrite, int added_word)); | 4667 static void mkspell __ARGS((int fcount, char_u **fnames, int ascii, int overwrite, int added_word)); |
4668 static void spell_message __ARGS((spellinfo_T *spin, char_u *str)); | |
4349 static void init_spellfile __ARGS((void)); | 4669 static void init_spellfile __ARGS((void)); |
4350 | 4670 |
4351 /* In the postponed prefixes tree wn_flags is used to store the WFP_ flags, | 4671 /* In the postponed prefixes tree wn_flags is used to store the WFP_ flags, |
4352 * but it must be negative to indicate the prefix tree to tree_add_word(). | 4672 * but it must be negative to indicate the prefix tree to tree_add_word(). |
4353 * Use a negative number with the lower 8 bits zero. */ | 4673 * Use a negative number with the lower 8 bits zero. */ |
4473 FILE *fd; | 4793 FILE *fd; |
4474 afffile_T *aff; | 4794 afffile_T *aff; |
4475 char_u rline[MAXLINELEN]; | 4795 char_u rline[MAXLINELEN]; |
4476 char_u *line; | 4796 char_u *line; |
4477 char_u *pc = NULL; | 4797 char_u *pc = NULL; |
4478 #define MAXITEMCNT 7 | 4798 #define MAXITEMCNT 30 |
4479 char_u *(items[MAXITEMCNT]); | 4799 char_u *(items[MAXITEMCNT]); |
4480 int itemcnt; | 4800 int itemcnt; |
4481 char_u *p; | 4801 char_u *p; |
4482 int lnum = 0; | 4802 int lnum = 0; |
4483 affheader_T *cur_aff = NULL; | 4803 affheader_T *cur_aff = NULL; |
4486 hashtab_T *tp; | 4806 hashtab_T *tp; |
4487 char_u *low = NULL; | 4807 char_u *low = NULL; |
4488 char_u *fol = NULL; | 4808 char_u *fol = NULL; |
4489 char_u *upp = NULL; | 4809 char_u *upp = NULL; |
4490 int do_rep; | 4810 int do_rep; |
4811 int do_repsal; | |
4491 int do_sal; | 4812 int do_sal; |
4492 int do_map; | 4813 int do_map; |
4493 int found_map = FALSE; | 4814 int found_map = FALSE; |
4494 hashitem_T *hi; | 4815 hashitem_T *hi; |
4495 int l; | 4816 int l; |
4511 { | 4832 { |
4512 EMSG2(_(e_notopen), fname); | 4833 EMSG2(_(e_notopen), fname); |
4513 return NULL; | 4834 return NULL; |
4514 } | 4835 } |
4515 | 4836 |
4516 if (spin->si_verbose || p_verbose > 2) | 4837 vim_snprintf((char *)IObuff, IOSIZE, _("Reading affix file %s ..."), fname); |
4517 { | 4838 spell_message(spin, IObuff); |
4518 if (!spin->si_verbose) | |
4519 verbose_enter(); | |
4520 smsg((char_u *)_("Reading affix file %s ..."), fname); | |
4521 out_flush(); | |
4522 if (!spin->si_verbose) | |
4523 verbose_leave(); | |
4524 } | |
4525 | 4839 |
4526 /* Only do REP lines when not done in another .aff file already. */ | 4840 /* Only do REP lines when not done in another .aff file already. */ |
4527 do_rep = spin->si_rep.ga_len == 0; | 4841 do_rep = spin->si_rep.ga_len == 0; |
4842 | |
4843 /* Only do REPSAL lines when not done in another .aff file already. */ | |
4844 do_repsal = spin->si_repsal.ga_len == 0; | |
4528 | 4845 |
4529 /* Only do SAL lines when not done in another .aff file already. */ | 4846 /* Only do SAL lines when not done in another .aff file already. */ |
4530 do_sal = spin->si_sal.ga_len == 0; | 4847 do_sal = spin->si_sal.ga_len == 0; |
4531 | 4848 |
4532 /* Only do MAP lines when not done in another .aff file already. */ | 4849 /* Only do MAP lines when not done in another .aff file already. */ |
4753 syllable = getroom_save(spin, items[1]); | 5070 syllable = getroom_save(spin, items[1]); |
4754 } | 5071 } |
4755 else if (STRCMP(items[0], "NOBREAK") == 0 && itemcnt == 1) | 5072 else if (STRCMP(items[0], "NOBREAK") == 0 && itemcnt == 1) |
4756 { | 5073 { |
4757 spin->si_nobreak = TRUE; | 5074 spin->si_nobreak = TRUE; |
5075 } | |
5076 else if (STRCMP(items[0], "NOSUGFILE") == 0 && itemcnt == 1) | |
5077 { | |
5078 spin->si_nosugfile = TRUE; | |
4758 } | 5079 } |
4759 else if (STRCMP(items[0], "PFXPOSTPONE") == 0 && itemcnt == 1) | 5080 else if (STRCMP(items[0], "PFXPOSTPONE") == 0 && itemcnt == 1) |
4760 { | 5081 { |
4761 aff->af_pfxpostpone = TRUE; | 5082 aff->af_pfxpostpone = TRUE; |
4762 } | 5083 } |
5059 else if (STRCMP(items[0], "UPP") == 0 && itemcnt == 2 | 5380 else if (STRCMP(items[0], "UPP") == 0 && itemcnt == 2 |
5060 && upp == NULL) | 5381 && upp == NULL) |
5061 { | 5382 { |
5062 upp = vim_strsave(items[1]); | 5383 upp = vim_strsave(items[1]); |
5063 } | 5384 } |
5064 else if (STRCMP(items[0], "REP") == 0 && itemcnt == 2) | 5385 else if ((STRCMP(items[0], "REP") == 0 |
5065 { | 5386 || STRCMP(items[0], "REPSAL") == 0) |
5066 /* Ignore REP count */; | 5387 && itemcnt == 2) |
5388 { | |
5389 /* Ignore REP/REPSAL count */; | |
5067 if (!isdigit(*items[1])) | 5390 if (!isdigit(*items[1])) |
5068 smsg((char_u *)_("Expected REP count in %s line %d"), | 5391 smsg((char_u *)_("Expected REP(SAL) count in %s line %d"), |
5069 fname, lnum); | 5392 fname, lnum); |
5070 } | 5393 } |
5071 else if (STRCMP(items[0], "REP") == 0 && itemcnt >= 3) | 5394 else if ((STRCMP(items[0], "REP") == 0 |
5072 { | 5395 || STRCMP(items[0], "REPSAL") == 0) |
5073 /* REP item */ | 5396 && itemcnt >= 3) |
5397 { | |
5398 /* REP/REPSAL item */ | |
5074 /* Myspell ignores extra arguments, we require it starts with | 5399 /* Myspell ignores extra arguments, we require it starts with |
5075 * # to detect mistakes. */ | 5400 * # to detect mistakes. */ |
5076 if (itemcnt > 3 && items[3][0] != '#') | 5401 if (itemcnt > 3 && items[3][0] != '#') |
5077 smsg((char_u *)_(e_afftrailing), fname, lnum, items[3]); | 5402 smsg((char_u *)_(e_afftrailing), fname, lnum, items[3]); |
5078 if (do_rep) | 5403 if (items[0][3] == 'S' ? do_repsal : do_rep) |
5079 { | 5404 { |
5080 /* Replace underscore with space (can't include a space | 5405 /* Replace underscore with space (can't include a space |
5081 * directly). */ | 5406 * directly). */ |
5082 for (p = items[1]; *p != NUL; mb_ptr_adv(p)) | 5407 for (p = items[1]; *p != NUL; mb_ptr_adv(p)) |
5083 if (*p == '_') | 5408 if (*p == '_') |
5084 *p = ' '; | 5409 *p = ' '; |
5085 for (p = items[2]; *p != NUL; mb_ptr_adv(p)) | 5410 for (p = items[2]; *p != NUL; mb_ptr_adv(p)) |
5086 if (*p == '_') | 5411 if (*p == '_') |
5087 *p = ' '; | 5412 *p = ' '; |
5088 add_fromto(spin, &spin->si_rep, items[1], items[2]); | 5413 add_fromto(spin, items[0][3] == 'S' |
5414 ? &spin->si_repsal | |
5415 : &spin->si_rep, items[1], items[2]); | |
5089 } | 5416 } |
5090 } | 5417 } |
5091 else if (STRCMP(items[0], "MAP") == 0 && itemcnt == 2) | 5418 else if (STRCMP(items[0], "MAP") == 0 && itemcnt == 2) |
5092 { | 5419 { |
5093 /* MAP item or count */ | 5420 /* MAP item or count */ |
5153 } | 5480 } |
5154 else if (STRCMP(items[0], "SOFOTO") == 0 && itemcnt == 2 | 5481 else if (STRCMP(items[0], "SOFOTO") == 0 && itemcnt == 2 |
5155 && sofoto == NULL) | 5482 && sofoto == NULL) |
5156 { | 5483 { |
5157 sofoto = getroom_save(spin, items[1]); | 5484 sofoto = getroom_save(spin, items[1]); |
5485 } | |
5486 else if (STRCMP(items[0], "COMMON") == 0) | |
5487 { | |
5488 int i; | |
5489 | |
5490 for (i = 1; i < itemcnt; ++i) | |
5491 { | |
5492 if (HASHITEM_EMPTY(hash_find(&spin->si_commonwords, | |
5493 items[i]))) | |
5494 { | |
5495 p = vim_strsave(items[i]); | |
5496 if (p == NULL) | |
5497 break; | |
5498 hash_add(&spin->si_commonwords, p); | |
5499 } | |
5500 } | |
5158 } | 5501 } |
5159 else | 5502 else |
5160 smsg((char_u *)_("Unrecognized or duplicate item in %s line %d: %s"), | 5503 smsg((char_u *)_("Unrecognized or duplicate item in %s line %d: %s"), |
5161 fname, lnum, items[0]); | 5504 fname, lnum, items[0]); |
5162 } | 5505 } |
5663 } | 6006 } |
5664 | 6007 |
5665 /* The hashtable is only used to detect duplicated words. */ | 6008 /* The hashtable is only used to detect duplicated words. */ |
5666 hash_init(&ht); | 6009 hash_init(&ht); |
5667 | 6010 |
5668 if (spin->si_verbose || p_verbose > 2) | 6011 vim_snprintf((char *)IObuff, IOSIZE, |
5669 { | 6012 _("Reading dictionary file %s ..."), fname); |
5670 if (!spin->si_verbose) | 6013 spell_message(spin, IObuff); |
5671 verbose_enter(); | |
5672 smsg((char_u *)_("Reading dictionary file %s ..."), fname); | |
5673 out_flush(); | |
5674 if (!spin->si_verbose) | |
5675 verbose_leave(); | |
5676 } | |
5677 | 6014 |
5678 /* start with a message for the first line */ | 6015 /* start with a message for the first line */ |
5679 spin->si_msg_count = 999999; | 6016 spin->si_msg_count = 999999; |
5680 | 6017 |
5681 /* Read and ignore the first line: word count. */ | 6018 /* Read and ignore the first line: word count. */ |
6120 { | 6457 { |
6121 EMSG2(_(e_notopen), fname); | 6458 EMSG2(_(e_notopen), fname); |
6122 return FAIL; | 6459 return FAIL; |
6123 } | 6460 } |
6124 | 6461 |
6125 if (spin->si_verbose || p_verbose > 2) | 6462 vim_snprintf((char *)IObuff, IOSIZE, _("Reading word file %s ..."), fname); |
6126 { | 6463 spell_message(spin, IObuff); |
6127 if (!spin->si_verbose) | |
6128 verbose_enter(); | |
6129 smsg((char_u *)_("Reading word file %s ..."), fname); | |
6130 out_flush(); | |
6131 if (!spin->si_verbose) | |
6132 verbose_leave(); | |
6133 } | |
6134 | 6464 |
6135 /* | 6465 /* |
6136 * Read all the lines in the file one by one. | 6466 * Read all the lines in the file one by one. |
6137 */ | 6467 */ |
6138 while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int) | 6468 while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int) |
6292 } | 6622 } |
6293 | 6623 |
6294 vim_free(pc); | 6624 vim_free(pc); |
6295 fclose(fd); | 6625 fclose(fd); |
6296 | 6626 |
6297 if (spin->si_ascii && non_ascii > 0 && (spin->si_verbose || p_verbose > 2)) | 6627 if (spin->si_ascii && non_ascii > 0) |
6298 { | 6628 { |
6299 if (p_verbose > 2) | 6629 vim_snprintf((char *)IObuff, IOSIZE, |
6300 verbose_enter(); | 6630 _("Ignored %d words with non-ASCII characters"), non_ascii); |
6301 smsg((char_u *)_("Ignored %d words with non-ASCII characters"), | 6631 spell_message(spin, IObuff); |
6302 non_ascii); | 6632 } |
6303 if (p_verbose > 2) | 6633 |
6304 verbose_leave(); | |
6305 } | |
6306 return retval; | 6634 return retval; |
6307 } | 6635 } |
6308 | 6636 |
6309 /* | 6637 /* |
6310 * Get part of an sblock_T, "len" bytes long. | 6638 * Get part of an sblock_T, "len" bytes long. |
6440 return res; | 6768 return res; |
6441 } | 6769 } |
6442 | 6770 |
6443 /* | 6771 /* |
6444 * Add word "word" to a word tree at "root". | 6772 * Add word "word" to a word tree at "root". |
6445 * When "flags" < 0 we are adding to the prefix tree where flags is used for | 6773 * When "flags" < 0 we are adding to the prefix tree where "flags" is used for |
6446 * "rare" and "region" is the condition nr. | 6774 * "rare" and "region" is the condition nr. |
6447 * Returns FAIL when out of memory. | 6775 * Returns FAIL when out of memory. |
6448 */ | 6776 */ |
6449 static int | 6777 static int |
6450 tree_add_word(spin, word, root, flags, region, affixID) | 6778 tree_add_word(spin, word, root, flags, region, affixID) |
6505 * done on flags and then on affixID. */ | 6833 * done on flags and then on affixID. */ |
6506 while (node != NULL | 6834 while (node != NULL |
6507 && (node->wn_byte < word[i] | 6835 && (node->wn_byte < word[i] |
6508 || (node->wn_byte == NUL | 6836 || (node->wn_byte == NUL |
6509 && (flags < 0 | 6837 && (flags < 0 |
6510 ? node->wn_affixID < affixID | 6838 ? node->wn_affixID < (unsigned)affixID |
6511 : node->wn_flags < (flags & WN_MASK) | 6839 : (node->wn_flags < (unsigned)(flags & WN_MASK) |
6512 || (node->wn_flags == (flags & WN_MASK) | 6840 || (node->wn_flags == (flags & WN_MASK) |
6513 && node->wn_affixID < affixID))))) | 6841 && (spin->si_sugtree |
6842 ? (node->wn_region & 0xffff) < region | |
6843 : node->wn_affixID | |
6844 < (unsigned)affixID))))))) | |
6514 { | 6845 { |
6515 prev = &node->wn_sibling; | 6846 prev = &node->wn_sibling; |
6516 node = *prev; | 6847 node = *prev; |
6517 } | 6848 } |
6518 if (node == NULL | 6849 if (node == NULL |
6519 || node->wn_byte != word[i] | 6850 || node->wn_byte != word[i] |
6520 || (word[i] == NUL | 6851 || (word[i] == NUL |
6521 && (flags < 0 | 6852 && (flags < 0 |
6853 || spin->si_sugtree | |
6522 || node->wn_flags != (flags & WN_MASK) | 6854 || node->wn_flags != (flags & WN_MASK) |
6523 || node->wn_affixID != affixID))) | 6855 || node->wn_affixID != affixID))) |
6524 { | 6856 { |
6525 /* Allocate a new node. */ | 6857 /* Allocate a new node. */ |
6526 np = get_wordnode(spin); | 6858 np = get_wordnode(spin); |
6604 out_flush(); | 6936 out_flush(); |
6605 } | 6937 } |
6606 | 6938 |
6607 /* Compress both trees. Either they both have many nodes, which makes | 6939 /* Compress both trees. Either they both have many nodes, which makes |
6608 * compression useful, or one of them is small, which means | 6940 * compression useful, or one of them is small, which means |
6609 * compression goes fast. */ | 6941 * compression goes fast. But when filling the soundfold word tree |
6942 * there is no keep-case tree. */ | |
6610 wordtree_compress(spin, spin->si_foldroot); | 6943 wordtree_compress(spin, spin->si_foldroot); |
6611 wordtree_compress(spin, spin->si_keeproot); | 6944 if (affixID >= 0) |
6945 wordtree_compress(spin, spin->si_keeproot); | |
6612 } | 6946 } |
6613 | 6947 |
6614 return OK; | 6948 return OK; |
6615 } | 6949 } |
6616 | 6950 |
6682 | 7016 |
6683 /* | 7017 /* |
6684 * Decrement the reference count on a node (which is the head of a list of | 7018 * Decrement the reference count on a node (which is the head of a list of |
6685 * siblings). If the reference count becomes zero free the node and its | 7019 * siblings). If the reference count becomes zero free the node and its |
6686 * siblings. | 7020 * siblings. |
6687 */ | 7021 * Returns the number of nodes actually freed. |
6688 static void | 7022 */ |
7023 static int | |
6689 deref_wordnode(spin, node) | 7024 deref_wordnode(spin, node) |
6690 spellinfo_T *spin; | 7025 spellinfo_T *spin; |
6691 wordnode_T *node; | 7026 wordnode_T *node; |
6692 { | 7027 { |
6693 wordnode_T *np; | 7028 wordnode_T *np; |
7029 int cnt = 0; | |
6694 | 7030 |
6695 if (--node->wn_refs == 0) | 7031 if (--node->wn_refs == 0) |
7032 { | |
6696 for (np = node; np != NULL; np = np->wn_sibling) | 7033 for (np = node; np != NULL; np = np->wn_sibling) |
6697 { | 7034 { |
6698 if (np->wn_child != NULL) | 7035 if (np->wn_child != NULL) |
6699 deref_wordnode(spin, np->wn_child); | 7036 cnt += deref_wordnode(spin, np->wn_child); |
6700 free_wordnode(spin, np); | 7037 free_wordnode(spin, np); |
6701 } | 7038 ++cnt; |
7039 } | |
7040 ++cnt; /* length field */ | |
7041 } | |
7042 return cnt; | |
6702 } | 7043 } |
6703 | 7044 |
6704 /* | 7045 /* |
6705 * Free a wordnode_T for re-use later. | 7046 * Free a wordnode_T for re-use later. |
6706 * Only the "wn_child" field becomes invalid. | 7047 * Only the "wn_child" field becomes invalid. |
6737 | 7078 |
6738 #ifndef SPELL_PRINTTREE | 7079 #ifndef SPELL_PRINTTREE |
6739 if (spin->si_verbose || p_verbose > 2) | 7080 if (spin->si_verbose || p_verbose > 2) |
6740 #endif | 7081 #endif |
6741 { | 7082 { |
6742 if (!spin->si_verbose) | |
6743 verbose_enter(); | |
6744 if (tot > 1000000) | 7083 if (tot > 1000000) |
6745 perc = (tot - n) / (tot / 100); | 7084 perc = (tot - n) / (tot / 100); |
6746 else if (tot == 0) | 7085 else if (tot == 0) |
6747 perc = 0; | 7086 perc = 0; |
6748 else | 7087 else |
6749 perc = (tot - n) * 100 / tot; | 7088 perc = (tot - n) * 100 / tot; |
6750 smsg((char_u *)_("Compressed %d of %d nodes; %d%% remaining"), | 7089 vim_snprintf((char *)IObuff, IOSIZE, |
6751 n, tot, perc); | 7090 _("Compressed %d of %d nodes; %d (%d%%) remaining"), |
6752 if (p_verbose > 2) | 7091 n, tot, tot - n, perc); |
6753 verbose_leave(); | 7092 spell_message(spin, IObuff); |
6754 } | 7093 } |
6755 #ifdef SPELL_PRINTTREE | 7094 #ifdef SPELL_PRINTTREE |
6756 spell_print_tree(root->wn_sibling); | 7095 spell_print_tree(root->wn_sibling); |
6757 #endif | 7096 #endif |
6758 hash_clear(&ht); | 7097 hash_clear(&ht); |
6782 | 7121 |
6783 /* | 7122 /* |
6784 * Go through the list of siblings. Compress each child and then try | 7123 * Go through the list of siblings. Compress each child and then try |
6785 * finding an identical child to replace it. | 7124 * finding an identical child to replace it. |
6786 * Note that with "child" we mean not just the node that is pointed to, | 7125 * Note that with "child" we mean not just the node that is pointed to, |
6787 * but the whole list of siblings, of which the node is the first. | 7126 * but the whole list of siblings of which the child node is the first. |
6788 */ | 7127 */ |
6789 for (np = node; np != NULL && !got_int; np = np->wn_sibling) | 7128 for (np = node; np != NULL && !got_int; np = np->wn_sibling) |
6790 { | 7129 { |
6791 ++len; | 7130 ++len; |
6792 if ((child = np->wn_child) != NULL) | 7131 if ((child = np->wn_child) != NULL) |
6793 { | 7132 { |
6794 /* Compress the child. This fills hashkey. */ | 7133 /* Compress the child first. This fills hashkey. */ |
6795 compressed += node_compress(spin, child, ht, tot); | 7134 compressed += node_compress(spin, child, ht, tot); |
6796 | 7135 |
6797 /* Try to find an identical child. */ | 7136 /* Try to find an identical child. */ |
6798 hash = hash_hash(child->wn_u1.hashkey); | 7137 hash = hash_hash(child->wn_u1.hashkey); |
6799 hi = hash_lookup(ht, child->wn_u1.hashkey, hash); | 7138 hi = hash_lookup(ht, child->wn_u1.hashkey, hash); |
6800 tp = NULL; | |
6801 if (!HASHITEM_EMPTY(hi)) | 7139 if (!HASHITEM_EMPTY(hi)) |
6802 { | 7140 { |
6803 /* There are children with an identical hash value. Now check | 7141 /* There are children we encountered before with a hash value |
6804 * if there is one that is really identical. */ | 7142 * identical to the current child. Now check if there is one |
7143 * that is really identical. */ | |
6805 for (tp = HI2WN(hi); tp != NULL; tp = tp->wn_u2.next) | 7144 for (tp = HI2WN(hi); tp != NULL; tp = tp->wn_u2.next) |
6806 if (node_equal(child, tp)) | 7145 if (node_equal(child, tp)) |
6807 { | 7146 { |
6808 /* Found one! Now use that child in place of the | 7147 /* Found one! Now use that child in place of the |
6809 * current one. This means the current child and all | 7148 * current one. This means the current child and all |
6810 * its siblings are unlinked from the tree. */ | 7149 * its siblings are unlinked from the tree. */ |
6811 ++tp->wn_refs; | 7150 ++tp->wn_refs; |
6812 deref_wordnode(spin, child); | 7151 compressed += deref_wordnode(spin, child); |
6813 np->wn_child = tp; | 7152 np->wn_child = tp; |
6814 ++compressed; | |
6815 break; | 7153 break; |
6816 } | 7154 } |
6817 if (tp == NULL) | 7155 if (tp == NULL) |
6818 { | 7156 { |
6819 /* No other child with this hash value equals the child of | 7157 /* No other child with this hash value equals the child of |
6828 /* No other child has this hash value, add it to the | 7166 /* No other child has this hash value, add it to the |
6829 * hashtable. */ | 7167 * hashtable. */ |
6830 hash_add_item(ht, hi, child->wn_u1.hashkey, hash); | 7168 hash_add_item(ht, hi, child->wn_u1.hashkey, hash); |
6831 } | 7169 } |
6832 } | 7170 } |
6833 *tot += len; | 7171 *tot += len + 1; /* add one for the node that stores the length */ |
6834 | 7172 |
6835 /* | 7173 /* |
6836 * Make a hash key for the node and its siblings, so that we can quickly | 7174 * Make a hash key for the node and its siblings, so that we can quickly |
6837 * find a lookalike node. This must be done after compressing the sibling | 7175 * find a lookalike node. This must be done after compressing the sibling |
6838 * list, otherwise the hash key would become invalid by the compression. | 7176 * list, otherwise the hash key would become invalid by the compression. |
6902 { | 7240 { |
6903 int i; | 7241 int i; |
6904 | 7242 |
6905 for (i = len - 1; i >= 0; --i) | 7243 for (i = len - 1; i >= 0; --i) |
6906 putc((int)(nr >> (i * 8)), fd); | 7244 putc((int)(nr >> (i * 8)), fd); |
7245 } | |
7246 | |
7247 /* | |
7248 * Write spin->si_sugtime to file "fd". | |
7249 */ | |
7250 static void | |
7251 put_sugtime(spin, fd) | |
7252 spellinfo_T *spin; | |
7253 FILE *fd; | |
7254 { | |
7255 int c; | |
7256 int i; | |
7257 | |
7258 /* time_t can be up to 8 bytes in size, more than long_u, thus we | |
7259 * can't use put_bytes() here. */ | |
7260 for (i = 7; i >= 0; --i) | |
7261 if (i + 1 > sizeof(time_t)) | |
7262 /* ">>" doesn't work well when shifting more bits than avail */ | |
7263 putc(0, fd); | |
7264 else | |
7265 { | |
7266 c = (unsigned)spin->si_sugtime >> (i * 8); | |
7267 putc(c, fd); | |
7268 } | |
6907 } | 7269 } |
6908 | 7270 |
6909 static int | 7271 static int |
6910 #ifdef __BORLANDC__ | 7272 #ifdef __BORLANDC__ |
6911 _RTLENTRYF | 7273 _RTLENTRYF |
7054 | 7416 |
7055 write_spell_prefcond(fd, &spin->si_prefcond); | 7417 write_spell_prefcond(fd, &spin->si_prefcond); |
7056 } | 7418 } |
7057 | 7419 |
7058 /* SN_REP: <repcount> <rep> ... | 7420 /* SN_REP: <repcount> <rep> ... |
7059 * SN_SAL: <salflags> <salcount> <sal> ... */ | 7421 * SN_SAL: <salflags> <salcount> <sal> ... |
7060 | 7422 * SN_REPSAL: <repcount> <rep> ... */ |
7061 /* Sort the REP items. */ | 7423 |
7062 qsort(spin->si_rep.ga_data, (size_t)spin->si_rep.ga_len, | 7424 /* round 1: SN_REP section |
7425 * round 2: SN_SAL section (unless SN_SOFO is used) | |
7426 * round 3: SN_REPSAL section */ | |
7427 for (round = 1; round <= 3; ++round) | |
7428 { | |
7429 if (round == 1) | |
7430 gap = &spin->si_rep; | |
7431 else if (round == 2) | |
7432 { | |
7433 /* Don't write SN_SAL when using a SN_SOFO section */ | |
7434 if (spin->si_sofofr != NULL && spin->si_sofoto != NULL) | |
7435 continue; | |
7436 gap = &spin->si_sal; | |
7437 } | |
7438 else | |
7439 gap = &spin->si_repsal; | |
7440 | |
7441 /* Don't write the section if there are no items. */ | |
7442 if (gap->ga_len == 0) | |
7443 continue; | |
7444 | |
7445 /* Sort the REP/REPSAL items. */ | |
7446 if (round != 2) | |
7447 qsort(gap->ga_data, (size_t)gap->ga_len, | |
7063 sizeof(fromto_T), rep_compare); | 7448 sizeof(fromto_T), rep_compare); |
7064 | 7449 |
7065 /* round 1: SN_REP section | 7450 i = round == 1 ? SN_REP : (round == 2 ? SN_SAL : SN_REPSAL); |
7066 * round 2: SN_SAL section (unless SN_SOFO is used) */ | 7451 putc(i, fd); /* <sectionID> */ |
7067 for (round = 1; round <= 2; ++round) | |
7068 { | |
7069 if (round == 1) | |
7070 { | |
7071 gap = &spin->si_rep; | |
7072 putc(SN_REP, fd); /* <sectionID> */ | |
7073 } | |
7074 else | |
7075 { | |
7076 if (spin->si_sofofr != NULL && spin->si_sofoto != NULL) | |
7077 /* using SN_SOFO section instead of SN_SAL */ | |
7078 break; | |
7079 gap = &spin->si_sal; | |
7080 putc(SN_SAL, fd); /* <sectionID> */ | |
7081 } | |
7082 | 7452 |
7083 /* This is for making suggestions, section is not required. */ | 7453 /* This is for making suggestions, section is not required. */ |
7084 putc(0, fd); /* <sectionflags> */ | 7454 putc(0, fd); /* <sectionflags> */ |
7085 | 7455 |
7086 /* Compute the length of what follows. */ | 7456 /* Compute the length of what follows. */ |
7141 l = STRLEN(spin->si_sofoto); | 7511 l = STRLEN(spin->si_sofoto); |
7142 put_bytes(fd, (long_u)l, 2); /* <sofotolen> */ | 7512 put_bytes(fd, (long_u)l, 2); /* <sofotolen> */ |
7143 fwrite(spin->si_sofoto, l, (size_t)1, fd); /* <sofoto> */ | 7513 fwrite(spin->si_sofoto, l, (size_t)1, fd); /* <sofoto> */ |
7144 } | 7514 } |
7145 | 7515 |
7516 /* SN_WORDS: <word> ... | |
7517 * This is for making suggestions, section is not required. */ | |
7518 if (spin->si_commonwords.ht_used > 0) | |
7519 { | |
7520 putc(SN_WORDS, fd); /* <sectionID> */ | |
7521 putc(0, fd); /* <sectionflags> */ | |
7522 | |
7523 /* round 1: count the bytes | |
7524 * round 2: write the bytes */ | |
7525 for (round = 1; round <= 2; ++round) | |
7526 { | |
7527 int todo; | |
7528 int len = 0; | |
7529 hashitem_T *hi; | |
7530 | |
7531 todo = spin->si_commonwords.ht_used; | |
7532 for (hi = spin->si_commonwords.ht_array; todo > 0; ++hi) | |
7533 if (!HASHITEM_EMPTY(hi)) | |
7534 { | |
7535 l = STRLEN(hi->hi_key) + 1; | |
7536 len += l; | |
7537 if (round == 2) /* <word> */ | |
7538 fwrite(hi->hi_key, (size_t)l, (size_t)1, fd); | |
7539 --todo; | |
7540 } | |
7541 if (round == 1) | |
7542 put_bytes(fd, (long_u)len, 4); /* <sectionlen> */ | |
7543 } | |
7544 } | |
7545 | |
7146 /* SN_MAP: <mapstr> | 7546 /* SN_MAP: <mapstr> |
7147 * This is for making suggestions, section is not required. */ | 7547 * This is for making suggestions, section is not required. */ |
7148 if (spin->si_map.ga_len > 0) | 7548 if (spin->si_map.ga_len > 0) |
7149 { | 7549 { |
7150 putc(SN_MAP, fd); /* <sectionID> */ | 7550 putc(SN_MAP, fd); /* <sectionID> */ |
7151 putc(0, fd); /* <sectionflags> */ | 7551 putc(0, fd); /* <sectionflags> */ |
7152 l = spin->si_map.ga_len; | 7552 l = spin->si_map.ga_len; |
7153 put_bytes(fd, (long_u)l, 4); /* <sectionlen> */ | 7553 put_bytes(fd, (long_u)l, 4); /* <sectionlen> */ |
7154 fwrite(spin->si_map.ga_data, (size_t)l, (size_t)1, fd); | 7554 fwrite(spin->si_map.ga_data, (size_t)l, (size_t)1, fd); |
7155 /* <mapstr> */ | 7555 /* <mapstr> */ |
7556 } | |
7557 | |
7558 /* SN_SUGFILE: <timestamp> | |
7559 * This is used to notify that a .sug file may be available and at the | |
7560 * same time allows for checking that a .sug file that is found matches | |
7561 * with this .spl file. That's because the word numbers must be exactly | |
7562 * right. */ | |
7563 if (!spin->si_nosugfile | |
7564 && (spin->si_sal.ga_len > 0 | |
7565 || (spin->si_sofofr != NULL && spin->si_sofoto != NULL))) | |
7566 { | |
7567 putc(SN_SUGFILE, fd); /* <sectionID> */ | |
7568 putc(0, fd); /* <sectionflags> */ | |
7569 put_bytes(fd, (long_u)8, 4); /* <sectionlen> */ | |
7570 | |
7571 /* Set si_sugtime and write it to the file. */ | |
7572 spin->si_sugtime = time(NULL); | |
7573 put_sugtime(spin, fd); /* <timestamp> */ | |
7156 } | 7574 } |
7157 | 7575 |
7158 /* SN_COMPOUND: compound info. | 7576 /* SN_COMPOUND: compound info. |
7159 * We don't mark it required, when not supported all compound words will | 7577 * We don't mark it required, when not supported all compound words will |
7160 * be bad words. */ | 7578 * be bad words. */ |
7265 * Dump a word tree at node "node". | 7683 * Dump a word tree at node "node". |
7266 * | 7684 * |
7267 * This first writes the list of possible bytes (siblings). Then for each | 7685 * This first writes the list of possible bytes (siblings). Then for each |
7268 * byte recursively write the children. | 7686 * byte recursively write the children. |
7269 * | 7687 * |
7270 * NOTE: The code here must match the code in read_tree(), since assumptions | 7688 * NOTE: The code here must match the code in read_tree_node(), since |
7271 * are made about the indexes (so that we don't have to write them in the | 7689 * assumptions are made about the indexes (so that we don't have to write them |
7272 * file). | 7690 * in the file). |
7273 * | 7691 * |
7274 * Returns the number of nodes used. | 7692 * Returns the number of nodes used. |
7275 */ | 7693 */ |
7276 static int | 7694 static int |
7277 put_node(fd, node, index, regionmask, prefixtree) | 7695 put_node(fd, node, index, regionmask, prefixtree) |
7425 FreeWild(fcount, fnames); | 7843 FreeWild(fcount, fnames); |
7426 } | 7844 } |
7427 } | 7845 } |
7428 | 7846 |
7429 /* | 7847 /* |
7848 * Create the .sug file. | |
7849 * Uses the soundfold info in "spin". | |
7850 * Writes the file with the name "wfname", with ".spl" changed to ".sug". | |
7851 */ | |
7852 static void | |
7853 spell_make_sugfile(spin, wfname) | |
7854 spellinfo_T *spin; | |
7855 char_u *wfname; | |
7856 { | |
7857 char_u fname[MAXPATHL]; | |
7858 int len; | |
7859 slang_T *slang; | |
7860 int free_slang = FALSE; | |
7861 | |
7862 /* | |
7863 * Read back the .spl file that was written. This fills the required | |
7864 * info for soundfolding. This also uses less memory than the | |
7865 * pointer-linked version of the trie. And it avoids having two versions | |
7866 * of the code for the soundfolding stuff. | |
7867 * It might have been done already by spell_reload_one(). | |
7868 */ | |
7869 for (slang = first_lang; slang != NULL; slang = slang->sl_next) | |
7870 if (fullpathcmp(wfname, slang->sl_fname, FALSE) == FPC_SAME) | |
7871 break; | |
7872 if (slang == NULL) | |
7873 { | |
7874 spell_message(spin, (char_u *)_("Reading back spell file...")); | |
7875 slang = spell_load_file(wfname, NULL, NULL, FALSE); | |
7876 if (slang == NULL) | |
7877 return; | |
7878 /* don't want this language in the list */ | |
7879 if (first_lang == slang) | |
7880 first_lang = slang->sl_next; | |
7881 free_slang = TRUE; | |
7882 } | |
7883 | |
7884 /* | |
7885 * Clear the info in "spin" that is used. | |
7886 */ | |
7887 spin->si_blocks = NULL; | |
7888 spin->si_blocks_cnt = 0; | |
7889 spin->si_compress_cnt = 0; /* will stay at 0 all the time*/ | |
7890 spin->si_free_count = 0; | |
7891 spin->si_first_free = NULL; | |
7892 spin->si_foldwcount = 0; | |
7893 | |
7894 /* | |
7895 * Go through the trie of good words, soundfold each word and add it to | |
7896 * the soundfold trie. | |
7897 */ | |
7898 spell_message(spin, (char_u *)_("Performing soundfolding...")); | |
7899 if (sug_filltree(spin, slang) == FAIL) | |
7900 goto theend; | |
7901 | |
7902 /* | |
7903 * Create the table which links each soundfold word with a list of the | |
7904 * good words it may come from. Creates buffer "spin->si_spellbuf". | |
7905 * This also removes the wordnr from the NUL byte entries to make | |
7906 * compression possible. | |
7907 */ | |
7908 if (sug_maketable(spin) == FAIL) | |
7909 goto theend; | |
7910 | |
7911 smsg((char_u *)_("Number of words after soundfolding: %ld"), | |
7912 (long)spin->si_spellbuf->b_ml.ml_line_count); | |
7913 | |
7914 /* | |
7915 * Compress the soundfold trie. | |
7916 */ | |
7917 spell_message(spin, (char_u *)_(msg_compressing)); | |
7918 wordtree_compress(spin, spin->si_foldroot); | |
7919 | |
7920 /* | |
7921 * Write the .sug file. | |
7922 * Make the file name by changing ".spl" to ".sug". | |
7923 */ | |
7924 STRCPY(fname, wfname); | |
7925 len = STRLEN(fname); | |
7926 fname[len - 2] = 'u'; | |
7927 fname[len - 1] = 'g'; | |
7928 sug_write(spin, fname); | |
7929 | |
7930 theend: | |
7931 if (free_slang) | |
7932 slang_free(slang); | |
7933 free_blocks(spin->si_blocks); | |
7934 close_spellbuf(spin->si_spellbuf); | |
7935 } | |
7936 | |
7937 /* | |
7938 * Build the soundfold trie for language "slang". | |
7939 */ | |
7940 static int | |
7941 sug_filltree(spin, slang) | |
7942 spellinfo_T *spin; | |
7943 slang_T *slang; | |
7944 { | |
7945 char_u *byts; | |
7946 idx_T *idxs; | |
7947 int depth; | |
7948 idx_T arridx[MAXWLEN]; | |
7949 int curi[MAXWLEN]; | |
7950 char_u tword[MAXWLEN]; | |
7951 char_u tsalword[MAXWLEN]; | |
7952 int c; | |
7953 idx_T n; | |
7954 unsigned words_done = 0; | |
7955 int wordcount[MAXWLEN]; | |
7956 | |
7957 /* We use si_foldroot for the soundfolded trie. */ | |
7958 spin->si_foldroot = wordtree_alloc(spin); | |
7959 if (spin->si_foldroot == NULL) | |
7960 return FAIL; | |
7961 | |
7962 /* let tree_add_word() know we're adding to the soundfolded tree */ | |
7963 spin->si_sugtree = TRUE; | |
7964 | |
7965 /* | |
7966 * Go through the whole case-folded tree, soundfold each word and put it | |
7967 * in the trie. | |
7968 */ | |
7969 byts = slang->sl_fbyts; | |
7970 idxs = slang->sl_fidxs; | |
7971 | |
7972 arridx[0] = 0; | |
7973 curi[0] = 1; | |
7974 wordcount[0] = 0; | |
7975 | |
7976 depth = 0; | |
7977 while (depth >= 0 && !got_int) | |
7978 { | |
7979 if (curi[depth] > byts[arridx[depth]]) | |
7980 { | |
7981 /* Done all bytes at this node, go up one level. */ | |
7982 idxs[arridx[depth]] = wordcount[depth]; | |
7983 if (depth > 0) | |
7984 wordcount[depth - 1] += wordcount[depth]; | |
7985 | |
7986 --depth; | |
7987 line_breakcheck(); | |
7988 } | |
7989 else | |
7990 { | |
7991 | |
7992 /* Do one more byte at this node. */ | |
7993 n = arridx[depth] + curi[depth]; | |
7994 ++curi[depth]; | |
7995 | |
7996 c = byts[n]; | |
7997 if (c == 0) | |
7998 { | |
7999 /* Sound-fold the word. */ | |
8000 tword[depth] = NUL; | |
8001 spell_soundfold(slang, tword, TRUE, tsalword); | |
8002 | |
8003 /* We use the "flags" field for the MSB of the wordnr, | |
8004 * "region" for the LSB of the wordnr. */ | |
8005 if (tree_add_word(spin, tsalword, spin->si_foldroot, | |
8006 words_done >> 16, words_done & 0xffff, | |
8007 0) == FAIL) | |
8008 return FAIL; | |
8009 | |
8010 ++words_done; | |
8011 ++wordcount[depth]; | |
8012 | |
8013 /* Reset the block count each time to avoid compression | |
8014 * kicking in. */ | |
8015 spin->si_blocks_cnt = 0; | |
8016 | |
8017 /* Skip over any other NUL bytes (same word with different | |
8018 * flags). */ | |
8019 while (byts[n + 1] == 0) | |
8020 { | |
8021 ++n; | |
8022 ++curi[depth]; | |
8023 } | |
8024 } | |
8025 else | |
8026 { | |
8027 /* Normal char, go one level deeper. */ | |
8028 tword[depth++] = c; | |
8029 arridx[depth] = idxs[n]; | |
8030 curi[depth] = 1; | |
8031 wordcount[depth] = 0; | |
8032 } | |
8033 } | |
8034 } | |
8035 | |
8036 smsg((char_u *)_("Total number of words: %d"), words_done); | |
8037 | |
8038 return OK; | |
8039 } | |
8040 | |
8041 /* | |
8042 * Make the table that links each word in the soundfold trie to the words it | |
8043 * can be produced from. | |
8044 * This is not unlike lines in a file, thus use a memfile to be able to access | |
8045 * the table efficiently. | |
8046 * Returns FAIL when out of memory. | |
8047 */ | |
8048 static int | |
8049 sug_maketable(spin) | |
8050 spellinfo_T *spin; | |
8051 { | |
8052 garray_T ga; | |
8053 int res = OK; | |
8054 | |
8055 /* Allocate a buffer, open a memline for it and create the swap file | |
8056 * (uses a temp file, not a .swp file). */ | |
8057 spin->si_spellbuf = open_spellbuf(); | |
8058 if (spin->si_spellbuf == NULL) | |
8059 return FAIL; | |
8060 | |
8061 /* Use a buffer to store the line info, avoids allocating many small | |
8062 * pieces of memory. */ | |
8063 ga_init2(&ga, 1, 100); | |
8064 | |
8065 /* recursively go through the tree */ | |
8066 if (sug_filltable(spin, spin->si_foldroot->wn_sibling, 0, &ga) == -1) | |
8067 res = FAIL; | |
8068 | |
8069 ga_clear(&ga); | |
8070 return res; | |
8071 } | |
8072 | |
8073 /* | |
8074 * Fill the table for one node and its children. | |
8075 * Returns the wordnr to continue with after this node and its children. | |
8076 * Returns -1 when out of memory. | |
8077 */ | |
8078 static int | |
8079 sug_filltable(spin, node, startwordnr, gap) | |
8080 spellinfo_T *spin; | |
8081 wordnode_T *node; | |
8082 int startwordnr; | |
8083 garray_T *gap; /* place to store line of numbers */ | |
8084 { | |
8085 wordnode_T *p, *np; | |
8086 int wordnr = startwordnr; | |
8087 int nr; | |
8088 int prev_nr; | |
8089 | |
8090 for (p = node; p != NULL; p = p->wn_sibling) | |
8091 { | |
8092 if (p->wn_byte == NUL) | |
8093 { | |
8094 gap->ga_len = 0; | |
8095 prev_nr = 0; | |
8096 for (np = p; np != NULL && np->wn_byte == NUL; np = np->wn_sibling) | |
8097 { | |
8098 if (ga_grow(gap, 10) == FAIL) | |
8099 return -1; | |
8100 | |
8101 nr = (np->wn_flags << 16) + (np->wn_region & 0xffff); | |
8102 /* Compute the offset from the previous nr and store the | |
8103 * offset in a way that it takes a minimum number of bytes. | |
8104 * It's a bit like utf-8, but without the need to mark | |
8105 * following bytes. */ | |
8106 nr -= prev_nr; | |
8107 prev_nr += nr; | |
8108 gap->ga_len += offset2bytes(nr, | |
8109 (char_u *)gap->ga_data + gap->ga_len); | |
8110 } | |
8111 | |
8112 /* add the NUL byte */ | |
8113 ((char_u *)gap->ga_data)[gap->ga_len++] = NUL; | |
8114 | |
8115 if (ml_append_buf(spin->si_spellbuf, (linenr_T)wordnr, | |
8116 gap->ga_data, gap->ga_len, TRUE) == FAIL) | |
8117 return -1; | |
8118 ++wordnr; | |
8119 | |
8120 /* Remove extra NUL entries, we no longer need them. We don't | |
8121 * bother freeing the nodes, they won't be reused anyway. */ | |
8122 while (p->wn_sibling != NULL && p->wn_sibling->wn_byte == NUL) | |
8123 p->wn_sibling = p->wn_sibling->wn_sibling; | |
8124 | |
8125 /* Clear the flags on the remaining NUL node, so that compression | |
8126 * works a lot better. */ | |
8127 p->wn_flags = 0; | |
8128 p->wn_region = 0; | |
8129 } | |
8130 else | |
8131 { | |
8132 wordnr = sug_filltable(spin, p->wn_child, wordnr, gap); | |
8133 if (wordnr == -1) | |
8134 return -1; | |
8135 } | |
8136 } | |
8137 return wordnr; | |
8138 } | |
8139 | |
8140 /* | |
8141 * Convert an offset into a minimal number of bytes. | |
8142 * Similar to utf_char2bytes, but use 8 bits in followup bytes and avoid NUL | |
8143 * bytes. | |
8144 */ | |
8145 static int | |
8146 offset2bytes(nr, buf) | |
8147 int nr; | |
8148 char_u *buf; | |
8149 { | |
8150 int rem; | |
8151 int b1, b2, b3, b4; | |
8152 | |
8153 /* Split the number in parts of base 255. We need to avoid NUL bytes. */ | |
8154 b1 = nr % 255 + 1; | |
8155 rem = nr / 255; | |
8156 b2 = rem % 255 + 1; | |
8157 rem = rem / 255; | |
8158 b3 = rem % 255 + 1; | |
8159 b4 = rem / 255 + 1; | |
8160 | |
8161 if (b4 > 1 || b3 > 0x1f) /* 4 bytes */ | |
8162 { | |
8163 buf[0] = 0xe0 + b4; | |
8164 buf[1] = b3; | |
8165 buf[2] = b2; | |
8166 buf[3] = b1; | |
8167 return 4; | |
8168 } | |
8169 if (b3 > 1 || b2 > 0x3f ) /* 3 bytes */ | |
8170 { | |
8171 buf[0] = 0xc0 + b3; | |
8172 buf[1] = b2; | |
8173 buf[2] = b1; | |
8174 return 3; | |
8175 } | |
8176 if (b2 > 1 || b1 > 0x7f ) /* 2 bytes */ | |
8177 { | |
8178 buf[0] = 0x80 + b2; | |
8179 buf[1] = b1; | |
8180 return 2; | |
8181 } | |
8182 /* 1 byte */ | |
8183 buf[0] = b1; | |
8184 return 1; | |
8185 } | |
8186 | |
8187 /* | |
8188 * Opposite of offset2bytes(). | |
8189 * "pp" points to the bytes and is advanced over it. | |
8190 * Returns the offset. | |
8191 */ | |
8192 static int | |
8193 bytes2offset(pp) | |
8194 char_u **pp; | |
8195 { | |
8196 char_u *p = *pp; | |
8197 int nr; | |
8198 int c; | |
8199 | |
8200 c = *p++; | |
8201 if ((c & 0x80) == 0x00) /* 1 byte */ | |
8202 { | |
8203 nr = c - 1; | |
8204 } | |
8205 else if ((c & 0xc0) == 0x80) /* 2 bytes */ | |
8206 { | |
8207 nr = (c & 0x3f) - 1; | |
8208 nr = nr * 255 + (*p++ - 1); | |
8209 } | |
8210 else if ((c & 0xe0) == 0xc0) /* 3 bytes */ | |
8211 { | |
8212 nr = (c & 0x1f) - 1; | |
8213 nr = nr * 255 + (*p++ - 1); | |
8214 nr = nr * 255 + (*p++ - 1); | |
8215 } | |
8216 else /* 4 bytes */ | |
8217 { | |
8218 nr = (c & 0x0f) - 1; | |
8219 nr = nr * 255 + (*p++ - 1); | |
8220 nr = nr * 255 + (*p++ - 1); | |
8221 nr = nr * 255 + (*p++ - 1); | |
8222 } | |
8223 | |
8224 *pp = p; | |
8225 return nr; | |
8226 } | |
8227 | |
8228 /* | |
8229 * Write the .sug file in "fname". | |
8230 */ | |
8231 static void | |
8232 sug_write(spin, fname) | |
8233 spellinfo_T *spin; | |
8234 char_u *fname; | |
8235 { | |
8236 FILE *fd; | |
8237 wordnode_T *tree; | |
8238 int nodecount; | |
8239 int wcount; | |
8240 char_u *line; | |
8241 linenr_T lnum; | |
8242 int len; | |
8243 | |
8244 /* Create the file. Note that an existing file is silently overwritten! */ | |
8245 fd = mch_fopen((char *)fname, "w"); | |
8246 if (fd == NULL) | |
8247 { | |
8248 EMSG2(_(e_notopen), fname); | |
8249 return; | |
8250 } | |
8251 | |
8252 vim_snprintf((char *)IObuff, IOSIZE, | |
8253 _("Writing suggestion file %s ..."), fname); | |
8254 spell_message(spin, IObuff); | |
8255 | |
8256 /* | |
8257 * <SUGHEADER>: <fileID> <versionnr> <timestamp> | |
8258 */ | |
8259 if (fwrite(VIMSUGMAGIC, VIMSUGMAGICL, (size_t)1, fd) != 1) /* <fileID> */ | |
8260 { | |
8261 EMSG(_(e_write)); | |
8262 goto theend; | |
8263 } | |
8264 putc(VIMSUGVERSION, fd); /* <versionnr> */ | |
8265 | |
8266 /* Write si_sugtime to the file. */ | |
8267 put_sugtime(spin, fd); /* <timestamp> */ | |
8268 | |
8269 /* | |
8270 * <SUGWORDTREE> | |
8271 */ | |
8272 spin->si_memtot = 0; | |
8273 tree = spin->si_foldroot->wn_sibling; | |
8274 | |
8275 /* Clear the index and wnode fields in the tree. */ | |
8276 clear_node(tree); | |
8277 | |
8278 /* Count the number of nodes. Needed to be able to allocate the | |
8279 * memory when reading the nodes. Also fills in index for shared | |
8280 * nodes. */ | |
8281 nodecount = put_node(NULL, tree, 0, 0, FALSE); | |
8282 | |
8283 /* number of nodes in 4 bytes */ | |
8284 put_bytes(fd, (long_u)nodecount, 4); /* <nodecount> */ | |
8285 spin->si_memtot += nodecount + nodecount * sizeof(int); | |
8286 | |
8287 /* Write the nodes. */ | |
8288 (void)put_node(fd, tree, 0, 0, FALSE); | |
8289 | |
8290 /* | |
8291 * <SUGTABLE>: <sugwcount> <sugline> ... | |
8292 */ | |
8293 wcount = spin->si_spellbuf->b_ml.ml_line_count; | |
8294 put_bytes(fd, (long_u)wcount, 4); /* <sugwcount> */ | |
8295 | |
8296 for (lnum = 1; lnum <= (linenr_T)wcount; ++lnum) | |
8297 { | |
8298 /* <sugline>: <sugnr> ... NUL */ | |
8299 line = ml_get_buf(spin->si_spellbuf, lnum, FALSE); | |
8300 len = STRLEN(line) + 1; | |
8301 if (fwrite(line, (size_t)len, (size_t)1, fd) == 0) | |
8302 { | |
8303 EMSG(_(e_write)); | |
8304 goto theend; | |
8305 } | |
8306 spin->si_memtot += len; | |
8307 } | |
8308 | |
8309 /* Write another byte to check for errors. */ | |
8310 if (putc(0, fd) == EOF) | |
8311 EMSG(_(e_write)); | |
8312 | |
8313 vim_snprintf((char *)IObuff, IOSIZE, | |
8314 _("Estimated runtime memory use: %d bytes"), spin->si_memtot); | |
8315 spell_message(spin, IObuff); | |
8316 | |
8317 theend: | |
8318 /* close the file */ | |
8319 fclose(fd); | |
8320 } | |
8321 | |
8322 /* | |
8323 * Open a spell buffer. This is a nameless buffer that is not in the buffer | |
8324 * list and only contains text lines. Can use a swapfile to reduce memory | |
8325 * use. | |
8326 * Most other fields are invalid! Esp. watch out for string options being | |
8327 * NULL and there is no undo info. | |
8328 * Returns NULL when out of memory. | |
8329 */ | |
8330 static buf_T * | |
8331 open_spellbuf() | |
8332 { | |
8333 buf_T *buf; | |
8334 | |
8335 buf = (buf_T *)alloc_clear(sizeof(buf_T)); | |
8336 if (buf != NULL) | |
8337 { | |
8338 buf->b_spell = TRUE; | |
8339 buf->b_p_swf = TRUE; /* may create a swap file */ | |
8340 ml_open(buf); | |
8341 ml_open_file(buf); /* create swap file now */ | |
8342 } | |
8343 return buf; | |
8344 } | |
8345 | |
8346 /* | |
8347 * Close the buffer used for spell info. | |
8348 */ | |
8349 static void | |
8350 close_spellbuf(buf) | |
8351 buf_T *buf; | |
8352 { | |
8353 if (buf != NULL) | |
8354 { | |
8355 ml_close(buf, TRUE); | |
8356 vim_free(buf); | |
8357 } | |
8358 } | |
8359 | |
8360 | |
8361 /* | |
7430 * Create a Vim spell file from one or more word lists. | 8362 * Create a Vim spell file from one or more word lists. |
7431 * "fnames[0]" is the output file name. | 8363 * "fnames[0]" is the output file name. |
7432 * "fnames[fcount - 1]" is the last input file name. | 8364 * "fnames[fcount - 1]" is the last input file name. |
7433 * Exception: when "fnames[0]" ends in ".add" it's used as the input file name | 8365 * Exception: when "fnames[0]" ends in ".add" it's used as the input file name |
7434 * and ".spl" is appended to make the output file name. | 8366 * and ".spl" is appended to make the output file name. |
7456 spin.si_verbose = !added_word; | 8388 spin.si_verbose = !added_word; |
7457 spin.si_ascii = ascii; | 8389 spin.si_ascii = ascii; |
7458 spin.si_followup = TRUE; | 8390 spin.si_followup = TRUE; |
7459 spin.si_rem_accents = TRUE; | 8391 spin.si_rem_accents = TRUE; |
7460 ga_init2(&spin.si_rep, (int)sizeof(fromto_T), 20); | 8392 ga_init2(&spin.si_rep, (int)sizeof(fromto_T), 20); |
8393 ga_init2(&spin.si_repsal, (int)sizeof(fromto_T), 20); | |
7461 ga_init2(&spin.si_sal, (int)sizeof(fromto_T), 20); | 8394 ga_init2(&spin.si_sal, (int)sizeof(fromto_T), 20); |
7462 ga_init2(&spin.si_map, (int)sizeof(char_u), 100); | 8395 ga_init2(&spin.si_map, (int)sizeof(char_u), 100); |
7463 ga_init2(&spin.si_prefcond, (int)sizeof(char_u *), 50); | 8396 ga_init2(&spin.si_prefcond, (int)sizeof(char_u *), 50); |
8397 hash_init(&spin.si_commonwords); | |
7464 spin.si_newcompID = 127; /* start compound ID at first maximum */ | 8398 spin.si_newcompID = 127; /* start compound ID at first maximum */ |
7465 | 8399 |
7466 /* default: fnames[0] is output file, following are input files */ | 8400 /* default: fnames[0] is output file, following are input files */ |
7467 innames = &fnames[1]; | 8401 innames = &fnames[1]; |
7468 incount = fcount - 1; | 8402 incount = fcount - 1; |
7611 } | 8545 } |
7612 | 8546 |
7613 if (spin.si_compflags != NULL && spin.si_nobreak) | 8547 if (spin.si_compflags != NULL && spin.si_nobreak) |
7614 MSG(_("Warning: both compounding and NOBREAK specified")); | 8548 MSG(_("Warning: both compounding and NOBREAK specified")); |
7615 | 8549 |
7616 if (!error) | 8550 if (!error && !got_int) |
7617 { | 8551 { |
7618 /* | 8552 /* |
7619 * Combine tails in the tree. | 8553 * Combine tails in the tree. |
7620 */ | 8554 */ |
7621 if (spin.si_verbose || p_verbose > 2) | 8555 spell_message(&spin, (char_u *)_(msg_compressing)); |
7622 { | |
7623 if (!spin.si_verbose) | |
7624 verbose_enter(); | |
7625 MSG(_(msg_compressing)); | |
7626 out_flush(); | |
7627 if (!spin.si_verbose) | |
7628 verbose_leave(); | |
7629 } | |
7630 wordtree_compress(&spin, spin.si_foldroot); | 8556 wordtree_compress(&spin, spin.si_foldroot); |
7631 wordtree_compress(&spin, spin.si_keeproot); | 8557 wordtree_compress(&spin, spin.si_keeproot); |
7632 wordtree_compress(&spin, spin.si_prefroot); | 8558 wordtree_compress(&spin, spin.si_prefroot); |
7633 } | 8559 } |
7634 | 8560 |
7635 if (!error) | 8561 if (!error && !got_int) |
7636 { | 8562 { |
7637 /* | 8563 /* |
7638 * Write the info in the spell file. | 8564 * Write the info in the spell file. |
7639 */ | 8565 */ |
7640 if (spin.si_verbose || p_verbose > 2) | 8566 vim_snprintf((char *)IObuff, IOSIZE, |
7641 { | 8567 _("Writing spell file %s ..."), wfname); |
7642 if (!spin.si_verbose) | 8568 spell_message(&spin, IObuff); |
7643 verbose_enter(); | |
7644 smsg((char_u *)_("Writing spell file %s ..."), wfname); | |
7645 out_flush(); | |
7646 if (!spin.si_verbose) | |
7647 verbose_leave(); | |
7648 } | |
7649 | 8569 |
7650 error = write_vim_spell(&spin, wfname) == FAIL; | 8570 error = write_vim_spell(&spin, wfname) == FAIL; |
7651 | 8571 |
7652 if (spin.si_verbose || p_verbose > 2) | 8572 spell_message(&spin, (char_u *)_("Done!")); |
7653 { | 8573 vim_snprintf((char *)IObuff, IOSIZE, |
7654 if (!spin.si_verbose) | 8574 _("Estimated runtime memory use: %d bytes"), spin.si_memtot); |
7655 verbose_enter(); | 8575 spell_message(&spin, IObuff); |
7656 MSG(_("Done!")); | 8576 |
7657 smsg((char_u *)_("Estimated runtime memory use: %d bytes"), | 8577 /* |
7658 spin.si_memtot); | 8578 * If the file is loaded need to reload it. |
7659 out_flush(); | 8579 */ |
7660 if (!spin.si_verbose) | |
7661 verbose_leave(); | |
7662 } | |
7663 | |
7664 /* If the file is loaded need to reload it. */ | |
7665 if (!error) | 8580 if (!error) |
7666 spell_reload_one(wfname, added_word); | 8581 spell_reload_one(wfname, added_word); |
7667 } | 8582 } |
7668 | 8583 |
7669 /* Free the allocated memory. */ | 8584 /* Free the allocated memory. */ |
7670 ga_clear(&spin.si_rep); | 8585 ga_clear(&spin.si_rep); |
8586 ga_clear(&spin.si_repsal); | |
7671 ga_clear(&spin.si_sal); | 8587 ga_clear(&spin.si_sal); |
7672 ga_clear(&spin.si_map); | 8588 ga_clear(&spin.si_map); |
7673 ga_clear(&spin.si_prefcond); | 8589 ga_clear(&spin.si_prefcond); |
8590 hash_clear_all(&spin.si_commonwords, 0); | |
7674 | 8591 |
7675 /* Free the .aff file structures. */ | 8592 /* Free the .aff file structures. */ |
7676 for (i = 0; i < incount; ++i) | 8593 for (i = 0; i < incount; ++i) |
7677 if (afile[i] != NULL) | 8594 if (afile[i] != NULL) |
7678 spell_free_aff(afile[i]); | 8595 spell_free_aff(afile[i]); |
7679 | 8596 |
7680 /* Free all the bits and pieces at once. */ | 8597 /* Free all the bits and pieces at once. */ |
7681 free_blocks(spin.si_blocks); | 8598 free_blocks(spin.si_blocks); |
7682 } | 8599 |
7683 } | 8600 /* |
7684 | 8601 * If there is soundfolding info and no NOSUGFILE item create the |
8602 * .sug file with the soundfolded word trie. | |
8603 */ | |
8604 if (spin.si_sugtime != 0 && !error && !got_int) | |
8605 spell_make_sugfile(&spin, wfname); | |
8606 | |
8607 } | |
8608 } | |
8609 | |
8610 /* | |
8611 * Display a message for spell file processing when 'verbose' is set or using | |
8612 * ":mkspell". "str" can be IObuff. | |
8613 */ | |
8614 static void | |
8615 spell_message(spin, str) | |
8616 spellinfo_T *spin; | |
8617 char_u *str; | |
8618 { | |
8619 if (spin->si_verbose || p_verbose > 2) | |
8620 { | |
8621 if (!spin->si_verbose) | |
8622 verbose_enter(); | |
8623 MSG(str); | |
8624 out_flush(); | |
8625 if (!spin->si_verbose) | |
8626 verbose_leave(); | |
8627 } | |
8628 } | |
7685 | 8629 |
7686 /* | 8630 /* |
7687 * ":[count]spellgood {word}" | 8631 * ":[count]spellgood {word}" |
7688 * ":[count]spellwrong {word}" | 8632 * ":[count]spellwrong {word}" |
7689 */ | 8633 */ |
8332 } | 9276 } |
8333 | 9277 |
8334 return OK; | 9278 return OK; |
8335 } | 9279 } |
8336 | 9280 |
9281 /* values for sps_flags */ | |
8337 #define SPS_BEST 1 | 9282 #define SPS_BEST 1 |
8338 #define SPS_FAST 2 | 9283 #define SPS_FAST 2 |
8339 #define SPS_DOUBLE 4 | 9284 #define SPS_DOUBLE 4 |
8340 | 9285 |
8341 static int sps_flags = SPS_BEST; | 9286 static int sps_flags = SPS_BEST; /* flags from 'spellsuggest' */ |
8342 static int sps_limit = 9999; | 9287 static int sps_limit = 9999; /* max nr of suggestions given */ |
8343 | 9288 |
8344 /* | 9289 /* |
8345 * Check the 'spellsuggest' option. Return FAIL if it's wrong. | 9290 * Check the 'spellsuggest' option. Return FAIL if it's wrong. |
8346 * Sets "sps_flags" and "sps_limit". | 9291 * Sets "sps_flags" and "sps_limit". |
8347 */ | 9292 */ |
8459 if (sps_limit > (int)Rows - 2) | 9404 if (sps_limit > (int)Rows - 2) |
8460 limit = (int)Rows - 2; | 9405 limit = (int)Rows - 2; |
8461 else | 9406 else |
8462 limit = sps_limit; | 9407 limit = sps_limit; |
8463 spell_find_suggest(line + curwin->w_cursor.col, &sug, limit, | 9408 spell_find_suggest(line + curwin->w_cursor.col, &sug, limit, |
8464 TRUE, need_cap); | 9409 TRUE, need_cap, TRUE); |
8465 | 9410 |
8466 if (sug.su_ga.ga_len == 0) | 9411 if (sug.su_ga.ga_len == 0) |
8467 MSG(_("Sorry, no suggestions")); | 9412 MSG(_("Sorry, no suggestions")); |
8468 else if (count > 0) | 9413 else if (count > 0) |
8469 { | 9414 { |
8510 | 9455 |
8511 /* The suggested word may replace only part of the bad word, add | 9456 /* The suggested word may replace only part of the bad word, add |
8512 * the not replaced part. */ | 9457 * the not replaced part. */ |
8513 STRCPY(wcopy, stp->st_word); | 9458 STRCPY(wcopy, stp->st_word); |
8514 if (sug.su_badlen > stp->st_orglen) | 9459 if (sug.su_badlen > stp->st_orglen) |
8515 vim_strncpy(wcopy + STRLEN(wcopy), | 9460 vim_strncpy(wcopy + stp->st_wordlen, |
8516 sug.su_badptr + stp->st_orglen, | 9461 sug.su_badptr + stp->st_orglen, |
8517 sug.su_badlen - stp->st_orglen); | 9462 sug.su_badlen - stp->st_orglen); |
8518 vim_snprintf((char *)IObuff, IOSIZE, "%2d", i + 1); | 9463 vim_snprintf((char *)IObuff, IOSIZE, "%2d", i + 1); |
8519 #ifdef FEAT_RIGHTLEFT | 9464 #ifdef FEAT_RIGHTLEFT |
8520 if (cmdmsg_rl) | 9465 if (cmdmsg_rl) |
8584 repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen); | 9529 repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen); |
8585 repl_to = vim_strsave(stp->st_word); | 9530 repl_to = vim_strsave(stp->st_word); |
8586 } | 9531 } |
8587 | 9532 |
8588 /* Replace the word. */ | 9533 /* Replace the word. */ |
8589 p = alloc(STRLEN(line) - stp->st_orglen + STRLEN(stp->st_word) + 1); | 9534 p = alloc(STRLEN(line) - stp->st_orglen + stp->st_wordlen + 1); |
8590 if (p != NULL) | 9535 if (p != NULL) |
8591 { | 9536 { |
8592 c = sug.su_badptr - line; | 9537 c = sug.su_badptr - line; |
8593 mch_memmove(p, line, c); | 9538 mch_memmove(p, line, c); |
8594 STRCPY(p + c, stp->st_word); | 9539 STRCPY(p + c, stp->st_word); |
8599 | 9544 |
8600 /* For redo we use a change-word command. */ | 9545 /* For redo we use a change-word command. */ |
8601 ResetRedobuff(); | 9546 ResetRedobuff(); |
8602 AppendToRedobuff((char_u *)"ciw"); | 9547 AppendToRedobuff((char_u *)"ciw"); |
8603 AppendToRedobuffLit(p + c, | 9548 AppendToRedobuffLit(p + c, |
8604 STRLEN(stp->st_word) + sug.su_badlen - stp->st_orglen); | 9549 stp->st_wordlen + sug.su_badlen - stp->st_orglen); |
8605 AppendCharToRedobuff(ESC); | 9550 AppendCharToRedobuff(ESC); |
8606 } | 9551 } |
8607 } | 9552 } |
8608 else | 9553 else |
8609 curwin->w_cursor = prev_cursor; | 9554 curwin->w_cursor = prev_cursor; |
8757 /* | 9702 /* |
8758 * Find spell suggestions for "word". Return them in the growarray "*gap" as | 9703 * Find spell suggestions for "word". Return them in the growarray "*gap" as |
8759 * a list of allocated strings. | 9704 * a list of allocated strings. |
8760 */ | 9705 */ |
8761 void | 9706 void |
8762 spell_suggest_list(gap, word, maxcount, need_cap) | 9707 spell_suggest_list(gap, word, maxcount, need_cap, interactive) |
8763 garray_T *gap; | 9708 garray_T *gap; |
8764 char_u *word; | 9709 char_u *word; |
8765 int maxcount; /* maximum nr of suggestions */ | 9710 int maxcount; /* maximum nr of suggestions */ |
8766 int need_cap; /* 'spellcapcheck' matched */ | 9711 int need_cap; /* 'spellcapcheck' matched */ |
9712 int interactive; | |
8767 { | 9713 { |
8768 suginfo_T sug; | 9714 suginfo_T sug; |
8769 int i; | 9715 int i; |
8770 suggest_T *stp; | 9716 suggest_T *stp; |
8771 char_u *wcopy; | 9717 char_u *wcopy; |
8772 | 9718 |
8773 spell_find_suggest(word, &sug, maxcount, FALSE, need_cap); | 9719 spell_find_suggest(word, &sug, maxcount, FALSE, need_cap, interactive); |
8774 | 9720 |
8775 /* Make room in "gap". */ | 9721 /* Make room in "gap". */ |
8776 ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1); | 9722 ga_init2(gap, sizeof(char_u *), sug.su_ga.ga_len + 1); |
8777 if (ga_grow(gap, sug.su_ga.ga_len) == FAIL) | 9723 if (ga_grow(gap, sug.su_ga.ga_len) == FAIL) |
8778 return; | 9724 return; |
8781 { | 9727 { |
8782 stp = &SUG(sug.su_ga, i); | 9728 stp = &SUG(sug.su_ga, i); |
8783 | 9729 |
8784 /* The suggested word may replace only part of "word", add the not | 9730 /* The suggested word may replace only part of "word", add the not |
8785 * replaced part. */ | 9731 * replaced part. */ |
8786 wcopy = alloc(STRLEN(stp->st_word) | 9732 wcopy = alloc(stp->st_wordlen |
8787 + STRLEN(sug.su_badptr + stp->st_orglen) + 1); | 9733 + STRLEN(sug.su_badptr + stp->st_orglen) + 1); |
8788 if (wcopy == NULL) | 9734 if (wcopy == NULL) |
8789 break; | 9735 break; |
8790 STRCPY(wcopy, stp->st_word); | 9736 STRCPY(wcopy, stp->st_word); |
8791 STRCAT(wcopy, sug.su_badptr + stp->st_orglen); | 9737 STRCPY(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen); |
8792 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy; | 9738 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy; |
8793 } | 9739 } |
8794 | 9740 |
8795 spell_find_cleanup(&sug); | 9741 spell_find_cleanup(&sug); |
8796 } | 9742 } |
8801 * The maximum number of suggestions is "maxcount". | 9747 * The maximum number of suggestions is "maxcount". |
8802 * Note: does use info for the current window. | 9748 * Note: does use info for the current window. |
8803 * This is based on the mechanisms of Aspell, but completely reimplemented. | 9749 * This is based on the mechanisms of Aspell, but completely reimplemented. |
8804 */ | 9750 */ |
8805 static void | 9751 static void |
8806 spell_find_suggest(badptr, su, maxcount, banbadword, need_cap) | 9752 spell_find_suggest(badptr, su, maxcount, banbadword, need_cap, interactive) |
8807 char_u *badptr; | 9753 char_u *badptr; |
8808 suginfo_T *su; | 9754 suginfo_T *su; |
8809 int maxcount; | 9755 int maxcount; |
8810 int banbadword; /* don't include badword in suggestions */ | 9756 int banbadword; /* don't include badword in suggestions */ |
8811 int need_cap; /* word should start with capital */ | 9757 int need_cap; /* word should start with capital */ |
9758 int interactive; | |
8812 { | 9759 { |
8813 hlf_T attr = HLF_COUNT; | 9760 hlf_T attr = HLF_COUNT; |
8814 char_u buf[MAXPATHL]; | 9761 char_u buf[MAXPATHL]; |
8815 char_u *p; | 9762 char_u *p; |
8816 int do_combine = FALSE; | 9763 int do_combine = FALSE; |
8831 if (*badptr == NUL) | 9778 if (*badptr == NUL) |
8832 return; | 9779 return; |
8833 hash_init(&su->su_banned); | 9780 hash_init(&su->su_banned); |
8834 | 9781 |
8835 su->su_badptr = badptr; | 9782 su->su_badptr = badptr; |
8836 su->su_badlen = spell_check(curwin, su->su_badptr, &attr, NULL); | 9783 su->su_badlen = spell_check(curwin, su->su_badptr, &attr, NULL, FALSE); |
8837 su->su_maxcount = maxcount; | 9784 su->su_maxcount = maxcount; |
8838 su->su_maxscore = SCORE_MAXINIT; | 9785 su->su_maxscore = SCORE_MAXINIT; |
8839 | 9786 |
8840 if (su->su_badlen >= MAXWLEN) | 9787 if (su->su_badlen >= MAXWLEN) |
8841 su->su_badlen = MAXWLEN - 1; /* just in case */ | 9788 su->su_badlen = MAXWLEN - 1; /* just in case */ |
8874 c = PTR2CHAR(su->su_badptr); | 9821 c = PTR2CHAR(su->su_badptr); |
8875 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT) | 9822 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT) |
8876 { | 9823 { |
8877 make_case_word(su->su_badword, buf, WF_ONECAP); | 9824 make_case_word(su->su_badword, buf, WF_ONECAP); |
8878 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE, | 9825 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE, |
8879 0, TRUE, su->su_sallang); | 9826 0, TRUE, su->su_sallang, FALSE); |
8880 } | 9827 } |
8881 | 9828 |
8882 /* Ban the bad word itself. It may appear in another region. */ | 9829 /* Ban the bad word itself. It may appear in another region. */ |
8883 if (banbadword) | 9830 if (banbadword) |
8884 add_banned(su, su->su_badword); | 9831 add_banned(su, su->su_badword); |
8910 /* Use list of suggestions in a file. */ | 9857 /* Use list of suggestions in a file. */ |
8911 spell_suggest_file(su, buf + 5); | 9858 spell_suggest_file(su, buf + 5); |
8912 else | 9859 else |
8913 { | 9860 { |
8914 /* Use internal method. */ | 9861 /* Use internal method. */ |
8915 spell_suggest_intern(su); | 9862 spell_suggest_intern(su, interactive); |
8916 if (sps_flags & SPS_DOUBLE) | 9863 if (sps_flags & SPS_DOUBLE) |
8917 do_combine = TRUE; | 9864 do_combine = TRUE; |
8918 } | 9865 } |
8919 } | 9866 } |
8920 | 9867 |
8950 for (li = list->lv_first; li != NULL; li = li->li_next) | 9897 for (li = list->lv_first; li != NULL; li = li->li_next) |
8951 if (li->li_tv.v_type == VAR_LIST) | 9898 if (li->li_tv.v_type == VAR_LIST) |
8952 { | 9899 { |
8953 /* Get the word and the score from the items. */ | 9900 /* Get the word and the score from the items. */ |
8954 score = get_spellword(li->li_tv.vval.v_list, &p); | 9901 score = get_spellword(li->li_tv.vval.v_list, &p); |
8955 if (score >= 0) | 9902 if (score >= 0 && score <= su->su_maxscore) |
8956 add_suggestion(su, &su->su_ga, p, | 9903 add_suggestion(su, &su->su_ga, p, su->su_badlen, |
8957 su->su_badlen, score, 0, TRUE, su->su_sallang); | 9904 score, 0, TRUE, su->su_sallang, FALSE); |
8958 } | 9905 } |
8959 list_unref(list); | 9906 list_unref(list); |
8960 } | 9907 } |
8961 | 9908 |
8962 /* Sort the suggestions and truncate at "maxcount". */ | 9909 /* Remove bogus suggestions, sort and truncate at "maxcount". */ |
9910 check_suggestions(su, &su->su_ga); | |
8963 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); | 9911 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); |
8964 } | 9912 } |
8965 #endif | 9913 #endif |
8966 | 9914 |
8967 /* | 9915 /* |
9009 make_case_word(p, cword, su->su_badflags); | 9957 make_case_word(p, cword, su->su_badflags); |
9010 p = cword; | 9958 p = cword; |
9011 } | 9959 } |
9012 | 9960 |
9013 add_suggestion(su, &su->su_ga, p, su->su_badlen, | 9961 add_suggestion(su, &su->su_ga, p, su->su_badlen, |
9014 SCORE_FILE, 0, TRUE, su->su_sallang); | 9962 SCORE_FILE, 0, TRUE, su->su_sallang, FALSE); |
9015 } | 9963 } |
9016 } | 9964 } |
9017 | 9965 |
9018 fclose(fd); | 9966 fclose(fd); |
9019 | 9967 |
9020 /* Sort the suggestions and truncate at "maxcount". */ | 9968 /* Remove bogus suggestions, sort and truncate at "maxcount". */ |
9969 check_suggestions(su, &su->su_ga); | |
9021 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); | 9970 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); |
9022 } | 9971 } |
9023 | 9972 |
9024 /* | 9973 /* |
9025 * Find suggestions for the internal method indicated by "sps_flags". | 9974 * Find suggestions for the internal method indicated by "sps_flags". |
9026 */ | 9975 */ |
9027 static void | 9976 static void |
9028 spell_suggest_intern(su) | 9977 spell_suggest_intern(su, interactive) |
9029 suginfo_T *su; | 9978 suginfo_T *su; |
9030 { | 9979 int interactive; |
9980 { | |
9981 /* | |
9982 * Load the .sug file(s) that are available and not done yet. | |
9983 */ | |
9984 suggest_load_files(); | |
9985 | |
9031 /* | 9986 /* |
9032 * 1. Try special cases, such as repeating a word: "the the" -> "the". | 9987 * 1. Try special cases, such as repeating a word: "the the" -> "the". |
9033 * | 9988 * |
9034 * Set a maximum score to limit the combination of operations that is | 9989 * Set a maximum score to limit the combination of operations that is |
9035 * tried. | 9990 * tried. |
9046 if (sps_flags & SPS_DOUBLE) | 10001 if (sps_flags & SPS_DOUBLE) |
9047 score_comp_sal(su); | 10002 score_comp_sal(su); |
9048 | 10003 |
9049 /* | 10004 /* |
9050 * 3. Try finding sound-a-like words. | 10005 * 3. Try finding sound-a-like words. |
9051 * | |
9052 * Only do this when we don't have a lot of suggestions yet, because it's | |
9053 * very slow and often doesn't find new suggestions. | |
9054 */ | 10006 */ |
9055 if ((sps_flags & SPS_DOUBLE) | 10007 if ((sps_flags & SPS_FAST) == 0) |
9056 || (!(sps_flags & SPS_FAST) | 10008 { |
9057 && su->su_ga.ga_len < SUG_CLEAN_COUNT(su))) | 10009 if (sps_flags & SPS_BEST) |
9058 { | 10010 /* Adjust the word score for the suggestions found so far for how |
9059 /* Allow a higher score now. */ | 10011 * they sound. */ |
9060 su->su_maxscore = SCORE_MAXMAX; | 10012 rescore_suggestions(su); |
10013 | |
10014 /* | |
10015 * While going through the soundfold tree "su_maxscore" is the score | |
10016 * for the soundfold word, limits the changes that are being tried, | |
10017 * and "su_sfmaxscore" the rescored score, which is set by | |
10018 * cleanup_suggestions(). | |
10019 * First find words with a small edit distance, because this is much | |
10020 * faster and often already finds the top-N suggestions. If we didn't | |
10021 * find many suggestions try again with a higher edit distance. | |
10022 * "sl_sounddone" is used to avoid doing the same word twice. | |
10023 */ | |
10024 suggest_try_soundalike_prep(); | |
10025 su->su_maxscore = SCORE_SFMAX1; | |
10026 su->su_sfmaxscore = SCORE_MAXINIT * 3; | |
9061 suggest_try_soundalike(su); | 10027 suggest_try_soundalike(su); |
9062 } | 10028 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) |
9063 | 10029 { |
9064 /* When CTRL-C was hit while searching do show the results. */ | 10030 /* We didn't find enough matches, try again, allowing more |
10031 * changes to the soundfold word. */ | |
10032 su->su_maxscore = SCORE_SFMAX2; | |
10033 suggest_try_soundalike(su); | |
10034 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) | |
10035 { | |
10036 /* Still didn't find enough matches, try again, allowing even | |
10037 * more changes to the soundfold word. */ | |
10038 su->su_maxscore = SCORE_SFMAX3; | |
10039 suggest_try_soundalike(su); | |
10040 } | |
10041 } | |
10042 su->su_maxscore = su->su_sfmaxscore; | |
10043 suggest_try_soundalike_finish(); | |
10044 } | |
10045 | |
10046 /* When CTRL-C was hit while searching do show the results. Only clear | |
10047 * got_int when using a command, not for spellsuggest(). */ | |
9065 ui_breakcheck(); | 10048 ui_breakcheck(); |
9066 if (got_int) | 10049 if (interactive && got_int) |
9067 { | 10050 { |
9068 (void)vgetc(); | 10051 (void)vgetc(); |
9069 got_int = FALSE; | 10052 got_int = FALSE; |
9070 } | 10053 } |
9071 | 10054 |
9073 { | 10056 { |
9074 if (sps_flags & SPS_BEST) | 10057 if (sps_flags & SPS_BEST) |
9075 /* Adjust the word score for how it sounds. */ | 10058 /* Adjust the word score for how it sounds. */ |
9076 rescore_suggestions(su); | 10059 rescore_suggestions(su); |
9077 | 10060 |
9078 /* Sort the suggestions and truncate at "maxcount". */ | 10061 /* Remove bogus suggestions, sort and truncate at "maxcount". */ |
10062 check_suggestions(su, &su->su_ga); | |
9079 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); | 10063 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); |
10064 } | |
10065 } | |
10066 | |
10067 /* | |
10068 * Load the .sug files for languages that have one and weren't loaded yet. | |
10069 */ | |
10070 static void | |
10071 suggest_load_files() | |
10072 { | |
10073 langp_T *lp; | |
10074 int lpi; | |
10075 slang_T *slang; | |
10076 char_u *dotp; | |
10077 FILE *fd; | |
10078 char_u buf[MAXWLEN]; | |
10079 int i; | |
10080 time_t timestamp; | |
10081 int wcount; | |
10082 int wordnr; | |
10083 garray_T ga; | |
10084 int c; | |
10085 | |
10086 /* Do this for all languages that support sound folding. */ | |
10087 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi) | |
10088 { | |
10089 lp = LANGP_ENTRY(curbuf->b_langp, lpi); | |
10090 slang = lp->lp_slang; | |
10091 if (slang->sl_sugtime != 0 && !slang->sl_sugloaded) | |
10092 { | |
10093 /* Change ".spl" to ".sug" and open the file. When the file isn't | |
10094 * found silently skip it. Do set "sl_sugloaded" so that we | |
10095 * don't try again and again. */ | |
10096 slang->sl_sugloaded = TRUE; | |
10097 | |
10098 dotp = vim_strrchr(slang->sl_fname, '.'); | |
10099 if (dotp == NULL || fnamecmp(dotp, ".spl") != 0) | |
10100 continue; | |
10101 STRCPY(dotp, ".sug"); | |
10102 fd = fopen((char *)slang->sl_fname, "r"); | |
10103 if (fd == NULL) | |
10104 goto nextone; | |
10105 | |
10106 /* | |
10107 * <SUGHEADER>: <fileID> <versionnr> <timestamp> | |
10108 */ | |
10109 for (i = 0; i < VIMSUGMAGICL; ++i) | |
10110 buf[i] = getc(fd); /* <fileID> */ | |
10111 if (STRNCMP(buf, VIMSUGMAGIC, VIMSUGMAGICL) != 0) | |
10112 { | |
10113 EMSG2(_("E999: This does not look like a .sug file: %s"), | |
10114 slang->sl_fname); | |
10115 goto nextone; | |
10116 } | |
10117 c = getc(fd); /* <versionnr> */ | |
10118 if (c < VIMSUGVERSION) | |
10119 { | |
10120 EMSG2(_("E999: Old .sug file, needs to be updated: %s"), | |
10121 slang->sl_fname); | |
10122 goto nextone; | |
10123 } | |
10124 else if (c > VIMSUGVERSION) | |
10125 { | |
10126 EMSG2(_("E999: .sug file is for newer version of Vim: %s"), | |
10127 slang->sl_fname); | |
10128 goto nextone; | |
10129 } | |
10130 | |
10131 /* Check the timestamp, it must be exactly the same as the one in | |
10132 * the .spl file. Otherwise the word numbers won't match. */ | |
10133 timestamp = 0; | |
10134 for (i = 7; i >= 0; --i) /* <timestamp> */ | |
10135 timestamp += getc(fd) << (i * 8); | |
10136 if (timestamp != slang->sl_sugtime) | |
10137 { | |
10138 EMSG2(_("E999: .sug file doesn't match .spl file: %s"), | |
10139 slang->sl_fname); | |
10140 goto nextone; | |
10141 } | |
10142 | |
10143 /* | |
10144 * <SUGWORDTREE>: <wordtree> | |
10145 * Read the trie with the soundfolded words. | |
10146 */ | |
10147 if (spell_read_tree(fd, &slang->sl_sbyts, &slang->sl_sidxs, | |
10148 FALSE, 0) != 0) | |
10149 { | |
10150 someerror: | |
10151 EMSG2(_("E999: error while reading .sug file: %s"), | |
10152 slang->sl_fname); | |
10153 slang_clear_sug(slang); | |
10154 goto nextone; | |
10155 } | |
10156 | |
10157 /* | |
10158 * <SUGTABLE>: <sugwcount> <sugline> ... | |
10159 * | |
10160 * Read the table with word numbers. We use a file buffer for | |
10161 * this, because it's so much like a file with lines. Makes it | |
10162 * possible to swap the info and save on memory use. | |
10163 */ | |
10164 slang->sl_sugbuf = open_spellbuf(); | |
10165 if (slang->sl_sugbuf == NULL) | |
10166 goto someerror; | |
10167 /* <sugwcount> */ | |
10168 wcount = (getc(fd) << 24) + (getc(fd) << 16) + (getc(fd) << 8) | |
10169 + getc(fd); | |
10170 if (wcount < 0) | |
10171 goto someerror; | |
10172 | |
10173 /* Read all the wordnr lists into the buffer, one NUL terminated | |
10174 * list per line. */ | |
10175 ga_init2(&ga, 1, 100); | |
10176 for (wordnr = 0; wordnr < wcount; ++wordnr) | |
10177 { | |
10178 ga.ga_len = 0; | |
10179 for (;;) | |
10180 { | |
10181 c = getc(fd); /* <sugline> */ | |
10182 if (c < 0 || ga_grow(&ga, 1) == FAIL) | |
10183 goto someerror; | |
10184 ((char_u *)ga.ga_data)[ga.ga_len++] = c; | |
10185 if (c == NUL) | |
10186 break; | |
10187 } | |
10188 if (ml_append_buf(slang->sl_sugbuf, (linenr_T)wordnr, | |
10189 ga.ga_data, ga.ga_len, TRUE) == FAIL) | |
10190 goto someerror; | |
10191 } | |
10192 ga_clear(&ga); | |
10193 | |
10194 /* | |
10195 * Need to put word counts in the word tries, so that we can find | |
10196 * a word by its number. | |
10197 */ | |
10198 tree_count_words(slang->sl_fbyts, slang->sl_fidxs); | |
10199 tree_count_words(slang->sl_sbyts, slang->sl_sidxs); | |
10200 | |
10201 nextone: | |
10202 if (fd != NULL) | |
10203 fclose(fd); | |
10204 STRCPY(dotp, ".spl"); | |
10205 } | |
10206 } | |
10207 } | |
10208 | |
10209 | |
10210 /* | |
10211 * Fill in the wordcount fields for a trie. | |
10212 * Stores in idxs[] the number of words under each node; returns nothing. | |
10213 */ | |
10214 static void | |
10215 tree_count_words(byts, idxs) | |
10216 char_u *byts; | |
10217 idx_T *idxs; | |
10218 { | |
10219 int depth; | |
10220 idx_T arridx[MAXWLEN]; | |
10221 int curi[MAXWLEN]; | |
10222 int c; | |
10223 idx_T n; | |
10224 int wordcount[MAXWLEN]; | |
10225 | |
10226 arridx[0] = 0; | |
10227 curi[0] = 1; | |
10228 wordcount[0] = 0; | |
10229 depth = 0; | |
10230 while (depth >= 0 && !got_int) | |
10231 { | |
10232 if (curi[depth] > byts[arridx[depth]]) | |
10233 { | |
10234 /* Done all bytes at this node, go up one level. */ | |
10235 idxs[arridx[depth]] = wordcount[depth]; | |
10236 if (depth > 0) | |
10237 wordcount[depth - 1] += wordcount[depth]; | |
10238 | |
10239 --depth; | |
10240 fast_breakcheck(); | |
10241 } | |
10242 else | |
10243 { | |
10244 /* Do one more byte at this node. */ | |
10245 n = arridx[depth] + curi[depth]; | |
10246 ++curi[depth]; | |
10247 | |
10248 c = byts[n]; | |
10249 if (c == 0) | |
10250 { | |
10251 /* End of word, count it. */ | |
10252 ++wordcount[depth]; | |
10253 | |
10254 /* Skip over any other NUL bytes (same word with different | |
10255 * flags). */ | |
10256 while (byts[n + 1] == 0) | |
10257 { | |
10258 ++n; | |
10259 ++curi[depth]; | |
10260 } | |
10261 } | |
10262 else | |
10263 { | |
10264 /* Normal char, go one level deeper to count the words. */ | |
10265 ++depth; | |
10266 arridx[depth] = idxs[n]; | |
10267 curi[depth] = 1; | |
10268 wordcount[depth] = 0; | |
10269 } | |
10270 } | |
9080 } | 10271 } |
9081 } | 10272 } |
9082 | 10273 |
9083 /* | 10274 /* |
9084 * Free the info put in "*su" by spell_find_suggest(). | 10275 * Free the info put in "*su" by spell_find_suggest(). |
9096 for (i = 0; i < su->su_sga.ga_len; ++i) | 10287 for (i = 0; i < su->su_sga.ga_len; ++i) |
9097 vim_free(SUG(su->su_sga, i).st_word); | 10288 vim_free(SUG(su->su_sga, i).st_word); |
9098 ga_clear(&su->su_sga); | 10289 ga_clear(&su->su_sga); |
9099 | 10290 |
9100 /* Free the banned words. */ | 10291 /* Free the banned words. */ |
9101 free_banned(su); | 10292 hash_clear_all(&su->su_banned, 0); |
9102 } | 10293 } |
9103 | 10294 |
9104 /* | 10295 /* |
9105 * Make a copy of "word", with the first letter upper or lower cased, to | 10296 * Make a copy of "word", with the first letter upper or lower cased, to |
9106 * "wcopy[MAXWLEN]". "word" must not be empty. | 10297 * "wcopy[MAXWLEN]". "word" must not be empty. |
9222 su->su_fbadword[len] = c; | 10413 su->su_fbadword[len] = c; |
9223 | 10414 |
9224 /* Give a soundalike score of 0, compute the score as if deleting one | 10415 /* Give a soundalike score of 0, compute the score as if deleting one |
9225 * character. */ | 10416 * character. */ |
9226 add_suggestion(su, &su->su_ga, word, su->su_badlen, | 10417 add_suggestion(su, &su->su_ga, word, su->su_badlen, |
9227 RESCORE(SCORE_REP, 0), 0, TRUE, su->su_sallang); | 10418 RESCORE(SCORE_REP, 0), 0, TRUE, su->su_sallang, FALSE); |
9228 } | 10419 } |
9229 } | 10420 } |
10421 | |
10422 /* | |
10423 * Try finding suggestions by adding/removing/swapping letters. | |
10424 */ | |
10425 static void | |
10426 suggest_try_change(su) | |
10427 suginfo_T *su; | |
10428 { | |
10429 char_u fword[MAXWLEN]; /* copy of the bad word, case-folded */ | |
10430 int n; | |
10431 char_u *p; | |
10432 int lpi; | |
10433 langp_T *lp; | |
10434 | |
10435 /* We make a copy of the case-folded bad word, so that we can modify it | |
10436 * to find matches (esp. REP items). Append some more text, changing | |
10437 * chars after the bad word may help. */ | |
10438 STRCPY(fword, su->su_fbadword); | |
10439 n = STRLEN(fword); | |
10440 p = su->su_badptr + su->su_badlen; | |
10441 (void)spell_casefold(p, STRLEN(p), fword + n, MAXWLEN - n); | |
10442 | |
10443 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi) | |
10444 { | |
10445 lp = LANGP_ENTRY(curbuf->b_langp, lpi); | |
10446 | |
10447 /* If reloading a spell file fails it's still in the list but | |
10448 * everything has been cleared. */ | |
10449 if (lp->lp_slang->sl_fbyts == NULL) | |
10450 continue; | |
10451 | |
10452 /* Try it for this language. Will add possible suggestions. */ | |
10453 suggest_trie_walk(su, lp, fword, FALSE); | |
10454 } | |
10455 } | |
10456 | |
10457 /* Check the maximum score, if we go over it we won't try this change. */ | |
10458 #define TRY_DEEPER(su, stack, depth, add) \ | |
10459 (stack[depth].ts_score + (add) < su->su_maxscore) | |
9230 | 10460 |
9231 /* | 10461 /* |
9232 * Try finding suggestions by adding/removing/swapping letters. | 10462 * Try finding suggestions by adding/removing/swapping letters. |
9233 * | 10463 * |
9234 * This uses a state machine. At each node in the tree we try various | 10464 * This uses a state machine. At each node in the tree we try various |
9235 * operations. When trying if an operation work "depth" is increased and the | 10465 * operations. When trying if an operation works "depth" is increased and the |
9236 * stack[] is used to store info. This allows combinations, thus insert one | 10466 * stack[] is used to store info. This allows combinations, thus insert one |
9237 * character, replace one and delete another. The number of changes is | 10467 * character, replace one and delete another. The number of changes is |
9238 * limited by su->su_maxscore, checked in try_deeper(). | 10468 * limited by su->su_maxscore. |
9239 * | 10469 * |
9240 * After implementing this I noticed an article by Kemal Oflazer that | 10470 * After implementing this I noticed an article by Kemal Oflazer that |
9241 * describes something similar: "Error-tolerant Finite State Recognition with | 10471 * describes something similar: "Error-tolerant Finite State Recognition with |
9242 * Applications to Morphological Analysis and Spelling Correction" (1996). | 10472 * Applications to Morphological Analysis and Spelling Correction" (1996). |
9243 * The implementation in the article is simplified and requires a stack of | 10473 * The implementation in the article is simplified and requires a stack of |
9244 * unknown depth. The implementation here only needs a stack depth of the | 10474 * unknown depth. The implementation here only needs a stack depth equal to |
9245 * length of the word. | 10475 * the length of the word. |
10476 * | |
10477 * This is also used for the sound-folded word, "soundfold" is TRUE then. | |
10478 * The mechanism is the same, but we find a match with a sound-folded word | |
10479 * that comes from one or more original words. Each of these words may be | |
10480 * added, this is done by add_sound_suggest(). | |
10481 * Don't use: | |
10482 * the prefix tree or the keep-case tree | |
10483 * "su->su_badlen" | |
10484 * anything to do with upper and lower case | |
10485 * anything to do with word or non-word characters ("spell_iswordp()") | |
10486 * banned words | |
10487 * word flags (rare, region, compounding) | |
10488 * word splitting for now | |
10489 * "similar_chars()" | |
10490 * use "slang->sl_repsal" instead of "lp->lp_replang->sl_rep" | |
9246 */ | 10491 */ |
9247 static void | 10492 static void |
9248 suggest_try_change(su) | 10493 suggest_trie_walk(su, lp, fword, soundfold) |
9249 suginfo_T *su; | 10494 suginfo_T *su; |
9250 { | 10495 langp_T *lp; |
9251 char_u fword[MAXWLEN]; /* copy of the bad word, case-folded */ | 10496 char_u *fword; |
10497 int soundfold; | |
10498 { | |
9252 char_u tword[MAXWLEN]; /* good word collected so far */ | 10499 char_u tword[MAXWLEN]; /* good word collected so far */ |
9253 trystate_T stack[MAXWLEN]; | 10500 trystate_T stack[MAXWLEN]; |
9254 char_u preword[MAXWLEN * 3]; /* word found with proper case; | 10501 char_u preword[MAXWLEN * 3]; /* word found with proper case; |
9255 * concatenation of prefix compound | 10502 * concatenation of prefix compound |
9256 * words and split word. NUL terminated | 10503 * words and split word. NUL terminated |
9257 * when going deeper but not when coming | 10504 * when going deeper but not when coming |
9258 * back. */ | 10505 * back. */ |
9259 char_u compflags[MAXWLEN]; /* compound flags, one for each word */ | 10506 char_u compflags[MAXWLEN]; /* compound flags, one for each word */ |
9260 trystate_T *sp; | 10507 trystate_T *sp; |
9261 int newscore; | 10508 int newscore; |
9262 langp_T *lp; | 10509 int score; |
9263 char_u *byts, *fbyts, *pbyts; | 10510 char_u *byts, *fbyts, *pbyts; |
9264 idx_T *idxs, *fidxs, *pidxs; | 10511 idx_T *idxs, *fidxs, *pidxs; |
9265 int depth; | 10512 int depth; |
9266 int c, c2, c3; | 10513 int c, c2, c3; |
9267 int n; | 10514 int n = 0; |
9268 int flags; | 10515 int flags; |
9269 garray_T *gap; | 10516 garray_T *gap; |
9270 idx_T arridx; | 10517 idx_T arridx; |
9271 int len; | 10518 int len; |
9272 char_u *p; | 10519 char_u *p; |
9273 fromto_T *ftp; | 10520 fromto_T *ftp; |
9274 int fl = 0, tl; | 10521 int fl = 0, tl; |
9275 int repextra = 0; /* extra bytes in fword[] from REP item */ | 10522 int repextra = 0; /* extra bytes in fword[] from REP item */ |
9276 slang_T *slang; | 10523 slang_T *slang = lp->lp_slang; |
9277 int fword_ends; | 10524 int fword_ends; |
9278 int lpi; | |
9279 int maysplit; | |
9280 int goodword_ends; | 10525 int goodword_ends; |
9281 | 10526 #ifdef DEBUG_TRIEWALK |
9282 /* We make a copy of the case-folded bad word, so that we can modify it | 10527 /* Stores the name of the change made at each level. */ |
9283 * to find matches (esp. REP items). Append some more text, changing | 10528 char_u changename[MAXWLEN][80]; |
9284 * chars after the bad word may help. */ | 10529 #endif |
9285 STRCPY(fword, su->su_fbadword); | 10530 int breakcheckcount = 1000; |
9286 n = STRLEN(fword); | 10531 int compound_ok; |
9287 p = su->su_badptr + su->su_badlen; | 10532 |
9288 (void)spell_casefold(p, STRLEN(p), fword + n, MAXWLEN - n); | 10533 /* |
9289 | 10534 * Go through the whole case-fold tree, try changes at each node. |
9290 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi) | 10535 * "tword[]" contains the word collected from nodes in the tree. |
9291 { | 10536 * "fword[]" the word we are trying to match with (initially the bad |
9292 lp = LANGP_ENTRY(curbuf->b_langp, lpi); | 10537 * word). |
9293 slang = lp->lp_slang; | 10538 */ |
9294 | 10539 depth = 0; |
9295 /* If reloading a spell file fails it's still in the list but | 10540 sp = &stack[0]; |
9296 * everything has been cleared. */ | 10541 vim_memset(sp, 0, sizeof(trystate_T)); |
9297 if (slang->sl_fbyts == NULL) | 10542 sp->ts_curi = 1; |
9298 continue; | 10543 |
9299 | 10544 if (soundfold) |
9300 /* | 10545 { |
9301 * Go through the whole case-fold tree, try changes at each node. | 10546 /* Going through the soundfold tree. */ |
9302 * "tword[]" contains the word collected from nodes in the tree. | 10547 byts = fbyts = slang->sl_sbyts; |
9303 * "fword[]" the word we are trying to match with (initially the bad | 10548 idxs = fidxs = slang->sl_sidxs; |
9304 * word). | 10549 pbyts = NULL; |
9305 */ | 10550 pidxs = NULL; |
9306 depth = 0; | 10551 sp->ts_prefixdepth = PFD_NOPREFIX; |
9307 sp = &stack[0]; | 10552 sp->ts_state = STATE_START; |
9308 vim_memset(sp, 0, sizeof(trystate_T)); | 10553 } |
9309 sp->ts_curi = 1; | 10554 else |
9310 | 10555 { |
9311 /* | 10556 /* |
9312 * When there are postponed prefixes we need to use these first. At | 10557 * When there are postponed prefixes we need to use these first. At |
9313 * the end of the prefix we continue in the case-fold tree. | 10558 * the end of the prefix we continue in the case-fold tree. |
9314 */ | 10559 */ |
9315 fbyts = slang->sl_fbyts; | 10560 fbyts = slang->sl_fbyts; |
9328 byts = fbyts; | 10573 byts = fbyts; |
9329 idxs = fidxs; | 10574 idxs = fidxs; |
9330 sp->ts_prefixdepth = PFD_NOPREFIX; | 10575 sp->ts_prefixdepth = PFD_NOPREFIX; |
9331 sp->ts_state = STATE_START; | 10576 sp->ts_state = STATE_START; |
9332 } | 10577 } |
9333 | 10578 } |
9334 /* | 10579 |
9335 * Loop to find all suggestions. At each round we either: | 10580 /* |
9336 * - For the current state try one operation, advance "ts_curi", | 10581 * Loop to find all suggestions. At each round we either: |
9337 * increase "depth". | 10582 * - For the current state try one operation, advance "ts_curi", |
9338 * - When a state is done go to the next, set "ts_state". | 10583 * increase "depth". |
9339 * - When all states are tried decrease "depth". | 10584 * - When a state is done go to the next, set "ts_state". |
9340 */ | 10585 * - When all states are tried decrease "depth". |
9341 while (depth >= 0 && !got_int) | 10586 */ |
9342 { | 10587 while (depth >= 0 && !got_int) |
9343 sp = &stack[depth]; | 10588 { |
9344 switch (sp->ts_state) | 10589 sp = &stack[depth]; |
9345 { | 10590 switch (sp->ts_state) |
9346 case STATE_START: | 10591 { |
9347 case STATE_NOPREFIX: | 10592 case STATE_START: |
9348 /* | 10593 case STATE_NOPREFIX: |
9349 * Start of node: Deal with NUL bytes, which means | 10594 /* |
9350 * tword[] may end here. | 10595 * Start of node: Deal with NUL bytes, which means |
9351 */ | 10596 * tword[] may end here. |
9352 arridx = sp->ts_arridx; /* current node in the tree */ | 10597 */ |
9353 len = byts[arridx]; /* bytes in this node */ | 10598 arridx = sp->ts_arridx; /* current node in the tree */ |
9354 arridx += sp->ts_curi; /* index of current byte */ | 10599 len = byts[arridx]; /* bytes in this node */ |
9355 | 10600 arridx += sp->ts_curi; /* index of current byte */ |
9356 if (sp->ts_prefixdepth == PFD_PREFIXTREE) | 10601 |
10602 if (sp->ts_prefixdepth == PFD_PREFIXTREE) | |
10603 { | |
10604 /* Skip over the NUL bytes, we use them later. */ | |
10605 for (n = 0; n < len && byts[arridx + n] == 0; ++n) | |
10606 ; | |
10607 sp->ts_curi += n; | |
10608 | |
10609 /* Always past NUL bytes now. */ | |
10610 n = (int)sp->ts_state; | |
10611 sp->ts_state = STATE_ENDNUL; | |
10612 sp->ts_save_badflags = su->su_badflags; | |
10613 | |
10614 /* At end of a prefix or at start of prefixtree: check for | |
10615 * following word. */ | |
10616 if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX) | |
9357 { | 10617 { |
9358 /* Skip over the NUL bytes, we use them later. */ | 10618 /* Set su->su_badflags to the caps type at this position. |
9359 for (n = 0; n < len && byts[arridx + n] == 0; ++n) | 10619 * Use the caps type until here for the prefix itself. */ |
9360 ; | 10620 #ifdef FEAT_MBYTE |
9361 sp->ts_curi += n; | 10621 if (has_mbyte) |
9362 | 10622 n = nofold_len(fword, sp->ts_fidx, su->su_badptr); |
9363 /* Always past NUL bytes now. */ | 10623 else |
9364 n = (int)sp->ts_state; | 10624 #endif |
9365 sp->ts_state = STATE_ENDNUL; | 10625 n = sp->ts_fidx; |
9366 sp->ts_save_badflags = su->su_badflags; | 10626 flags = badword_captype(su->su_badptr, su->su_badptr + n); |
9367 | 10627 su->su_badflags = badword_captype(su->su_badptr + n, |
9368 /* At end of a prefix or at start of prefixtree: check for | 10628 su->su_badptr + su->su_badlen); |
9369 * following word. */ | 10629 #ifdef DEBUG_TRIEWALK |
9370 if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX) | 10630 sprintf(changename[depth], "prefix"); |
10631 #endif | |
10632 go_deeper(stack, depth, 0); | |
10633 ++depth; | |
10634 sp = &stack[depth]; | |
10635 sp->ts_prefixdepth = depth - 1; | |
10636 byts = fbyts; | |
10637 idxs = fidxs; | |
10638 sp->ts_arridx = 0; | |
10639 | |
10640 /* Move the prefix to preword[] with the right case | |
10641 * and make find_keepcap_word() work. */ | |
10642 tword[sp->ts_twordlen] = NUL; | |
10643 make_case_word(tword + sp->ts_splitoff, | |
10644 preword + sp->ts_prewordlen, flags); | |
10645 sp->ts_prewordlen = STRLEN(preword); | |
10646 sp->ts_splitoff = sp->ts_twordlen; | |
10647 } | |
10648 break; | |
10649 } | |
10650 | |
10651 if (sp->ts_curi > len || byts[arridx] != 0) | |
10652 { | |
10653 /* Past bytes in node and/or past NUL bytes. */ | |
10654 sp->ts_state = STATE_ENDNUL; | |
10655 sp->ts_save_badflags = su->su_badflags; | |
10656 break; | |
10657 } | |
10658 | |
10659 /* | |
10660 * End of word in tree. | |
10661 */ | |
10662 ++sp->ts_curi; /* eat one NUL byte */ | |
10663 | |
10664 flags = (int)idxs[arridx]; | |
10665 fword_ends = (fword[sp->ts_fidx] == NUL | |
10666 || (soundfold | |
10667 ? vim_iswhite(fword[sp->ts_fidx]) | |
10668 : !spell_iswordp(fword + sp->ts_fidx, curbuf))); | |
10669 tword[sp->ts_twordlen] = NUL; | |
10670 | |
10671 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL | |
10672 && (sp->ts_flags & TSF_PREFIXOK) == 0) | |
10673 { | |
10674 /* There was a prefix before the word. Check that the prefix | |
10675 * can be used with this word. */ | |
10676 /* Count the length of the NULs in the prefix. If there are | |
10677 * none this must be the first try without a prefix. */ | |
10678 n = stack[sp->ts_prefixdepth].ts_arridx; | |
10679 len = pbyts[n++]; | |
10680 for (c = 0; c < len && pbyts[n + c] == 0; ++c) | |
10681 ; | |
10682 if (c > 0) | |
10683 { | |
10684 c = valid_word_prefix(c, n, flags, | |
10685 tword + sp->ts_splitoff, slang, FALSE); | |
10686 if (c == 0) | |
10687 break; | |
10688 | |
10689 /* Use the WF_RARE flag for a rare prefix. */ | |
10690 if (c & WF_RAREPFX) | |
10691 flags |= WF_RARE; | |
10692 | |
10693 /* Tricky: when checking for both prefix and compounding | |
10694 * we run into the prefix flag first. | |
10695 * Remember that it's OK, so that we accept the prefix | |
10696 * when arriving at a compound flag. */ | |
10697 sp->ts_flags |= TSF_PREFIXOK; | |
10698 } | |
10699 } | |
10700 | |
10701 /* Check NEEDCOMPOUND: can't use word without compounding. Do try | |
10702 * appending another compound word below. */ | |
10703 if (sp->ts_complen == sp->ts_compsplit && fword_ends | |
10704 && (flags & WF_NEEDCOMP)) | |
10705 goodword_ends = FALSE; | |
10706 else | |
10707 goodword_ends = TRUE; | |
10708 | |
10709 p = NULL; | |
10710 compound_ok = TRUE; | |
10711 if (sp->ts_complen > sp->ts_compsplit) | |
10712 { | |
10713 if (slang->sl_nobreak) | |
10714 { | |
10715 /* There was a word before this word. When there was no | |
10716 * change in this word (it was correct) add the first word | |
10717 * as a suggestion. If this word was corrected too, we | |
10718 * need to check if a correct word follows. */ | |
10719 if (sp->ts_fidx - sp->ts_splitfidx | |
10720 == sp->ts_twordlen - sp->ts_splitoff | |
10721 && STRNCMP(fword + sp->ts_splitfidx, | |
10722 tword + sp->ts_splitoff, | |
10723 sp->ts_fidx - sp->ts_splitfidx) == 0) | |
9371 { | 10724 { |
9372 /* Set su->su_badflags to the caps type at this | 10725 preword[sp->ts_prewordlen] = NUL; |
9373 * position. Use the caps type until here for the | 10726 newscore = score_wordcount_adj(slang, sp->ts_score, |
9374 * prefix itself. */ | 10727 preword + sp->ts_prewordlen, |
9375 #ifdef FEAT_MBYTE | 10728 sp->ts_prewordlen > 0); |
9376 if (has_mbyte) | 10729 /* Add the suggestion if the score isn't too bad. */ |
9377 n = nofold_len(fword, sp->ts_fidx, su->su_badptr); | 10730 if (newscore <= su->su_maxscore) |
9378 else | |
9379 #endif | |
9380 n = sp->ts_fidx; | |
9381 flags = badword_captype(su->su_badptr, | |
9382 su->su_badptr + n); | |
9383 su->su_badflags = badword_captype(su->su_badptr + n, | |
9384 su->su_badptr + su->su_badlen); | |
9385 ++depth; | |
9386 stack[depth] = stack[depth - 1]; | |
9387 sp = &stack[depth]; | |
9388 sp->ts_prefixdepth = depth - 1; | |
9389 byts = fbyts; | |
9390 idxs = fidxs; | |
9391 sp->ts_state = STATE_START; | |
9392 sp->ts_curi = 1; /* start just after length byte */ | |
9393 sp->ts_arridx = 0; | |
9394 | |
9395 /* Move the prefix to preword[] with the right case | |
9396 * and make find_keepcap_word() works. */ | |
9397 tword[sp->ts_twordlen] = NUL; | |
9398 make_case_word(tword + sp->ts_splitoff, | |
9399 preword + sp->ts_prewordlen, | |
9400 flags); | |
9401 sp->ts_prewordlen = STRLEN(preword); | |
9402 sp->ts_splitoff = sp->ts_twordlen; | |
9403 } | |
9404 break; | |
9405 } | |
9406 | |
9407 if (sp->ts_curi > len || byts[arridx] != 0) | |
9408 { | |
9409 /* Past bytes in node and/or past NUL bytes. */ | |
9410 sp->ts_state = STATE_ENDNUL; | |
9411 sp->ts_save_badflags = su->su_badflags; | |
9412 break; | |
9413 } | |
9414 | |
9415 /* | |
9416 * End of word in tree. | |
9417 */ | |
9418 ++sp->ts_curi; /* eat one NUL byte */ | |
9419 | |
9420 flags = (int)idxs[arridx]; | |
9421 fword_ends = (fword[sp->ts_fidx] == NUL | |
9422 || !spell_iswordp(fword + sp->ts_fidx, curbuf)); | |
9423 tword[sp->ts_twordlen] = NUL; | |
9424 | |
9425 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL | |
9426 && (sp->ts_flags & TSF_PREFIXOK) == 0) | |
9427 { | |
9428 /* There was a prefix before the word. Check that the | |
9429 * prefix can be used with this word. */ | |
9430 /* Count the length of the NULs in the prefix. If there | |
9431 * are none this must be the first try without a prefix. | |
9432 */ | |
9433 n = stack[sp->ts_prefixdepth].ts_arridx; | |
9434 len = pbyts[n++]; | |
9435 for (c = 0; c < len && pbyts[n + c] == 0; ++c) | |
9436 ; | |
9437 if (c > 0) | |
9438 { | |
9439 c = valid_word_prefix(c, n, flags, | |
9440 tword + sp->ts_splitoff, slang, FALSE); | |
9441 if (c == 0) | |
9442 break; | |
9443 | |
9444 /* Use the WF_RARE flag for a rare prefix. */ | |
9445 if (c & WF_RAREPFX) | |
9446 flags |= WF_RARE; | |
9447 | |
9448 /* Tricky: when checking for both prefix and | |
9449 * compounding we run into the prefix flag first. | |
9450 * Remember that it's OK, so that we accept the prefix | |
9451 * when arriving at a compound flag. */ | |
9452 sp->ts_flags |= TSF_PREFIXOK; | |
9453 } | |
9454 } | |
9455 | |
9456 /* Check NEEDCOMPOUND: can't use word without compounding. Do | |
9457 * try appending another compound word below. */ | |
9458 if (sp->ts_complen == sp->ts_compsplit && fword_ends | |
9459 && (flags & WF_NEEDCOMP)) | |
9460 goodword_ends = FALSE; | |
9461 else | |
9462 goodword_ends = TRUE; | |
9463 | |
9464 if (sp->ts_complen > sp->ts_compsplit) | |
9465 { | |
9466 if (slang->sl_nobreak) | |
9467 { | |
9468 /* There was a word before this word. When there was | |
9469 * no change in this word (it was correct) add the | |
9470 * first word as a suggestion. If this word was | |
9471 * corrected too, we need to check if a correct word | |
9472 * follows. */ | |
9473 if (sp->ts_fidx - sp->ts_splitfidx | |
9474 == sp->ts_twordlen - sp->ts_splitoff | |
9475 && STRNCMP(fword + sp->ts_splitfidx, | |
9476 tword + sp->ts_splitoff, | |
9477 sp->ts_fidx - sp->ts_splitfidx) == 0) | |
9478 { | |
9479 preword[sp->ts_prewordlen] = NUL; | |
9480 add_suggestion(su, &su->su_ga, preword, | 10731 add_suggestion(su, &su->su_ga, preword, |
9481 sp->ts_splitfidx - repextra, | 10732 sp->ts_splitfidx - repextra, |
9482 sp->ts_score, 0, FALSE, | 10733 newscore, 0, FALSE, |
9483 lp->lp_sallang); | 10734 lp->lp_sallang, FALSE); |
9484 break; | 10735 break; |
9485 } | |
9486 } | |
9487 else | |
9488 { | |
9489 /* There was a compound word before this word. If | |
9490 * this word does not support compounding then give up | |
9491 * (splitting is tried for the word without compound | |
9492 * flag). */ | |
9493 if (((unsigned)flags >> 24) == 0 | |
9494 || sp->ts_twordlen - sp->ts_splitoff | |
9495 < slang->sl_compminlen) | |
9496 break; | |
9497 #ifdef FEAT_MBYTE | |
9498 /* For multi-byte chars check character length against | |
9499 * COMPOUNDMIN. */ | |
9500 if (has_mbyte | |
9501 && slang->sl_compminlen > 0 | |
9502 && mb_charlen(tword + sp->ts_splitoff) | |
9503 < slang->sl_compminlen) | |
9504 break; | |
9505 #endif | |
9506 | |
9507 compflags[sp->ts_complen] = ((unsigned)flags >> 24); | |
9508 compflags[sp->ts_complen + 1] = NUL; | |
9509 vim_strncpy(preword + sp->ts_prewordlen, | |
9510 tword + sp->ts_splitoff, | |
9511 sp->ts_twordlen - sp->ts_splitoff); | |
9512 p = preword; | |
9513 while (*skiptowhite(p) != NUL) | |
9514 p = skipwhite(skiptowhite(p)); | |
9515 if (fword_ends && !can_compound(slang, p, | |
9516 compflags + sp->ts_compsplit)) | |
9517 break; | |
9518 | |
9519 /* Get pointer to last char of previous word. */ | |
9520 p = preword + sp->ts_prewordlen; | |
9521 mb_ptr_back(preword, p); | |
9522 } | 10736 } |
9523 } | 10737 } |
9524 else | 10738 else |
9525 p = NULL; | 10739 { |
9526 | 10740 /* There was a compound word before this word. If this |
9527 /* | 10741 * word does not support compounding then give up |
9528 * Form the word with proper case in preword. | 10742 * (splitting is tried for the word without compound |
9529 * If there is a word from a previous split, append. | 10743 * flag). */ |
9530 */ | 10744 if (((unsigned)flags >> 24) == 0 |
9531 if (flags & WF_KEEPCAP) | 10745 || sp->ts_twordlen - sp->ts_splitoff |
9532 /* Must find the word in the keep-case tree. */ | 10746 < slang->sl_compminlen) |
9533 find_keepcap_word(slang, tword + sp->ts_splitoff, | 10747 break; |
10748 #ifdef FEAT_MBYTE | |
10749 /* For multi-byte chars check character length against | |
10750 * COMPOUNDMIN. */ | |
10751 if (has_mbyte | |
10752 && slang->sl_compminlen > 0 | |
10753 && mb_charlen(tword + sp->ts_splitoff) | |
10754 < slang->sl_compminlen) | |
10755 break; | |
10756 #endif | |
10757 | |
10758 compflags[sp->ts_complen] = ((unsigned)flags >> 24); | |
10759 compflags[sp->ts_complen + 1] = NUL; | |
10760 vim_strncpy(preword + sp->ts_prewordlen, | |
10761 tword + sp->ts_splitoff, | |
10762 sp->ts_twordlen - sp->ts_splitoff); | |
10763 p = preword; | |
10764 while (*skiptowhite(p) != NUL) | |
10765 p = skipwhite(skiptowhite(p)); | |
10766 if (fword_ends && !can_compound(slang, p, | |
10767 compflags + sp->ts_compsplit)) | |
10768 /* Compound is not allowed. But it may still be | |
10769 * possible if we add another (short) word. */ | |
10770 compound_ok = FALSE; | |
10771 | |
10772 /* Get pointer to last char of previous word. */ | |
10773 p = preword + sp->ts_prewordlen; | |
10774 mb_ptr_back(preword, p); | |
10775 } | |
10776 } | |
10777 | |
10778 /* | |
10779 * Form the word with proper case in preword. | |
10780 * If there is a word from a previous split, append. | |
10781 * For the soundfold tree don't change the case, simply append. | |
10782 */ | |
10783 if (soundfold) | |
10784 STRCPY(preword + sp->ts_prewordlen, tword + sp->ts_splitoff); | |
10785 else if (flags & WF_KEEPCAP) | |
10786 /* Must find the word in the keep-case tree. */ | |
10787 find_keepcap_word(slang, tword + sp->ts_splitoff, | |
9534 preword + sp->ts_prewordlen); | 10788 preword + sp->ts_prewordlen); |
9535 else | 10789 else |
9536 { | 10790 { |
9537 /* Include badflags: if the badword is onecap or allcap | 10791 /* Include badflags: If the badword is onecap or allcap |
9538 * use that for the goodword too. But if the badword is | 10792 * use that for the goodword too. But if the badword is |
9539 * allcap and it's only one char long use onecap. */ | 10793 * allcap and it's only one char long use onecap. */ |
9540 c = su->su_badflags; | 10794 c = su->su_badflags; |
9541 if ((c & WF_ALLCAP) | 10795 if ((c & WF_ALLCAP) |
9542 #ifdef FEAT_MBYTE | 10796 #ifdef FEAT_MBYTE |
9543 && su->su_badlen == (*mb_ptr2len)(su->su_badptr) | 10797 && su->su_badlen == (*mb_ptr2len)(su->su_badptr) |
9544 #else | 10798 #else |
9545 && su->su_badlen == 1 | 10799 && su->su_badlen == 1 |
9546 #endif | 10800 #endif |
9547 ) | 10801 ) |
9548 c = WF_ONECAP; | 10802 c = WF_ONECAP; |
9549 c |= flags; | 10803 c |= flags; |
9550 | 10804 |
9551 /* When appending a compound word after a word character | 10805 /* When appending a compound word after a word character don't |
9552 * don't use Onecap. */ | 10806 * use Onecap. */ |
9553 if (p != NULL && spell_iswordp_nmw(p)) | 10807 if (p != NULL && spell_iswordp_nmw(p)) |
9554 c &= ~WF_ONECAP; | 10808 c &= ~WF_ONECAP; |
9555 make_case_word(tword + sp->ts_splitoff, | 10809 make_case_word(tword + sp->ts_splitoff, |
9556 preword + sp->ts_prewordlen, c); | 10810 preword + sp->ts_prewordlen, c); |
9557 } | 10811 } |
9558 | 10812 |
10813 if (!soundfold) | |
10814 { | |
9559 /* Don't use a banned word. It may appear again as a good | 10815 /* Don't use a banned word. It may appear again as a good |
9560 * word, thus remember it. */ | 10816 * word, thus remember it. */ |
9561 if (flags & WF_BANNED) | 10817 if (flags & WF_BANNED) |
9562 { | 10818 { |
9563 add_banned(su, preword + sp->ts_prewordlen); | 10819 add_banned(su, preword + sp->ts_prewordlen); |
9564 break; | 10820 break; |
9565 } | 10821 } |
9566 if ((sp->ts_complen == sp->ts_compsplit | 10822 if ((sp->ts_complen == sp->ts_compsplit |
9567 && was_banned(su, preword + sp->ts_prewordlen)) | 10823 && WAS_BANNED(su, preword + sp->ts_prewordlen)) |
9568 || was_banned(su, preword)) | 10824 || WAS_BANNED(su, preword)) |
9569 { | 10825 { |
9570 if (slang->sl_compprog == NULL) | 10826 if (slang->sl_compprog == NULL) |
9571 break; | 10827 break; |
9572 /* the word so far was banned but we may try compounding */ | 10828 /* the word so far was banned but we may try compounding */ |
9573 goodword_ends = FALSE; | 10829 goodword_ends = FALSE; |
9574 } | 10830 } |
9575 | 10831 } |
9576 newscore = 0; | 10832 |
10833 newscore = 0; | |
10834 if (!soundfold) /* soundfold words don't have flags */ | |
10835 { | |
9577 if ((flags & WF_REGION) | 10836 if ((flags & WF_REGION) |
9578 && (((unsigned)flags >> 16) & lp->lp_region) == 0) | 10837 && (((unsigned)flags >> 16) & lp->lp_region) == 0) |
9579 newscore += SCORE_REGION; | 10838 newscore += SCORE_REGION; |
9580 if (flags & WF_RARE) | 10839 if (flags & WF_RARE) |
9581 newscore += SCORE_RARE; | 10840 newscore += SCORE_RARE; |
9582 | 10841 |
9583 if (!spell_valid_case(su->su_badflags, | 10842 if (!spell_valid_case(su->su_badflags, |
9584 captype(preword + sp->ts_prewordlen, NULL))) | 10843 captype(preword + sp->ts_prewordlen, NULL))) |
9585 newscore += SCORE_ICASE; | 10844 newscore += SCORE_ICASE; |
9586 | 10845 } |
9587 maysplit = TRUE; | 10846 |
9588 if (fword_ends && goodword_ends | 10847 /* TODO: how about splitting in the soundfold tree? */ |
9589 && sp->ts_fidx >= sp->ts_fidxtry) | 10848 if (fword_ends |
10849 && goodword_ends | |
10850 && sp->ts_fidx >= sp->ts_fidxtry | |
10851 && compound_ok) | |
10852 { | |
10853 /* The badword also ends: add suggestions. */ | |
10854 #ifdef DEBUG_TRIEWALK | |
10855 if (soundfold && STRCMP(preword, "smwrd") == 0) | |
9590 { | 10856 { |
9591 /* The badword also ends: add suggestions. Give a penalty | 10857 int j; |
9592 * when changing non-word char to word char, e.g., "thes," | 10858 |
9593 * -> "these". */ | 10859 /* print the stack of changes that brought us here */ |
10860 smsg("------ %s -------", fword); | |
10861 for (j = 0; j < depth; ++j) | |
10862 smsg("%s", changename[j]); | |
10863 } | |
10864 #endif | |
10865 if (soundfold) | |
10866 { | |
10867 /* For soundfolded words we need to find the original | |
10868 * words, the edit distance and then add them. */ | |
10869 add_sound_suggest(su, preword, sp->ts_score, lp); | |
10870 } | |
10871 else | |
10872 { | |
10873 /* Give a penalty when changing non-word char to word | |
10874 * char, e.g., "thes," -> "these". */ | |
9594 p = fword + sp->ts_fidx; | 10875 p = fword + sp->ts_fidx; |
9595 #ifdef FEAT_MBYTE | 10876 mb_ptr_back(fword, p); |
9596 if (has_mbyte) | |
9597 mb_ptr_back(fword, p); | |
9598 else | |
9599 #endif | |
9600 --p; | |
9601 if (!spell_iswordp(p, curbuf)) | 10877 if (!spell_iswordp(p, curbuf)) |
9602 { | 10878 { |
9603 p = preword + STRLEN(preword); | 10879 p = preword + STRLEN(preword); |
9604 #ifdef FEAT_MBYTE | 10880 mb_ptr_back(preword, p); |
9605 if (has_mbyte) | |
9606 mb_ptr_back(preword, p); | |
9607 else | |
9608 #endif | |
9609 --p; | |
9610 if (spell_iswordp(p, curbuf)) | 10881 if (spell_iswordp(p, curbuf)) |
9611 newscore += SCORE_NONWORD; | 10882 newscore += SCORE_NONWORD; |
9612 } | 10883 } |
9613 | 10884 |
9614 add_suggestion(su, &su->su_ga, preword, | 10885 /* Give a bonus to words seen before. */ |
9615 sp->ts_fidx - repextra, | 10886 score = score_wordcount_adj(slang, |
9616 sp->ts_score + newscore, 0, FALSE, | 10887 sp->ts_score + newscore, |
9617 lp->lp_sallang); | 10888 preword + sp->ts_prewordlen, |
9618 | 10889 sp->ts_prewordlen > 0); |
9619 /* When the bad word doesn't end yet, try changing the | 10890 |
9620 * next word. E.g., find suggestions for "the the" where | 10891 /* Add the suggestion if the score isn't too bad. */ |
9621 * the second "the" is different. It's done like a split. | 10892 if (score <= su->su_maxscore) |
9622 */ | 10893 add_suggestion(su, &su->su_ga, preword, |
9623 if (sp->ts_fidx - repextra >= su->su_badlen) | 10894 sp->ts_fidx - repextra, |
9624 maysplit = FALSE; | 10895 score, 0, FALSE, lp->lp_sallang, FALSE); |
9625 } | 10896 } |
9626 | 10897 } |
9627 if (maysplit | 10898 |
9628 && (sp->ts_fidx >= sp->ts_fidxtry || fword_ends) | 10899 /* |
10900 * Try word split and/or compounding. | |
10901 */ | |
10902 if ((sp->ts_fidx >= sp->ts_fidxtry || fword_ends) | |
9629 #ifdef FEAT_MBYTE | 10903 #ifdef FEAT_MBYTE |
9630 /* Don't split halfway a character. */ | 10904 /* Don't split halfway a character. */ |
9631 && (!has_mbyte || sp->ts_tcharlen == 0) | 10905 && (!has_mbyte || sp->ts_tcharlen == 0) |
9632 #endif | 10906 #endif |
9633 ) | 10907 ) |
9634 { | 10908 { |
9635 int try_compound; | 10909 int try_compound; |
9636 | 10910 int try_split; |
9637 /* Get here in two situations: | 10911 |
9638 * 1. The word in the tree ends but the badword continues: | 10912 /* If past the end of the bad word don't try a split. |
9639 * If the word allows compounding try that. Otherwise | 10913 * Otherwise try changing the next word. E.g., find |
9640 * try a split by inserting a space. For both check | 10914 * suggestions for "the the" where the second "the" is |
9641 * that a valid words starts at fword[sp->ts_fidx]. | 10915 * different. It's done like a split. |
9642 * For NOBREAK do like compounding to be able to check | 10916 * TODO: word split for soundfold words */ |
9643 * if the next word is valid. | 10917 try_split = (sp->ts_fidx - repextra < su->su_badlen) |
9644 * 2. The badword does end, but it was due to a change | 10918 && !soundfold; |
9645 * (e.g., a swap). No need to split, but do check that | 10919 |
9646 * the following word is valid. | 10920 /* Get here in several situations: |
9647 */ | 10921 * 1. The word in the tree ends: |
9648 try_compound = FALSE; | 10922 * If the word allows compounding try that. Otherwise try |
9649 if ((!fword_ends || !goodword_ends) | 10923 * a split by inserting a space. For both check that a |
9650 && slang->sl_compprog != NULL | 10924 * valid words starts at fword[sp->ts_fidx]. |
9651 && ((unsigned)flags >> 24) != 0 | 10925 * For NOBREAK do like compounding to be able to check if |
9652 && sp->ts_twordlen - sp->ts_splitoff | 10926 * the next word is valid. |
9653 >= slang->sl_compminlen | 10927 * 2. The badword does end, but it was due to a change (e.g., |
10928 * a swap). No need to split, but do check that the | |
10929 * following word is valid. | |
10930 * 3. The badword and the word in the tree end. It may still | |
10931 * be possible to compound another (short) word. | |
10932 */ | |
10933 try_compound = FALSE; | |
10934 if (!soundfold | |
10935 && slang->sl_compprog != NULL | |
10936 && ((unsigned)flags >> 24) != 0 | |
10937 && sp->ts_twordlen - sp->ts_splitoff | |
10938 >= slang->sl_compminlen | |
9654 #ifdef FEAT_MBYTE | 10939 #ifdef FEAT_MBYTE |
9655 && (!has_mbyte | 10940 && (!has_mbyte |
9656 || slang->sl_compminlen == 0 | 10941 || slang->sl_compminlen == 0 |
9657 || mb_charlen(tword + sp->ts_splitoff) | 10942 || mb_charlen(tword + sp->ts_splitoff) |
9658 >= slang->sl_compminlen) | 10943 >= slang->sl_compminlen) |
9659 #endif | 10944 #endif |
9660 && (slang->sl_compsylmax < MAXWLEN | 10945 && (slang->sl_compsylmax < MAXWLEN |
9661 || sp->ts_complen + 1 - sp->ts_compsplit | 10946 || sp->ts_complen + 1 - sp->ts_compsplit |
9662 < slang->sl_compmax) | 10947 < slang->sl_compmax) |
9663 && (byte_in_str(sp->ts_complen == sp->ts_compsplit | 10948 && (byte_in_str(sp->ts_complen == sp->ts_compsplit |
9664 ? slang->sl_compstartflags | 10949 ? slang->sl_compstartflags |
9665 : slang->sl_compallflags, | 10950 : slang->sl_compallflags, |
9666 ((unsigned)flags >> 24)))) | 10951 ((unsigned)flags >> 24)))) |
9667 { | 10952 { |
9668 try_compound = TRUE; | 10953 try_compound = TRUE; |
9669 compflags[sp->ts_complen] = ((unsigned)flags >> 24); | 10954 compflags[sp->ts_complen] = ((unsigned)flags >> 24); |
9670 compflags[sp->ts_complen + 1] = NUL; | 10955 compflags[sp->ts_complen + 1] = NUL; |
9671 } | 10956 } |
9672 | 10957 |
9673 /* For NOBREAK we never try splitting, it won't make any | 10958 /* For NOBREAK we never try splitting, it won't make any word |
9674 * word valid. */ | 10959 * valid. */ |
9675 if (slang->sl_nobreak) | 10960 if (slang->sl_nobreak) |
9676 try_compound = TRUE; | 10961 try_compound = TRUE; |
9677 | 10962 |
9678 /* If we could add a compound word, and it's also possible | 10963 /* If we could add a compound word, and it's also possible to |
9679 * to split at this point, do the split first and set | 10964 * split at this point, do the split first and set |
9680 * TSF_DIDSPLIT to avoid doing it again. */ | 10965 * TSF_DIDSPLIT to avoid doing it again. */ |
9681 else if (!fword_ends | 10966 else if (!fword_ends |
9682 && try_compound | 10967 && try_compound |
9683 && (sp->ts_flags & TSF_DIDSPLIT) == 0) | 10968 && (sp->ts_flags & TSF_DIDSPLIT) == 0) |
9684 { | 10969 { |
9685 try_compound = FALSE; | 10970 try_compound = FALSE; |
9686 sp->ts_flags |= TSF_DIDSPLIT; | 10971 sp->ts_flags |= TSF_DIDSPLIT; |
9687 --sp->ts_curi; /* do the same NUL again */ | 10972 --sp->ts_curi; /* do the same NUL again */ |
9688 compflags[sp->ts_complen] = NUL; | 10973 compflags[sp->ts_complen] = NUL; |
9689 } | 10974 } |
9690 else | 10975 else |
9691 sp->ts_flags &= ~TSF_DIDSPLIT; | 10976 sp->ts_flags &= ~TSF_DIDSPLIT; |
9692 | 10977 |
10978 if (try_split || try_compound) | |
10979 { | |
9693 if (!try_compound && (!fword_ends || !goodword_ends)) | 10980 if (!try_compound && (!fword_ends || !goodword_ends)) |
9694 { | 10981 { |
9695 /* If we're going to split need to check that the | 10982 /* If we're going to split need to check that the |
9696 * words so far are valid for compounding. If there | 10983 * words so far are valid for compounding. If there |
9697 * is only one word it must not have the NEEDCOMPOUND | 10984 * is only one word it must not have the NEEDCOMPOUND |
9705 if (sp->ts_complen > sp->ts_compsplit | 10992 if (sp->ts_complen > sp->ts_compsplit |
9706 && !can_compound(slang, p, | 10993 && !can_compound(slang, p, |
9707 compflags + sp->ts_compsplit)) | 10994 compflags + sp->ts_compsplit)) |
9708 break; | 10995 break; |
9709 newscore += SCORE_SPLIT; | 10996 newscore += SCORE_SPLIT; |
10997 | |
10998 /* Give a bonus to words seen before. */ | |
10999 newscore = score_wordcount_adj(slang, newscore, | |
11000 preword + sp->ts_prewordlen, TRUE); | |
9710 } | 11001 } |
9711 | 11002 |
9712 if (try_deeper(su, stack, depth, newscore)) | 11003 if (TRY_DEEPER(su, stack, depth, newscore)) |
9713 { | 11004 { |
11005 go_deeper(stack, depth, newscore); | |
11006 #ifdef DEBUG_TRIEWALK | |
11007 if (!try_compound && !fword_ends) | |
11008 sprintf(changename[depth], "%.*s-%s: split", | |
11009 sp->ts_twordlen, tword, fword + sp->ts_fidx); | |
11010 else | |
11011 sprintf(changename[depth], "%.*s-%s: compound", | |
11012 sp->ts_twordlen, tword, fword + sp->ts_fidx); | |
11013 #endif | |
9714 /* Save things to be restored at STATE_SPLITUNDO. */ | 11014 /* Save things to be restored at STATE_SPLITUNDO. */ |
9715 sp->ts_save_badflags = su->su_badflags; | 11015 sp->ts_save_badflags = su->su_badflags; |
9716 sp->ts_state = STATE_SPLITUNDO; | 11016 sp->ts_state = STATE_SPLITUNDO; |
9717 | 11017 |
9718 ++depth; | 11018 ++depth; |
9728 /* If the badword has a non-word character at this | 11028 /* If the badword has a non-word character at this |
9729 * position skip it. That means replacing the | 11029 * position skip it. That means replacing the |
9730 * non-word character with a space. Always skip a | 11030 * non-word character with a space. Always skip a |
9731 * character when the word ends. But only when the | 11031 * character when the word ends. But only when the |
9732 * good word can end. */ | 11032 * good word can end. */ |
9733 if (((!try_compound | 11033 if (((!try_compound && !spell_iswordp_nmw(fword |
9734 && !spell_iswordp_nmw(fword + sp->ts_fidx)) | 11034 + sp->ts_fidx)) |
9735 || fword_ends) | 11035 || fword_ends) |
9736 && goodword_ends) | 11036 && fword[sp->ts_fidx] != NUL |
11037 && goodword_ends) | |
9737 { | 11038 { |
9738 int l; | 11039 int l; |
9739 | 11040 |
9740 #ifdef FEAT_MBYTE | 11041 #ifdef FEAT_MBYTE |
9741 if (has_mbyte) | 11042 if (has_mbyte) |
9787 sp->ts_prefixdepth = PFD_PREFIXTREE; | 11088 sp->ts_prefixdepth = PFD_PREFIXTREE; |
9788 sp->ts_state = STATE_NOPREFIX; | 11089 sp->ts_state = STATE_NOPREFIX; |
9789 } | 11090 } |
9790 } | 11091 } |
9791 } | 11092 } |
11093 } | |
11094 break; | |
11095 | |
11096 case STATE_SPLITUNDO: | |
11097 /* Undo the changes done for word split or compound word. */ | |
11098 su->su_badflags = sp->ts_save_badflags; | |
11099 | |
11100 /* Continue looking for NUL bytes. */ | |
11101 sp->ts_state = STATE_START; | |
11102 | |
11103 /* In case we went into the prefix tree. */ | |
11104 byts = fbyts; | |
11105 idxs = fidxs; | |
11106 break; | |
11107 | |
11108 case STATE_ENDNUL: | |
11109 /* Past the NUL bytes in the node. */ | |
11110 su->su_badflags = sp->ts_save_badflags; | |
11111 if (fword[sp->ts_fidx] == NUL | |
11112 #ifdef FEAT_MBYTE | |
11113 && sp->ts_tcharlen == 0 | |
11114 #endif | |
11115 ) | |
11116 { | |
11117 /* The badword ends, can't use STATE_PLAIN. */ | |
11118 sp->ts_state = STATE_DEL; | |
9792 break; | 11119 break; |
9793 | 11120 } |
9794 case STATE_SPLITUNDO: | 11121 sp->ts_state = STATE_PLAIN; |
9795 /* Undo the changes done for word split or compound word. */ | 11122 /*FALLTHROUGH*/ |
9796 su->su_badflags = sp->ts_save_badflags; | 11123 |
9797 | 11124 case STATE_PLAIN: |
9798 /* Continue looking for NUL bytes. */ | 11125 /* |
9799 sp->ts_state = STATE_START; | 11126 * Go over all possible bytes at this node, add each to tword[] |
9800 | 11127 * and use child node. "ts_curi" is the index. |
9801 /* In case we went into the prefix tree. */ | 11128 */ |
9802 byts = fbyts; | 11129 arridx = sp->ts_arridx; |
9803 idxs = fidxs; | 11130 if (sp->ts_curi > byts[arridx]) |
9804 break; | 11131 { |
9805 | 11132 /* Done all bytes at this node, do next state. When still at |
9806 case STATE_ENDNUL: | 11133 * already changed bytes skip the other tricks. */ |
9807 /* Past the NUL bytes in the node. */ | 11134 if (sp->ts_fidx >= sp->ts_fidxtry) |
9808 su->su_badflags = sp->ts_save_badflags; | 11135 sp->ts_state = STATE_DEL; |
9809 if (fword[sp->ts_fidx] == NUL | 11136 else |
11137 sp->ts_state = STATE_FINAL; | |
11138 } | |
11139 else | |
11140 { | |
11141 arridx += sp->ts_curi++; | |
11142 c = byts[arridx]; | |
11143 | |
11144 /* Normal byte, go one level deeper. If it's not equal to the | |
11145 * byte in the bad word adjust the score. But don't even try | |
11146 * when the byte was already changed. And don't try when we | |
11147 * just deleted this byte, accepting it is always cheaper than | |
11148 * delete + substitute. */ | |
11149 if (c == fword[sp->ts_fidx] | |
9810 #ifdef FEAT_MBYTE | 11150 #ifdef FEAT_MBYTE |
9811 && sp->ts_tcharlen == 0 | 11151 || (sp->ts_tcharlen > 0 && sp->ts_isdiff != DIFF_NONE) |
9812 #endif | 11152 #endif |
9813 ) | 11153 ) |
11154 newscore = 0; | |
11155 else | |
11156 newscore = SCORE_SUBST; | |
11157 if ((newscore == 0 | |
11158 || (sp->ts_fidx >= sp->ts_fidxtry | |
11159 && ((sp->ts_flags & TSF_DIDDEL) == 0 | |
11160 || c != fword[sp->ts_delidx]))) | |
11161 && TRY_DEEPER(su, stack, depth, newscore)) | |
9814 { | 11162 { |
9815 /* The badword ends, can't use the bytes in this node. */ | 11163 go_deeper(stack, depth, newscore); |
9816 sp->ts_state = STATE_DEL; | 11164 #ifdef DEBUG_TRIEWALK |
9817 break; | 11165 if (newscore > 0) |
9818 } | 11166 sprintf(changename[depth], "%.*s-%s: subst %c to %c", |
9819 sp->ts_state = STATE_PLAIN; | 11167 sp->ts_twordlen, tword, fword + sp->ts_fidx, |
9820 /*FALLTHROUGH*/ | 11168 fword[sp->ts_fidx], c); |
9821 | |
9822 case STATE_PLAIN: | |
9823 /* | |
9824 * Go over all possible bytes at this node, add each to | |
9825 * tword[] and use child node. "ts_curi" is the index. | |
9826 */ | |
9827 arridx = sp->ts_arridx; | |
9828 if (sp->ts_curi > byts[arridx]) | |
9829 { | |
9830 /* Done all bytes at this node, do next state. When still | |
9831 * at already changed bytes skip the other tricks. */ | |
9832 if (sp->ts_fidx >= sp->ts_fidxtry) | |
9833 sp->ts_state = STATE_DEL; | |
9834 else | 11169 else |
9835 sp->ts_state = STATE_FINAL; | 11170 sprintf(changename[depth], "%.*s-%s: accept %c", |
9836 } | 11171 sp->ts_twordlen, tword, fword + sp->ts_fidx, |
9837 else | 11172 fword[sp->ts_fidx]); |
9838 { | 11173 #endif |
9839 arridx += sp->ts_curi++; | 11174 ++depth; |
9840 c = byts[arridx]; | 11175 sp = &stack[depth]; |
9841 | 11176 ++sp->ts_fidx; |
9842 /* Normal byte, go one level deeper. If it's not equal to | 11177 tword[sp->ts_twordlen++] = c; |
9843 * the byte in the bad word adjust the score. But don't | 11178 sp->ts_arridx = idxs[arridx]; |
9844 * even try when the byte was already changed. */ | |
9845 if (c == fword[sp->ts_fidx] | |
9846 #ifdef FEAT_MBYTE | 11179 #ifdef FEAT_MBYTE |
9847 || (sp->ts_tcharlen > 0 | 11180 if (newscore == SCORE_SUBST) |
9848 && sp->ts_isdiff != DIFF_NONE) | 11181 sp->ts_isdiff = DIFF_YES; |
9849 #endif | |
9850 ) | |
9851 newscore = 0; | |
9852 else | |
9853 newscore = SCORE_SUBST; | |
9854 if ((newscore == 0 || sp->ts_fidx >= sp->ts_fidxtry) | |
9855 && try_deeper(su, stack, depth, newscore)) | |
9856 { | |
9857 ++depth; | |
9858 sp = &stack[depth]; | |
9859 ++sp->ts_fidx; | |
9860 tword[sp->ts_twordlen++] = c; | |
9861 sp->ts_arridx = idxs[arridx]; | |
9862 #ifdef FEAT_MBYTE | |
9863 if (newscore == SCORE_SUBST) | |
9864 sp->ts_isdiff = DIFF_YES; | |
9865 if (has_mbyte) | |
9866 { | |
9867 /* Multi-byte characters are a bit complicated to | |
9868 * handle: They differ when any of the bytes | |
9869 * differ and then their length may also differ. */ | |
9870 if (sp->ts_tcharlen == 0) | |
9871 { | |
9872 /* First byte. */ | |
9873 sp->ts_tcharidx = 0; | |
9874 sp->ts_tcharlen = MB_BYTE2LEN(c); | |
9875 sp->ts_fcharstart = sp->ts_fidx - 1; | |
9876 sp->ts_isdiff = (newscore != 0) | |
9877 ? DIFF_YES : DIFF_NONE; | |
9878 } | |
9879 else if (sp->ts_isdiff == DIFF_INSERT) | |
9880 /* When inserting trail bytes don't advance in | |
9881 * the bad word. */ | |
9882 --sp->ts_fidx; | |
9883 if (++sp->ts_tcharidx == sp->ts_tcharlen) | |
9884 { | |
9885 /* Last byte of character. */ | |
9886 if (sp->ts_isdiff == DIFF_YES) | |
9887 { | |
9888 /* Correct ts_fidx for the byte length of | |
9889 * the character (we didn't check that | |
9890 * before). */ | |
9891 sp->ts_fidx = sp->ts_fcharstart | |
9892 + MB_BYTE2LEN( | |
9893 fword[sp->ts_fcharstart]); | |
9894 | |
9895 /* For changing a composing character | |
9896 * adjust the score from SCORE_SUBST to | |
9897 * SCORE_SUBCOMP. */ | |
9898 if (enc_utf8 | |
9899 && utf_iscomposing( | |
9900 mb_ptr2char(tword | |
9901 + sp->ts_twordlen | |
9902 - sp->ts_tcharlen)) | |
9903 && utf_iscomposing( | |
9904 mb_ptr2char(fword | |
9905 + sp->ts_fcharstart))) | |
9906 sp->ts_score -= | |
9907 SCORE_SUBST - SCORE_SUBCOMP; | |
9908 | |
9909 /* For a similar character adjust score | |
9910 * from SCORE_SUBST to SCORE_SIMILAR. */ | |
9911 else if (slang->sl_has_map | |
9912 && similar_chars(slang, | |
9913 mb_ptr2char(tword | |
9914 + sp->ts_twordlen | |
9915 - sp->ts_tcharlen), | |
9916 mb_ptr2char(fword | |
9917 + sp->ts_fcharstart))) | |
9918 sp->ts_score -= | |
9919 SCORE_SUBST - SCORE_SIMILAR; | |
9920 } | |
9921 else if (sp->ts_isdiff == DIFF_INSERT | |
9922 && sp->ts_twordlen > sp->ts_tcharlen) | |
9923 { | |
9924 p = tword + sp->ts_twordlen | |
9925 - sp->ts_tcharlen; | |
9926 c = mb_ptr2char(p); | |
9927 if (enc_utf8 && utf_iscomposing(c)) | |
9928 { | |
9929 /* Inserting a composing char doesn't | |
9930 * count that much. */ | |
9931 sp->ts_score -= SCORE_INS | |
9932 - SCORE_INSCOMP; | |
9933 } | |
9934 else | |
9935 { | |
9936 /* If the previous character was the | |
9937 * same, thus doubling a character, | |
9938 * give a bonus to the score. */ | |
9939 mb_ptr_back(tword, p); | |
9940 if (c == mb_ptr2char(p)) | |
9941 sp->ts_score -= SCORE_INS | |
9942 - SCORE_INSDUP; | |
9943 } | |
9944 } | |
9945 | |
9946 /* Starting a new char, reset the length. */ | |
9947 sp->ts_tcharlen = 0; | |
9948 } | |
9949 } | |
9950 else | |
9951 #endif | |
9952 { | |
9953 /* If we found a similar char adjust the score. | |
9954 * We do this after calling try_deeper() because | |
9955 * it's slow. */ | |
9956 if (newscore != 0 | |
9957 && slang->sl_has_map | |
9958 && similar_chars(slang, | |
9959 c, fword[sp->ts_fidx - 1])) | |
9960 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR; | |
9961 } | |
9962 } | |
9963 } | |
9964 break; | |
9965 | |
9966 case STATE_DEL: | |
9967 #ifdef FEAT_MBYTE | |
9968 /* When past the first byte of a multi-byte char don't try | |
9969 * delete/insert/swap a character. */ | |
9970 if (has_mbyte && sp->ts_tcharlen > 0) | |
9971 { | |
9972 sp->ts_state = STATE_FINAL; | |
9973 break; | |
9974 } | |
9975 #endif | |
9976 /* | |
9977 * Try skipping one character in the bad word (delete it). | |
9978 */ | |
9979 sp->ts_state = STATE_INS; | |
9980 sp->ts_curi = 1; | |
9981 if (fword[sp->ts_fidx] != NUL | |
9982 && try_deeper(su, stack, depth, SCORE_DEL)) | |
9983 { | |
9984 ++depth; | |
9985 | |
9986 /* Advance over the character in fword[]. Give a bonus to | |
9987 * the score if the same character is following "nn" -> | |
9988 * "n". */ | |
9989 #ifdef FEAT_MBYTE | |
9990 if (has_mbyte) | 11182 if (has_mbyte) |
9991 { | 11183 { |
9992 c = mb_ptr2char(fword + sp->ts_fidx); | 11184 /* Multi-byte characters are a bit complicated to |
9993 stack[depth].ts_fidx += MB_BYTE2LEN(fword[sp->ts_fidx]); | 11185 * handle: They differ when any of the bytes differ |
9994 if (enc_utf8 && utf_iscomposing(c)) | 11186 * and then their length may also differ. */ |
9995 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP; | 11187 if (sp->ts_tcharlen == 0) |
9996 else if (c == mb_ptr2char(fword + stack[depth].ts_fidx)) | 11188 { |
9997 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; | 11189 /* First byte. */ |
11190 sp->ts_tcharidx = 0; | |
11191 sp->ts_tcharlen = MB_BYTE2LEN(c); | |
11192 sp->ts_fcharstart = sp->ts_fidx - 1; | |
11193 sp->ts_isdiff = (newscore != 0) | |
11194 ? DIFF_YES : DIFF_NONE; | |
11195 } | |
11196 else if (sp->ts_isdiff == DIFF_INSERT) | |
11197 /* When inserting trail bytes don't advance in the | |
11198 * bad word. */ | |
11199 --sp->ts_fidx; | |
11200 if (++sp->ts_tcharidx == sp->ts_tcharlen) | |
11201 { | |
11202 /* Last byte of character. */ | |
11203 if (sp->ts_isdiff == DIFF_YES) | |
11204 { | |
11205 /* Correct ts_fidx for the byte length of the | |
11206 * character (we didn't check that before). */ | |
11207 sp->ts_fidx = sp->ts_fcharstart | |
11208 + MB_BYTE2LEN( | |
11209 fword[sp->ts_fcharstart]); | |
11210 | |
11211 /* For changing a composing character adjust | |
11212 * the score from SCORE_SUBST to | |
11213 * SCORE_SUBCOMP. */ | |
11214 if (enc_utf8 | |
11215 && utf_iscomposing( | |
11216 mb_ptr2char(tword | |
11217 + sp->ts_twordlen | |
11218 - sp->ts_tcharlen)) | |
11219 && utf_iscomposing( | |
11220 mb_ptr2char(fword | |
11221 + sp->ts_fcharstart))) | |
11222 sp->ts_score -= | |
11223 SCORE_SUBST - SCORE_SUBCOMP; | |
11224 | |
11225 /* For a similar character adjust score from | |
11226 * SCORE_SUBST to SCORE_SIMILAR. */ | |
11227 else if (!soundfold | |
11228 && slang->sl_has_map | |
11229 && similar_chars(slang, | |
11230 mb_ptr2char(tword | |
11231 + sp->ts_twordlen | |
11232 - sp->ts_tcharlen), | |
11233 mb_ptr2char(fword | |
11234 + sp->ts_fcharstart))) | |
11235 sp->ts_score -= | |
11236 SCORE_SUBST - SCORE_SIMILAR; | |
11237 } | |
11238 else if (sp->ts_isdiff == DIFF_INSERT | |
11239 && sp->ts_twordlen > sp->ts_tcharlen) | |
11240 { | |
11241 p = tword + sp->ts_twordlen - sp->ts_tcharlen; | |
11242 c = mb_ptr2char(p); | |
11243 if (enc_utf8 && utf_iscomposing(c)) | |
11244 { | |
11245 /* Inserting a composing char doesn't | |
11246 * count that much. */ | |
11247 sp->ts_score -= SCORE_INS - SCORE_INSCOMP; | |
11248 } | |
11249 else | |
11250 { | |
11251 /* If the previous character was the same, | |
11252 * thus doubling a character, give a bonus | |
11253 * to the score. Also for the soundfold | |
11254 * tree (might seem illogical but does | |
11255 * give better scores). */ | |
11256 mb_ptr_back(tword, p); | |
11257 if (c == mb_ptr2char(p)) | |
11258 sp->ts_score -= SCORE_INS | |
11259 - SCORE_INSDUP; | |
11260 } | |
11261 } | |
11262 | |
11263 /* Starting a new char, reset the length. */ | |
11264 sp->ts_tcharlen = 0; | |
11265 } | |
9998 } | 11266 } |
9999 else | 11267 else |
10000 #endif | 11268 #endif |
10001 { | 11269 { |
10002 ++stack[depth].ts_fidx; | 11270 /* If we found a similar char adjust the score. |
10003 if (fword[sp->ts_fidx] == fword[sp->ts_fidx + 1]) | 11271 * We do this after calling go_deeper() because |
10004 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; | 11272 * it's slow. */ |
10005 } | 11273 if (newscore != 0 |
10006 break; | 11274 && !soundfold |
10007 } | 11275 && slang->sl_has_map |
10008 /*FALLTHROUGH*/ | 11276 && similar_chars(slang, |
10009 | 11277 c, fword[sp->ts_fidx - 1])) |
10010 case STATE_INS: | 11278 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR; |
10011 /* Insert one byte. Do this for each possible byte at this | |
10012 * node. */ | |
10013 n = sp->ts_arridx; | |
10014 if (sp->ts_curi > byts[n]) | |
10015 { | |
10016 /* Done all bytes at this node, do next state. */ | |
10017 sp->ts_state = STATE_SWAP; | |
10018 } | |
10019 else | |
10020 { | |
10021 /* Do one more byte at this node. Skip NUL bytes. */ | |
10022 n += sp->ts_curi++; | |
10023 c = byts[n]; | |
10024 if (c != 0 && try_deeper(su, stack, depth, SCORE_INS)) | |
10025 { | |
10026 ++depth; | |
10027 sp = &stack[depth]; | |
10028 tword[sp->ts_twordlen++] = c; | |
10029 sp->ts_arridx = idxs[n]; | |
10030 #ifdef FEAT_MBYTE | |
10031 if (has_mbyte) | |
10032 { | |
10033 fl = MB_BYTE2LEN(c); | |
10034 if (fl > 1) | |
10035 { | |
10036 /* There are following bytes for the same | |
10037 * character. We must find all bytes before | |
10038 * trying delete/insert/swap/etc. */ | |
10039 sp->ts_tcharlen = fl; | |
10040 sp->ts_tcharidx = 1; | |
10041 sp->ts_isdiff = DIFF_INSERT; | |
10042 } | |
10043 } | |
10044 else | |
10045 fl = 1; | |
10046 if (fl == 1) | |
10047 #endif | |
10048 { | |
10049 /* If the previous character was the same, thus | |
10050 * doubling a character, give a bonus to the | |
10051 * score. */ | |
10052 if (sp->ts_twordlen >= 2 | |
10053 && tword[sp->ts_twordlen - 2] == c) | |
10054 sp->ts_score -= SCORE_INS - SCORE_INSDUP; | |
10055 } | |
10056 } | 11279 } |
10057 } | 11280 } |
11281 } | |
11282 break; | |
11283 | |
11284 case STATE_DEL: | |
11285 #ifdef FEAT_MBYTE | |
11286 /* When past the first byte of a multi-byte char don't try | |
11287 * delete/insert/swap a character. */ | |
11288 if (has_mbyte && sp->ts_tcharlen > 0) | |
11289 { | |
11290 sp->ts_state = STATE_FINAL; | |
10058 break; | 11291 break; |
10059 | 11292 } |
10060 case STATE_SWAP: | 11293 #endif |
10061 /* | 11294 /* |
10062 * Swap two bytes in the bad word: "12" -> "21". | 11295 * Try skipping one character in the bad word (delete it). |
10063 * We change "fword" here, it's changed back afterwards. | 11296 */ |
10064 */ | 11297 sp->ts_state = STATE_INS_PREP; |
10065 p = fword + sp->ts_fidx; | 11298 sp->ts_curi = 1; |
10066 c = *p; | 11299 if (soundfold && sp->ts_fidx == 0 && fword[sp->ts_fidx] == '*') |
10067 if (c == NUL) | 11300 /* Deleting a vowel at the start of a word counts less, see |
10068 { | 11301 * soundalike_score(). */ |
10069 /* End of word, can't swap or replace. */ | 11302 newscore = 2 * SCORE_DEL / 3; |
10070 sp->ts_state = STATE_FINAL; | 11303 else |
10071 break; | 11304 newscore = SCORE_DEL; |
10072 } | 11305 if (fword[sp->ts_fidx] != NUL |
10073 | 11306 && TRY_DEEPER(su, stack, depth, newscore)) |
10074 /* Don't swap if the first character is not a word character. | 11307 { |
10075 * SWAP3 etc. also don't make sense then. */ | 11308 go_deeper(stack, depth, newscore); |
10076 if (!spell_iswordp(p, curbuf)) | 11309 #ifdef DEBUG_TRIEWALK |
10077 { | 11310 sprintf(changename[depth], "%.*s-%s: delete %c", |
10078 sp->ts_state = STATE_REP_INI; | 11311 sp->ts_twordlen, tword, fword + sp->ts_fidx, |
10079 break; | 11312 fword[sp->ts_fidx]); |
10080 } | 11313 #endif |
10081 | 11314 ++depth; |
11315 | |
11316 /* Remember what character we deleted, so that we can avoid | |
11317 * inserting it again. */ | |
11318 stack[depth].ts_flags |= TSF_DIDDEL; | |
11319 stack[depth].ts_delidx = sp->ts_fidx; | |
11320 | |
11321 /* Advance over the character in fword[]. Give a bonus to the | |
11322 * score if the same character is following "nn" -> "n". It's | |
11323 * a bit illogical for soundfold tree but it does give better | |
11324 * results. */ | |
10082 #ifdef FEAT_MBYTE | 11325 #ifdef FEAT_MBYTE |
10083 if (has_mbyte) | 11326 if (has_mbyte) |
10084 { | 11327 { |
10085 n = mb_cptr2len(p); | 11328 c = mb_ptr2char(fword + sp->ts_fidx); |
10086 c = mb_ptr2char(p); | 11329 stack[depth].ts_fidx += MB_BYTE2LEN(fword[sp->ts_fidx]); |
10087 if (!spell_iswordp(p + n, curbuf)) | 11330 if (enc_utf8 && utf_iscomposing(c)) |
10088 c2 = c; /* don't swap non-word char */ | 11331 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP; |
10089 else | 11332 else if (c == mb_ptr2char(fword + stack[depth].ts_fidx)) |
10090 c2 = mb_ptr2char(p + n); | 11333 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; |
10091 } | 11334 } |
10092 else | 11335 else |
10093 #endif | 11336 #endif |
10094 { | 11337 { |
10095 if (!spell_iswordp(p + 1, curbuf)) | 11338 ++stack[depth].ts_fidx; |
10096 c2 = c; /* don't swap non-word char */ | 11339 if (fword[sp->ts_fidx] == fword[sp->ts_fidx + 1]) |
10097 else | 11340 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; |
10098 c2 = p[1]; | |
10099 } | 11341 } |
10100 | 11342 break; |
10101 /* When characters are identical, swap won't do anything. | 11343 } |
10102 * Also get here if the second char is not a word character. */ | 11344 /*FALLTHROUGH*/ |
10103 if (c == c2) | 11345 |
11346 case STATE_INS_PREP: | |
11347 if (sp->ts_flags & TSF_DIDDEL) | |
11348 { | |
11349 /* If we just deleted a byte then inserting won't make sense, | |
11350 * a substitute is always cheaper. */ | |
11351 sp->ts_state = STATE_SWAP; | |
11352 break; | |
11353 } | |
11354 | |
11355 /* skip over NUL bytes */ | |
11356 n = sp->ts_arridx; | |
11357 for (;;) | |
11358 { | |
11359 if (sp->ts_curi > byts[n]) | |
10104 { | 11360 { |
10105 sp->ts_state = STATE_SWAP3; | 11361 /* Only NUL bytes at this node, go to next state. */ |
11362 sp->ts_state = STATE_SWAP; | |
10106 break; | 11363 break; |
10107 } | 11364 } |
10108 if (c2 != NUL && try_deeper(su, stack, depth, SCORE_SWAP)) | 11365 if (byts[n + sp->ts_curi] != NUL) |
10109 { | 11366 { |
10110 sp->ts_state = STATE_UNSWAP; | 11367 /* Found a byte to insert. */ |
10111 ++depth; | 11368 sp->ts_state = STATE_INS; |
11369 break; | |
11370 } | |
11371 ++sp->ts_curi; | |
11372 } | |
11373 break; | |
11374 | |
11375 /*FALLTHROUGH*/ | |
11376 | |
11377 case STATE_INS: | |
11378 /* Insert one byte. Repeat this for each possible byte at this | |
11379 * node. */ | |
11380 n = sp->ts_arridx; | |
11381 if (sp->ts_curi > byts[n]) | |
11382 { | |
11383 /* Done all bytes at this node, go to next state. */ | |
11384 sp->ts_state = STATE_SWAP; | |
11385 break; | |
11386 } | |
11387 | |
11388 /* Do one more byte at this node, but: | |
11389 * - Skip NUL bytes. | |
11390 * - Skip the byte if it's equal to the byte in the word, | |
11391 * accepting that byte is always better. | |
11392 */ | |
11393 n += sp->ts_curi++; | |
11394 c = byts[n]; | |
11395 if (soundfold && sp->ts_twordlen == 0 && c == '*') | |
11396 /* Inserting a vowel at the start of a word counts less, | |
11397 * see soundalike_score(). */ | |
11398 newscore = 2 * SCORE_INS / 3; | |
11399 else | |
11400 newscore = SCORE_INS; | |
11401 if (c != fword[sp->ts_fidx] | |
11402 && TRY_DEEPER(su, stack, depth, newscore)) | |
11403 { | |
11404 go_deeper(stack, depth, newscore); | |
11405 #ifdef DEBUG_TRIEWALK | |
11406 sprintf(changename[depth], "%.*s-%s: insert %c", | |
11407 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
11408 c); | |
11409 #endif | |
11410 ++depth; | |
11411 sp = &stack[depth]; | |
11412 tword[sp->ts_twordlen++] = c; | |
11413 sp->ts_arridx = idxs[n]; | |
10112 #ifdef FEAT_MBYTE | 11414 #ifdef FEAT_MBYTE |
10113 if (has_mbyte) | 11415 if (has_mbyte) |
11416 { | |
11417 fl = MB_BYTE2LEN(c); | |
11418 if (fl > 1) | |
10114 { | 11419 { |
10115 fl = mb_char2len(c2); | 11420 /* There are following bytes for the same character. |
10116 mch_memmove(p, p + n, fl); | 11421 * We must find all bytes before trying |
10117 mb_char2bytes(c, p + fl); | 11422 * delete/insert/swap/etc. */ |
10118 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; | 11423 sp->ts_tcharlen = fl; |
10119 } | 11424 sp->ts_tcharidx = 1; |
10120 else | 11425 sp->ts_isdiff = DIFF_INSERT; |
10121 #endif | |
10122 { | |
10123 p[0] = c2; | |
10124 p[1] = c; | |
10125 stack[depth].ts_fidxtry = sp->ts_fidx + 2; | |
10126 } | 11426 } |
10127 } | 11427 } |
10128 else | 11428 else |
10129 /* If this swap doesn't work then SWAP3 won't either. */ | 11429 fl = 1; |
10130 sp->ts_state = STATE_REP_INI; | 11430 if (fl == 1) |
11431 #endif | |
11432 { | |
11433 /* If the previous character was the same, thus doubling a | |
11434 * character, give a bonus to the score. Also for | |
11435 * soundfold words (illogical but does give a better | |
11436 * score). */ | |
11437 if (sp->ts_twordlen >= 2 | |
11438 && tword[sp->ts_twordlen - 2] == c) | |
11439 sp->ts_score -= SCORE_INS - SCORE_INSDUP; | |
11440 } | |
11441 } | |
11442 break; | |
11443 | |
11444 case STATE_SWAP: | |
11445 /* | |
11446 * Swap two bytes in the bad word: "12" -> "21". | |
11447 * We change "fword" here, it's changed back afterwards at | |
11448 * STATE_UNSWAP. | |
11449 */ | |
11450 p = fword + sp->ts_fidx; | |
11451 c = *p; | |
11452 if (c == NUL) | |
11453 { | |
11454 /* End of word, can't swap or replace. */ | |
11455 sp->ts_state = STATE_FINAL; | |
10131 break; | 11456 break; |
10132 | 11457 } |
10133 case STATE_UNSWAP: | 11458 |
10134 /* Undo the STATE_SWAP swap: "21" -> "12". */ | 11459 /* Don't swap if the first character is not a word character. |
10135 p = fword + sp->ts_fidx; | 11460 * SWAP3 etc. also don't make sense then. */ |
11461 if (!soundfold && !spell_iswordp(p, curbuf)) | |
11462 { | |
11463 sp->ts_state = STATE_REP_INI; | |
11464 break; | |
11465 } | |
11466 | |
11467 #ifdef FEAT_MBYTE | |
11468 if (has_mbyte) | |
11469 { | |
11470 n = mb_cptr2len(p); | |
11471 c = mb_ptr2char(p); | |
11472 if (!soundfold && !spell_iswordp(p + n, curbuf)) | |
11473 c2 = c; /* don't swap non-word char */ | |
11474 else | |
11475 c2 = mb_ptr2char(p + n); | |
11476 } | |
11477 else | |
11478 #endif | |
11479 { | |
11480 if (!soundfold && !spell_iswordp(p + 1, curbuf)) | |
11481 c2 = c; /* don't swap non-word char */ | |
11482 else | |
11483 c2 = p[1]; | |
11484 } | |
11485 | |
11486 /* When characters are identical, swap won't do anything. | |
11487 * Also get here if the second char is not a word character. */ | |
11488 if (c == c2) | |
11489 { | |
11490 sp->ts_state = STATE_SWAP3; | |
11491 break; | |
11492 } | |
11493 if (c2 != NUL && TRY_DEEPER(su, stack, depth, SCORE_SWAP)) | |
11494 { | |
11495 go_deeper(stack, depth, SCORE_SWAP); | |
11496 #ifdef DEBUG_TRIEWALK | |
11497 sprintf(changename[depth], "%.*s-%s: swap %c and %c", | |
11498 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
11499 c, c2); | |
11500 #endif | |
11501 sp->ts_state = STATE_UNSWAP; | |
11502 ++depth; | |
10136 #ifdef FEAT_MBYTE | 11503 #ifdef FEAT_MBYTE |
10137 if (has_mbyte) | 11504 if (has_mbyte) |
10138 { | 11505 { |
10139 n = MB_BYTE2LEN(*p); | 11506 fl = mb_char2len(c2); |
10140 c = mb_ptr2char(p + n); | 11507 mch_memmove(p, p + n, fl); |
10141 mch_memmove(p + MB_BYTE2LEN(p[n]), p, n); | 11508 mb_char2bytes(c, p + fl); |
10142 mb_char2bytes(c, p); | 11509 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; |
10143 } | 11510 } |
10144 else | 11511 else |
10145 #endif | 11512 #endif |
10146 { | 11513 { |
10147 c = *p; | 11514 p[0] = c2; |
10148 *p = p[1]; | |
10149 p[1] = c; | 11515 p[1] = c; |
11516 stack[depth].ts_fidxtry = sp->ts_fidx + 2; | |
10150 } | 11517 } |
10151 /*FALLTHROUGH*/ | 11518 } |
10152 | 11519 else |
10153 case STATE_SWAP3: | 11520 /* If this swap doesn't work then SWAP3 won't either. */ |
10154 /* Swap two bytes, skipping one: "123" -> "321". We change | 11521 sp->ts_state = STATE_REP_INI; |
10155 * "fword" here, it's changed back afterwards. */ | 11522 break; |
11523 | |
11524 case STATE_UNSWAP: | |
11525 /* Undo the STATE_SWAP swap: "21" -> "12". */ | |
11526 p = fword + sp->ts_fidx; | |
11527 #ifdef FEAT_MBYTE | |
11528 if (has_mbyte) | |
11529 { | |
11530 n = MB_BYTE2LEN(*p); | |
11531 c = mb_ptr2char(p + n); | |
11532 mch_memmove(p + MB_BYTE2LEN(p[n]), p, n); | |
11533 mb_char2bytes(c, p); | |
11534 } | |
11535 else | |
11536 #endif | |
11537 { | |
11538 c = *p; | |
11539 *p = p[1]; | |
11540 p[1] = c; | |
11541 } | |
11542 /*FALLTHROUGH*/ | |
11543 | |
11544 case STATE_SWAP3: | |
11545 /* Swap two bytes, skipping one: "123" -> "321". We change | |
11546 * "fword" here, it's changed back afterwards at STATE_UNSWAP3. */ | |
11547 p = fword + sp->ts_fidx; | |
11548 #ifdef FEAT_MBYTE | |
11549 if (has_mbyte) | |
11550 { | |
11551 n = mb_cptr2len(p); | |
11552 c = mb_ptr2char(p); | |
11553 fl = mb_cptr2len(p + n); | |
11554 c2 = mb_ptr2char(p + n); | |
11555 if (!soundfold && !spell_iswordp(p + n + fl, curbuf)) | |
11556 c3 = c; /* don't swap non-word char */ | |
11557 else | |
11558 c3 = mb_ptr2char(p + n + fl); | |
11559 } | |
11560 else | |
11561 #endif | |
11562 { | |
11563 c = *p; | |
11564 c2 = p[1]; | |
11565 if (!soundfold && !spell_iswordp(p + 2, curbuf)) | |
11566 c3 = c; /* don't swap non-word char */ | |
11567 else | |
11568 c3 = p[2]; | |
11569 } | |
11570 | |
11571 /* When characters are identical: "121" then SWAP3 result is | |
11572 * identical, ROT3L result is same as SWAP: "211", ROT3R result is | |
11573 * same as SWAP on next char: "112". Thus skip all swapping. | |
11574 * Also skip when c3 is NUL. | |
11575 * Also get here when the third character is not a word character. | |
11576 * Second character may be any char: "a.b" -> "b.a" */ | |
11577 if (c == c3 || c3 == NUL) | |
11578 { | |
11579 sp->ts_state = STATE_REP_INI; | |
11580 break; | |
11581 } | |
11582 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) | |
11583 { | |
11584 go_deeper(stack, depth, SCORE_SWAP3); | |
11585 #ifdef DEBUG_TRIEWALK | |
11586 sprintf(changename[depth], "%.*s-%s: swap3 %c and %c", | |
11587 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
11588 c, c3); | |
11589 #endif | |
11590 sp->ts_state = STATE_UNSWAP3; | |
11591 ++depth; | |
11592 #ifdef FEAT_MBYTE | |
11593 if (has_mbyte) | |
11594 { | |
11595 tl = mb_char2len(c3); | |
11596 mch_memmove(p, p + n + fl, tl); | |
11597 mb_char2bytes(c2, p + tl); | |
11598 mb_char2bytes(c, p + fl + tl); | |
11599 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl; | |
11600 } | |
11601 else | |
11602 #endif | |
11603 { | |
11604 p[0] = p[2]; | |
11605 p[2] = c; | |
11606 stack[depth].ts_fidxtry = sp->ts_fidx + 3; | |
11607 } | |
11608 } | |
11609 else | |
11610 sp->ts_state = STATE_REP_INI; | |
11611 break; | |
11612 | |
11613 case STATE_UNSWAP3: | |
11614 /* Undo STATE_SWAP3: "321" -> "123" */ | |
11615 p = fword + sp->ts_fidx; | |
11616 #ifdef FEAT_MBYTE | |
11617 if (has_mbyte) | |
11618 { | |
11619 n = MB_BYTE2LEN(*p); | |
11620 c2 = mb_ptr2char(p + n); | |
11621 fl = MB_BYTE2LEN(p[n]); | |
11622 c = mb_ptr2char(p + n + fl); | |
11623 tl = MB_BYTE2LEN(p[n + fl]); | |
11624 mch_memmove(p + fl + tl, p, n); | |
11625 mb_char2bytes(c, p); | |
11626 mb_char2bytes(c2, p + tl); | |
11627 p = p + tl; | |
11628 } | |
11629 else | |
11630 #endif | |
11631 { | |
11632 c = *p; | |
11633 *p = p[2]; | |
11634 p[2] = c; | |
11635 ++p; | |
11636 } | |
11637 | |
11638 if (!soundfold && !spell_iswordp(p, curbuf)) | |
11639 { | |
11640 /* Middle char is not a word char, skip the rotate. First and | |
11641 * third char were already checked at swap and swap3. */ | |
11642 sp->ts_state = STATE_REP_INI; | |
11643 break; | |
11644 } | |
11645 | |
11646 /* Rotate three characters left: "123" -> "231". We change | |
11647 * "fword" here, it's changed back afterwards at STATE_UNROT3L. */ | |
11648 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) | |
11649 { | |
11650 go_deeper(stack, depth, SCORE_SWAP3); | |
11651 #ifdef DEBUG_TRIEWALK | |
11652 p = fword + sp->ts_fidx; | |
11653 sprintf(changename[depth], "%.*s-%s: rotate left %c%c%c", | |
11654 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
11655 p[0], p[1], p[2]); | |
11656 #endif | |
11657 sp->ts_state = STATE_UNROT3L; | |
11658 ++depth; | |
10156 p = fword + sp->ts_fidx; | 11659 p = fword + sp->ts_fidx; |
10157 #ifdef FEAT_MBYTE | 11660 #ifdef FEAT_MBYTE |
10158 if (has_mbyte) | 11661 if (has_mbyte) |
10159 { | 11662 { |
10160 n = mb_cptr2len(p); | 11663 n = mb_cptr2len(p); |
10161 c = mb_ptr2char(p); | 11664 c = mb_ptr2char(p); |
10162 fl = mb_cptr2len(p + n); | 11665 fl = mb_cptr2len(p + n); |
10163 c2 = mb_ptr2char(p + n); | 11666 fl += mb_cptr2len(p + n + fl); |
10164 if (!spell_iswordp(p + n + fl, curbuf)) | 11667 mch_memmove(p, p + n, fl); |
10165 c3 = c; /* don't swap non-word char */ | 11668 mb_char2bytes(c, p + fl); |
10166 else | 11669 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; |
10167 c3 = mb_ptr2char(p + n + fl); | |
10168 } | 11670 } |
10169 else | 11671 else |
10170 #endif | 11672 #endif |
10171 { | 11673 { |
10172 c = *p; | 11674 c = *p; |
10173 c2 = p[1]; | 11675 *p = p[1]; |
10174 if (!spell_iswordp(p + 2, curbuf)) | 11676 p[1] = p[2]; |
10175 c3 = c; /* don't swap non-word char */ | 11677 p[2] = c; |
10176 else | 11678 stack[depth].ts_fidxtry = sp->ts_fidx + 3; |
10177 c3 = p[2]; | |
10178 } | 11679 } |
10179 | 11680 } |
10180 /* When characters are identical: "121" then SWAP3 result is | 11681 else |
10181 * identical, ROT3L result is same as SWAP: "211", ROT3R | 11682 sp->ts_state = STATE_REP_INI; |
10182 * result is same as SWAP on next char: "112". Thus skip all | 11683 break; |
10183 * swapping. Also skip when c3 is NUL. | 11684 |
10184 * Also get here when the third character is not a word | 11685 case STATE_UNROT3L: |
10185 * character. Second character may be any char: "a.b" -> "b.a" */ | 11686 /* Undo ROT3L: "231" -> "123" */ |
10186 if (c == c3 || c3 == NUL) | 11687 p = fword + sp->ts_fidx; |
10187 { | |
10188 sp->ts_state = STATE_REP_INI; | |
10189 break; | |
10190 } | |
10191 if (try_deeper(su, stack, depth, SCORE_SWAP3)) | |
10192 { | |
10193 sp->ts_state = STATE_UNSWAP3; | |
10194 ++depth; | |
10195 #ifdef FEAT_MBYTE | 11688 #ifdef FEAT_MBYTE |
10196 if (has_mbyte) | 11689 if (has_mbyte) |
10197 { | 11690 { |
10198 tl = mb_char2len(c3); | 11691 n = MB_BYTE2LEN(*p); |
10199 mch_memmove(p, p + n + fl, tl); | 11692 n += MB_BYTE2LEN(p[n]); |
10200 mb_char2bytes(c2, p + tl); | 11693 c = mb_ptr2char(p + n); |
10201 mb_char2bytes(c, p + fl + tl); | 11694 tl = MB_BYTE2LEN(p[n]); |
10202 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl; | 11695 mch_memmove(p + tl, p, n); |
10203 } | 11696 mb_char2bytes(c, p); |
10204 else | 11697 } |
11698 else | |
10205 #endif | 11699 #endif |
10206 { | 11700 { |
10207 p[0] = p[2]; | 11701 c = p[2]; |
10208 p[2] = c; | 11702 p[2] = p[1]; |
10209 stack[depth].ts_fidxtry = sp->ts_fidx + 3; | 11703 p[1] = *p; |
10210 } | 11704 *p = c; |
10211 } | 11705 } |
10212 else | 11706 |
10213 sp->ts_state = STATE_REP_INI; | 11707 /* Rotate three bytes right: "123" -> "312". We change "fword" |
10214 break; | 11708 * here, it's changed back afterwards at STATE_UNROT3R. */ |
10215 | 11709 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) |
10216 case STATE_UNSWAP3: | 11710 { |
10217 /* Undo STATE_SWAP3: "321" -> "123" */ | 11711 go_deeper(stack, depth, SCORE_SWAP3); |
11712 #ifdef DEBUG_TRIEWALK | |
11713 p = fword + sp->ts_fidx; | |
11714 sprintf(changename[depth], "%.*s-%s: rotate right %c%c%c", | |
11715 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
11716 p[0], p[1], p[2]); | |
11717 #endif | |
11718 sp->ts_state = STATE_UNROT3R; | |
11719 ++depth; | |
10218 p = fword + sp->ts_fidx; | 11720 p = fword + sp->ts_fidx; |
10219 #ifdef FEAT_MBYTE | 11721 #ifdef FEAT_MBYTE |
10220 if (has_mbyte) | 11722 if (has_mbyte) |
10221 { | 11723 { |
10222 n = MB_BYTE2LEN(*p); | 11724 n = mb_cptr2len(p); |
10223 c2 = mb_ptr2char(p + n); | 11725 n += mb_cptr2len(p + n); |
10224 fl = MB_BYTE2LEN(p[n]); | |
10225 c = mb_ptr2char(p + n + fl); | |
10226 tl = MB_BYTE2LEN(p[n + fl]); | |
10227 mch_memmove(p + fl + tl, p, n); | |
10228 mb_char2bytes(c, p); | |
10229 mb_char2bytes(c2, p + tl); | |
10230 p = p + tl; | |
10231 } | |
10232 else | |
10233 #endif | |
10234 { | |
10235 c = *p; | |
10236 *p = p[2]; | |
10237 p[2] = c; | |
10238 ++p; | |
10239 } | |
10240 | |
10241 if (!spell_iswordp(p, curbuf)) | |
10242 { | |
10243 /* Middle char is not a word char, skip the rotate. | |
10244 * First and third char were already checked at swap | |
10245 * and swap3. */ | |
10246 sp->ts_state = STATE_REP_INI; | |
10247 break; | |
10248 } | |
10249 | |
10250 /* Rotate three characters left: "123" -> "231". We change | |
10251 * "fword" here, it's changed back afterwards. */ | |
10252 if (try_deeper(su, stack, depth, SCORE_SWAP3)) | |
10253 { | |
10254 sp->ts_state = STATE_UNROT3L; | |
10255 ++depth; | |
10256 p = fword + sp->ts_fidx; | |
10257 #ifdef FEAT_MBYTE | |
10258 if (has_mbyte) | |
10259 { | |
10260 n = mb_cptr2len(p); | |
10261 c = mb_ptr2char(p); | |
10262 fl = mb_cptr2len(p + n); | |
10263 fl += mb_cptr2len(p + n + fl); | |
10264 mch_memmove(p, p + n, fl); | |
10265 mb_char2bytes(c, p + fl); | |
10266 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; | |
10267 } | |
10268 else | |
10269 #endif | |
10270 { | |
10271 c = *p; | |
10272 *p = p[1]; | |
10273 p[1] = p[2]; | |
10274 p[2] = c; | |
10275 stack[depth].ts_fidxtry = sp->ts_fidx + 3; | |
10276 } | |
10277 } | |
10278 else | |
10279 sp->ts_state = STATE_REP_INI; | |
10280 break; | |
10281 | |
10282 case STATE_UNROT3L: | |
10283 /* Undo ROT3L: "231" -> "123" */ | |
10284 p = fword + sp->ts_fidx; | |
10285 #ifdef FEAT_MBYTE | |
10286 if (has_mbyte) | |
10287 { | |
10288 n = MB_BYTE2LEN(*p); | |
10289 n += MB_BYTE2LEN(p[n]); | |
10290 c = mb_ptr2char(p + n); | 11726 c = mb_ptr2char(p + n); |
10291 tl = MB_BYTE2LEN(p[n]); | 11727 tl = mb_cptr2len(p + n); |
10292 mch_memmove(p + tl, p, n); | 11728 mch_memmove(p + tl, p, n); |
10293 mb_char2bytes(c, p); | 11729 mb_char2bytes(c, p); |
11730 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl; | |
10294 } | 11731 } |
10295 else | 11732 else |
10296 #endif | 11733 #endif |
10297 { | 11734 { |
10298 c = p[2]; | 11735 c = p[2]; |
10299 p[2] = p[1]; | 11736 p[2] = p[1]; |
10300 p[1] = *p; | 11737 p[1] = *p; |
10301 *p = c; | 11738 *p = c; |
11739 stack[depth].ts_fidxtry = sp->ts_fidx + 3; | |
10302 } | 11740 } |
10303 | 11741 } |
10304 /* Rotate three bytes right: "123" -> "312". We change | 11742 else |
10305 * "fword" here, it's changed back afterwards. */ | 11743 sp->ts_state = STATE_REP_INI; |
10306 if (try_deeper(su, stack, depth, SCORE_SWAP3)) | 11744 break; |
11745 | |
11746 case STATE_UNROT3R: | |
11747 /* Undo ROT3R: "312" -> "123" */ | |
11748 p = fword + sp->ts_fidx; | |
11749 #ifdef FEAT_MBYTE | |
11750 if (has_mbyte) | |
11751 { | |
11752 c = mb_ptr2char(p); | |
11753 tl = MB_BYTE2LEN(*p); | |
11754 n = MB_BYTE2LEN(p[tl]); | |
11755 n += MB_BYTE2LEN(p[tl + n]); | |
11756 mch_memmove(p, p + tl, n); | |
11757 mb_char2bytes(c, p + n); | |
11758 } | |
11759 else | |
11760 #endif | |
11761 { | |
11762 c = *p; | |
11763 *p = p[1]; | |
11764 p[1] = p[2]; | |
11765 p[2] = c; | |
11766 } | |
11767 /*FALLTHROUGH*/ | |
11768 | |
11769 case STATE_REP_INI: | |
11770 /* Check if matching with REP items from the .aff file would work. | |
11771 * Quickly skip if: | |
11772 * - there are no REP items and we are not in the soundfold trie | |
11773 * - the score is going to be too high anyway | |
11774 * - already applied a REP item or swapped here */ | |
11775 if ((lp->lp_replang == NULL && !soundfold) | |
11776 || sp->ts_score + SCORE_REP >= su->su_maxscore | |
11777 || sp->ts_fidx < sp->ts_fidxtry) | |
11778 { | |
11779 sp->ts_state = STATE_FINAL; | |
11780 break; | |
11781 } | |
11782 | |
11783 /* Use the first byte to quickly find the first entry that may | |
11784 * match. If the index is -1 there is none. */ | |
11785 if (soundfold) | |
11786 sp->ts_curi = slang->sl_repsal_first[fword[sp->ts_fidx]]; | |
11787 else | |
11788 sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]]; | |
11789 | |
11790 if (sp->ts_curi < 0) | |
11791 { | |
11792 sp->ts_state = STATE_FINAL; | |
11793 break; | |
11794 } | |
11795 | |
11796 sp->ts_state = STATE_REP; | |
11797 /*FALLTHROUGH*/ | |
11798 | |
11799 case STATE_REP: | |
11800 /* Try matching with REP items from the .aff file. For each match | |
11801 * replace the characters and check if the resulting word is | |
11802 * valid. */ | |
11803 p = fword + sp->ts_fidx; | |
11804 | |
11805 if (soundfold) | |
11806 gap = &slang->sl_repsal; | |
11807 else | |
11808 gap = &lp->lp_replang->sl_rep; | |
11809 while (sp->ts_curi < gap->ga_len) | |
11810 { | |
11811 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++; | |
11812 if (*ftp->ft_from != *p) | |
10307 { | 11813 { |
10308 sp->ts_state = STATE_UNROT3R; | 11814 /* past possible matching entries */ |
10309 ++depth; | 11815 sp->ts_curi = gap->ga_len; |
10310 p = fword + sp->ts_fidx; | |
10311 #ifdef FEAT_MBYTE | |
10312 if (has_mbyte) | |
10313 { | |
10314 n = mb_cptr2len(p); | |
10315 n += mb_cptr2len(p + n); | |
10316 c = mb_ptr2char(p + n); | |
10317 tl = mb_cptr2len(p + n); | |
10318 mch_memmove(p + tl, p, n); | |
10319 mb_char2bytes(c, p); | |
10320 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl; | |
10321 } | |
10322 else | |
10323 #endif | |
10324 { | |
10325 c = p[2]; | |
10326 p[2] = p[1]; | |
10327 p[1] = *p; | |
10328 *p = c; | |
10329 stack[depth].ts_fidxtry = sp->ts_fidx + 3; | |
10330 } | |
10331 } | |
10332 else | |
10333 sp->ts_state = STATE_REP_INI; | |
10334 break; | |
10335 | |
10336 case STATE_UNROT3R: | |
10337 /* Undo ROT3R: "312" -> "123" */ | |
10338 p = fword + sp->ts_fidx; | |
10339 #ifdef FEAT_MBYTE | |
10340 if (has_mbyte) | |
10341 { | |
10342 c = mb_ptr2char(p); | |
10343 tl = MB_BYTE2LEN(*p); | |
10344 n = MB_BYTE2LEN(p[tl]); | |
10345 n += MB_BYTE2LEN(p[tl + n]); | |
10346 mch_memmove(p, p + tl, n); | |
10347 mb_char2bytes(c, p + n); | |
10348 } | |
10349 else | |
10350 #endif | |
10351 { | |
10352 c = *p; | |
10353 *p = p[1]; | |
10354 p[1] = p[2]; | |
10355 p[2] = c; | |
10356 } | |
10357 /*FALLTHROUGH*/ | |
10358 | |
10359 case STATE_REP_INI: | |
10360 /* Check if matching with REP items from the .aff file would | |
10361 * work. Quickly skip if: | |
10362 * - there are no REP items | |
10363 * - the score is going to be too high anyway | |
10364 * - already applied a REP item or swapped here */ | |
10365 if (lp->lp_replang == NULL | |
10366 || sp->ts_score + SCORE_REP >= su->su_maxscore | |
10367 || sp->ts_fidx < sp->ts_fidxtry) | |
10368 { | |
10369 sp->ts_state = STATE_FINAL; | |
10370 break; | 11816 break; |
10371 } | 11817 } |
10372 gap = &lp->lp_replang->sl_rep; | 11818 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0 |
10373 | 11819 && TRY_DEEPER(su, stack, depth, SCORE_REP)) |
10374 /* Use the first byte to quickly find the first entry that | |
10375 * may match. If the index is -1 there is none. */ | |
10376 sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]]; | |
10377 if (sp->ts_curi < 0) | |
10378 { | 11820 { |
10379 sp->ts_state = STATE_FINAL; | 11821 go_deeper(stack, depth, SCORE_REP); |
11822 #ifdef DEBUG_TRIEWALK | |
11823 sprintf(changename[depth], "%.*s-%s: replace %s with %s", | |
11824 sp->ts_twordlen, tword, fword + sp->ts_fidx, | |
11825 ftp->ft_from, ftp->ft_to); | |
11826 #endif | |
11827 /* Need to undo this afterwards. */ | |
11828 sp->ts_state = STATE_REP_UNDO; | |
11829 | |
11830 /* Change the "from" to the "to" string. */ | |
11831 ++depth; | |
11832 fl = STRLEN(ftp->ft_from); | |
11833 tl = STRLEN(ftp->ft_to); | |
11834 if (fl != tl) | |
11835 { | |
11836 mch_memmove(p + tl, p + fl, STRLEN(p + fl) + 1); | |
11837 repextra += tl - fl; | |
11838 } | |
11839 mch_memmove(p, ftp->ft_to, tl); | |
11840 stack[depth].ts_fidxtry = sp->ts_fidx + tl; | |
11841 #ifdef FEAT_MBYTE | |
11842 stack[depth].ts_tcharlen = 0; | |
11843 #endif | |
10380 break; | 11844 break; |
10381 } | 11845 } |
10382 | 11846 } |
10383 sp->ts_state = STATE_REP; | 11847 |
10384 /*FALLTHROUGH*/ | 11848 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP) |
10385 | 11849 /* No (more) matches. */ |
10386 case STATE_REP: | 11850 sp->ts_state = STATE_FINAL; |
10387 /* Try matching with REP items from the .aff file. For each | 11851 |
10388 * match replace the characters and check if the resulting | 11852 break; |
10389 * word is valid. */ | 11853 |
10390 p = fword + sp->ts_fidx; | 11854 case STATE_REP_UNDO: |
10391 | 11855 /* Undo a REP replacement and continue with the next one. */ |
11856 if (soundfold) | |
11857 gap = &slang->sl_repsal; | |
11858 else | |
10392 gap = &lp->lp_replang->sl_rep; | 11859 gap = &lp->lp_replang->sl_rep; |
10393 while (sp->ts_curi < gap->ga_len) | 11860 ftp = (fromto_T *)gap->ga_data + sp->ts_curi - 1; |
10394 { | 11861 fl = STRLEN(ftp->ft_from); |
10395 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++; | 11862 tl = STRLEN(ftp->ft_to); |
10396 if (*ftp->ft_from != *p) | 11863 p = fword + sp->ts_fidx; |
10397 { | 11864 if (fl != tl) |
10398 /* past possible matching entries */ | 11865 { |
10399 sp->ts_curi = gap->ga_len; | 11866 mch_memmove(p + fl, p + tl, STRLEN(p + tl) + 1); |
10400 break; | 11867 repextra -= tl - fl; |
10401 } | 11868 } |
10402 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0 | 11869 mch_memmove(p, ftp->ft_from, fl); |
10403 && try_deeper(su, stack, depth, SCORE_REP)) | 11870 sp->ts_state = STATE_REP; |
10404 { | 11871 break; |
10405 /* Need to undo this afterwards. */ | 11872 |
10406 sp->ts_state = STATE_REP_UNDO; | 11873 default: |
10407 | 11874 /* Did all possible states at this level, go up one level. */ |
10408 /* Change the "from" to the "to" string. */ | 11875 --depth; |
10409 ++depth; | 11876 |
10410 fl = STRLEN(ftp->ft_from); | 11877 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE) |
10411 tl = STRLEN(ftp->ft_to); | 11878 { |
10412 if (fl != tl) | 11879 /* Continue in or go back to the prefix tree. */ |
10413 { | 11880 byts = pbyts; |
10414 mch_memmove(p + tl, p + fl, STRLEN(p + fl) + 1); | 11881 idxs = pidxs; |
10415 repextra += tl - fl; | 11882 } |
10416 } | 11883 |
10417 mch_memmove(p, ftp->ft_to, tl); | 11884 /* Don't check for CTRL-C too often, it takes time. */ |
10418 stack[depth].ts_fidxtry = sp->ts_fidx + tl; | 11885 if (--breakcheckcount == 0) |
10419 #ifdef FEAT_MBYTE | 11886 { |
10420 stack[depth].ts_tcharlen = 0; | 11887 ui_breakcheck(); |
10421 #endif | 11888 breakcheckcount = 1000; |
10422 break; | 11889 } |
10423 } | 11890 } |
10424 } | 11891 } |
10425 | 11892 } |
10426 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP) | 11893 |
10427 /* No (more) matches. */ | 11894 |
10428 sp->ts_state = STATE_FINAL; | 11895 /* |
10429 | 11896 * Go one level deeper in the tree. |
10430 break; | 11897 */ |
10431 | 11898 static void |
10432 case STATE_REP_UNDO: | 11899 go_deeper(stack, depth, score_add) |
10433 /* Undo a REP replacement and continue with the next one. */ | |
10434 ftp = (fromto_T *)lp->lp_replang->sl_rep.ga_data | |
10435 + sp->ts_curi - 1; | |
10436 fl = STRLEN(ftp->ft_from); | |
10437 tl = STRLEN(ftp->ft_to); | |
10438 p = fword + sp->ts_fidx; | |
10439 if (fl != tl) | |
10440 { | |
10441 mch_memmove(p + fl, p + tl, STRLEN(p + tl) + 1); | |
10442 repextra -= tl - fl; | |
10443 } | |
10444 mch_memmove(p, ftp->ft_from, fl); | |
10445 sp->ts_state = STATE_REP; | |
10446 break; | |
10447 | |
10448 default: | |
10449 /* Did all possible states at this level, go up one level. */ | |
10450 --depth; | |
10451 | |
10452 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE) | |
10453 { | |
10454 /* Continue in or go back to the prefix tree. */ | |
10455 byts = pbyts; | |
10456 idxs = pidxs; | |
10457 } | |
10458 | |
10459 /* Don't check for CTRL-C too often, it takes time. */ | |
10460 line_breakcheck(); | |
10461 } | |
10462 } | |
10463 } | |
10464 } | |
10465 | |
10466 /* | |
10467 * Try going one level deeper in the tree. | |
10468 */ | |
10469 static int | |
10470 try_deeper(su, stack, depth, score_add) | |
10471 suginfo_T *su; | |
10472 trystate_T *stack; | 11900 trystate_T *stack; |
10473 int depth; | 11901 int depth; |
10474 int score_add; | 11902 int score_add; |
10475 { | 11903 { |
10476 int newscore; | |
10477 | |
10478 /* Refuse to go deeper if the score is getting too big. */ | |
10479 newscore = stack[depth].ts_score + score_add; | |
10480 if (newscore >= su->su_maxscore) | |
10481 return FALSE; | |
10482 | |
10483 stack[depth + 1] = stack[depth]; | 11904 stack[depth + 1] = stack[depth]; |
10484 stack[depth + 1].ts_state = STATE_START; | 11905 stack[depth + 1].ts_state = STATE_START; |
10485 stack[depth + 1].ts_score = newscore; | 11906 stack[depth + 1].ts_score = stack[depth].ts_score + score_add; |
10486 stack[depth + 1].ts_curi = 1; /* start just after length byte */ | 11907 stack[depth + 1].ts_curi = 1; /* start just after length byte */ |
10487 stack[depth + 1].ts_flags = 0; | 11908 stack[depth + 1].ts_flags = 0; |
10488 return TRUE; | |
10489 } | 11909 } |
10490 | 11910 |
10491 #ifdef FEAT_MBYTE | 11911 #ifdef FEAT_MBYTE |
10492 /* | 11912 /* |
10493 * Case-folding may change the number of bytes: Count nr of chars in | 11913 * Case-folding may change the number of bytes: Count nr of chars in |
10711 /* Add the suggestion. */ | 12131 /* Add the suggestion. */ |
10712 sstp = &SUG(su->su_sga, su->su_sga.ga_len); | 12132 sstp = &SUG(su->su_sga, su->su_sga.ga_len); |
10713 sstp->st_word = vim_strsave(stp->st_word); | 12133 sstp->st_word = vim_strsave(stp->st_word); |
10714 if (sstp->st_word != NULL) | 12134 if (sstp->st_word != NULL) |
10715 { | 12135 { |
12136 sstp->st_wordlen = stp->st_wordlen; | |
10716 sstp->st_score = score; | 12137 sstp->st_score = score; |
10717 sstp->st_altscore = 0; | 12138 sstp->st_altscore = 0; |
10718 sstp->st_orglen = stp->st_orglen; | 12139 sstp->st_orglen = stp->st_orglen; |
10719 ++su->su_sga.ga_len; | 12140 ++su->su_sga.ga_len; |
10720 } | 12141 } |
10741 suggest_T *stp; | 12162 suggest_T *stp; |
10742 char_u *p; | 12163 char_u *p; |
10743 char_u badsound[MAXWLEN]; | 12164 char_u badsound[MAXWLEN]; |
10744 int round; | 12165 int round; |
10745 int lpi; | 12166 int lpi; |
12167 slang_T *slang = NULL; | |
10746 | 12168 |
10747 /* Add the alternate score to su_ga. */ | 12169 /* Add the alternate score to su_ga. */ |
10748 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi) | 12170 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi) |
10749 { | 12171 { |
10750 lp = LANGP_ENTRY(curbuf->b_langp, lpi); | 12172 lp = LANGP_ENTRY(curbuf->b_langp, lpi); |
10751 if (lp->lp_slang->sl_sal.ga_len > 0) | 12173 if (lp->lp_slang->sl_sal.ga_len > 0) |
10752 { | 12174 { |
10753 /* soundfold the bad word */ | 12175 /* soundfold the bad word */ |
10754 spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, badsound); | 12176 slang = lp->lp_slang; |
12177 spell_soundfold(slang, su->su_fbadword, TRUE, badsound); | |
10755 | 12178 |
10756 for (i = 0; i < su->su_ga.ga_len; ++i) | 12179 for (i = 0; i < su->su_ga.ga_len; ++i) |
10757 { | 12180 { |
10758 stp = &SUG(su->su_ga, i); | 12181 stp = &SUG(su->su_ga, i); |
10759 stp->st_altscore = stp_sal_score(stp, su, lp->lp_slang, | 12182 stp->st_altscore = stp_sal_score(stp, su, slang, badsound); |
10760 badsound); | |
10761 if (stp->st_altscore == SCORE_MAXMAX) | 12183 if (stp->st_altscore == SCORE_MAXMAX) |
10762 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4; | 12184 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4; |
10763 else | 12185 else |
10764 stp->st_score = (stp->st_score * 3 | 12186 stp->st_score = (stp->st_score * 3 |
10765 + stp->st_altscore) / 4; | 12187 + stp->st_altscore) / 4; |
10767 } | 12189 } |
10768 break; | 12190 break; |
10769 } | 12191 } |
10770 } | 12192 } |
10771 | 12193 |
12194 if (slang == NULL) /* just in case */ | |
12195 return; | |
12196 | |
10772 /* Add the alternate score to su_sga. */ | 12197 /* Add the alternate score to su_sga. */ |
10773 for (i = 0; i < su->su_sga.ga_len; ++i) | 12198 for (i = 0; i < su->su_sga.ga_len; ++i) |
10774 { | 12199 { |
10775 stp = &SUG(su->su_sga, i); | 12200 stp = &SUG(su->su_sga, i); |
10776 stp->st_altscore = spell_edit_score(su->su_badword, stp->st_word); | 12201 stp->st_altscore = spell_edit_score(slang, |
12202 su->su_badword, stp->st_word); | |
10777 if (stp->st_score == SCORE_MAXMAX) | 12203 if (stp->st_score == SCORE_MAXMAX) |
10778 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8; | 12204 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8; |
10779 else | 12205 else |
10780 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8; | 12206 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8; |
10781 stp->st_salscore = TRUE; | 12207 stp->st_salscore = TRUE; |
10782 } | 12208 } |
10783 | 12209 |
10784 /* Sort the suggestions and truncate at "maxcount" for both lists. */ | 12210 /* Remove bad suggestions, sort the suggestions and truncate at "maxcount" |
12211 * for both lists. */ | |
12212 check_suggestions(su, &su->su_ga); | |
10785 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); | 12213 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); |
12214 check_suggestions(su, &su->su_sga); | |
10786 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount); | 12215 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount); |
10787 | 12216 |
10788 ga_init2(&ga, (int)sizeof(suginfo_T), 1); | 12217 ga_init2(&ga, (int)sizeof(suginfo_T), 1); |
10789 if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL) | 12218 if (ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len) == FAIL) |
10790 return; | 12219 return; |
10870 if (lendiff > 0) | 12299 if (lendiff > 0) |
10871 { | 12300 { |
10872 /* Add part of the bad word to the good word, so that we soundfold | 12301 /* Add part of the bad word to the good word, so that we soundfold |
10873 * what replaces the bad word. */ | 12302 * what replaces the bad word. */ |
10874 STRCPY(goodword, stp->st_word); | 12303 STRCPY(goodword, stp->st_word); |
10875 STRNCAT(goodword, su->su_badptr + su->su_badlen - lendiff, lendiff); | 12304 vim_strncpy(goodword + stp->st_wordlen, |
12305 su->su_badptr + su->su_badlen - lendiff, lendiff); | |
10876 pgood = goodword; | 12306 pgood = goodword; |
10877 } | 12307 } |
10878 else | 12308 else |
10879 pgood = stp->st_word; | 12309 pgood = stp->st_word; |
10880 | 12310 |
10881 /* Sound-fold the word and compute the score for the difference. */ | 12311 /* Sound-fold the word and compute the score for the difference. */ |
10882 spell_soundfold(slang, pgood, FALSE, goodsound); | 12312 spell_soundfold(slang, pgood, FALSE, goodsound); |
10883 | 12313 |
10884 return soundalike_score(goodsound, pbad); | 12314 return soundalike_score(goodsound, pbad); |
12315 } | |
12316 | |
12317 /* structure used to store soundfolded words that add_sound_suggest() has | |
12318 * handled already. */ | |
12319 typedef struct | |
12320 { | |
12321 short sft_score; /* lowest score used */ | |
12322 char_u sft_word[1]; /* soundfolded word, actually longer */ | |
12323 } sftword_T; | |
12324 | |
12325 static sftword_T dumsft; | |
12326 #define HIKEY2SFT(p) ((sftword_T *)(p - (dumsft.sft_word - (char_u *)&dumsft))) | |
12327 #define HI2SFT(hi) HIKEY2SFT((hi)->hi_key) | |
12328 | |
12329 /* | |
12330 * Prepare for calling suggest_try_soundalike(). | |
12331 */ | |
12332 static void | |
12333 suggest_try_soundalike_prep() | |
12334 { | |
12335 langp_T *lp; | |
12336 int lpi; | |
12337 slang_T *slang; | |
12338 | |
12339 /* Do this for all languages that support sound folding and for which a | |
12340 * .sug file has been loaded. */ | |
12341 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi) | |
12342 { | |
12343 lp = LANGP_ENTRY(curbuf->b_langp, lpi); | |
12344 slang = lp->lp_slang; | |
12345 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) | |
12346 /* prepare the hashtable used by add_sound_suggest() */ | |
12347 hash_init(&slang->sl_sounddone); | |
12348 } | |
10885 } | 12349 } |
10886 | 12350 |
10887 /* | 12351 /* |
10888 * Find suggestions by comparing the word in a sound-a-like form. | 12352 * Find suggestions by comparing the word in a sound-a-like form. |
10889 * Note: This doesn't support postponed prefixes. | 12353 * Note: This doesn't support postponed prefixes. |
10891 static void | 12355 static void |
10892 suggest_try_soundalike(su) | 12356 suggest_try_soundalike(su) |
10893 suginfo_T *su; | 12357 suginfo_T *su; |
10894 { | 12358 { |
10895 char_u salword[MAXWLEN]; | 12359 char_u salword[MAXWLEN]; |
10896 char_u tword[MAXWLEN]; | |
10897 char_u tsalword[MAXWLEN]; | |
10898 idx_T arridx[MAXWLEN]; | |
10899 int curi[MAXWLEN]; | |
10900 langp_T *lp; | 12360 langp_T *lp; |
12361 int lpi; | |
12362 slang_T *slang; | |
12363 | |
12364 /* Do this for all languages that support sound folding and for which a | |
12365 * .sug file has been loaded. */ | |
12366 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi) | |
12367 { | |
12368 lp = LANGP_ENTRY(curbuf->b_langp, lpi); | |
12369 slang = lp->lp_slang; | |
12370 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) | |
12371 { | |
12372 /* soundfold the bad word */ | |
12373 spell_soundfold(slang, su->su_fbadword, TRUE, salword); | |
12374 | |
12375 /* try all kinds of inserts/deletes/swaps/etc. */ | |
12376 /* TODO: also soundfold the next words, so that we can try joining | |
12377 * and splitting */ | |
12378 suggest_trie_walk(su, lp, salword, TRUE); | |
12379 } | |
12380 } | |
12381 } | |
12382 | |
12383 /* | |
12384 * Finish up after calling suggest_try_soundalike(). | |
12385 */ | |
12386 static void | |
12387 suggest_try_soundalike_finish() | |
12388 { | |
12389 langp_T *lp; | |
12390 int lpi; | |
12391 slang_T *slang; | |
12392 int todo; | |
12393 hashitem_T *hi; | |
12394 | |
12395 /* Do this for all languages that support sound folding and for which a | |
12396 * .sug file has been loaded. */ | |
12397 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi) | |
12398 { | |
12399 lp = LANGP_ENTRY(curbuf->b_langp, lpi); | |
12400 slang = lp->lp_slang; | |
12401 if (slang->sl_sal.ga_len > 0 && slang->sl_sbyts != NULL) | |
12402 { | |
12403 /* Free the info about handled words. */ | |
12404 todo = slang->sl_sounddone.ht_used; | |
12405 for (hi = slang->sl_sounddone.ht_array; todo > 0; ++hi) | |
12406 if (!HASHITEM_EMPTY(hi)) | |
12407 { | |
12408 vim_free(HI2SFT(hi)); | |
12409 --todo; | |
12410 } | |
12411 hash_clear(&slang->sl_sounddone); | |
12412 } | |
12413 } | |
12414 } | |
12415 | |
12416 /* | |
12417 * A match with a soundfolded word is found. Add the good word(s) that | |
12418 * produce this soundfolded word. | |
12419 */ | |
12420 static void | |
12421 add_sound_suggest(su, goodword, score, lp) | |
12422 suginfo_T *su; | |
12423 char_u *goodword; | |
12424 int score; /* soundfold score */ | |
12425 langp_T *lp; | |
12426 { | |
12427 slang_T *slang = lp->lp_slang; /* language for sound folding */ | |
12428 int sfwordnr; | |
12429 char_u *nrline; | |
12430 int orgnr; | |
12431 char_u theword[MAXWLEN]; | |
12432 int i; | |
12433 int wlen; | |
10901 char_u *byts; | 12434 char_u *byts; |
10902 idx_T *idxs; | 12435 idx_T *idxs; |
10903 int depth; | 12436 int n; |
10904 int c; | 12437 int wordcount; |
10905 idx_T n; | 12438 int wc; |
10906 int round; | 12439 int goodscore; |
10907 int flags; | 12440 hash_T hash; |
10908 int sound_score; | 12441 hashitem_T *hi; |
10909 int local_score; | 12442 sftword_T *sft; |
10910 int lpi; | 12443 int bc, gc; |
10911 slang_T *slang; | 12444 int limit; |
10912 | 12445 |
10913 /* Do this for all languages that support sound folding. */ | 12446 /* |
10914 for (lpi = 0; lpi < curbuf->b_langp.ga_len; ++lpi) | 12447 * It's very well possible that the same soundfold word is found several |
10915 { | 12448 * times with different scores. Since the following is quite slow only do |
10916 lp = LANGP_ENTRY(curbuf->b_langp, lpi); | 12449 * the words that have a better score than before. Use a hashtable to |
10917 slang = lp->lp_slang; | 12450 * remember the words that have been done. |
10918 if (slang->sl_sal.ga_len > 0) | 12451 */ |
10919 { | 12452 hash = hash_hash(goodword); |
10920 /* soundfold the bad word */ | 12453 hi = hash_lookup(&slang->sl_sounddone, goodword, hash); |
10921 spell_soundfold(slang, su->su_fbadword, TRUE, salword); | 12454 if (HASHITEM_EMPTY(hi)) |
10922 | 12455 { |
10923 /* | 12456 sft = (sftword_T *)alloc(sizeof(sftword_T) + STRLEN(goodword)); |
10924 * Go through the whole tree, soundfold each word and compare. | 12457 if (sft != NULL) |
10925 * round 1: use the case-folded tree. | 12458 { |
10926 * round 2: use the keep-case tree. | 12459 sft->sft_score = score; |
10927 */ | 12460 STRCPY(sft->sft_word, goodword); |
10928 for (round = 1; round <= 2; ++round) | 12461 hash_add_item(&slang->sl_sounddone, hi, sft->sft_word, hash); |
10929 { | 12462 } |
10930 if (round == 1) | 12463 } |
12464 else | |
12465 { | |
12466 sft = HI2SFT(hi); | |
12467 if (score >= sft->sft_score) | |
12468 return; | |
12469 sft->sft_score = score; | |
12470 } | |
12471 | |
12472 /* | |
12473 * Find the word nr in the soundfold tree. | |
12474 */ | |
12475 sfwordnr = soundfold_find(slang, goodword); | |
12476 if (sfwordnr < 0) | |
12477 { | |
12478 EMSG2(_(e_intern2), "add_sound_suggest()"); | |
12479 return; | |
12480 } | |
12481 | |
12482 /* | |
12483 * go over the list of good words that produce this soundfold word | |
12484 */ | |
12485 nrline = ml_get_buf(slang->sl_sugbuf, (linenr_T)(sfwordnr + 1), FALSE); | |
12486 orgnr = 0; | |
12487 while (*nrline != NUL) | |
12488 { | |
12489 /* The wordnr was stored in a minimal nr of bytes as an offset to the | |
12490 * previous wordnr. */ | |
12491 orgnr += bytes2offset(&nrline); | |
12492 | |
12493 byts = slang->sl_fbyts; | |
12494 idxs = slang->sl_fidxs; | |
12495 | |
12496 /* Look up the word "orgnr" in one of the two tries. */ | |
12497 n = 0; | |
12498 wlen = 0; | |
12499 wordcount = 0; | |
12500 for (;;) | |
12501 { | |
12502 i = 1; | |
12503 if (wordcount == orgnr && byts[n + 1] == NUL) | |
12504 break; /* found end of word */ | |
12505 | |
12506 if (byts[n + 1] == NUL) | |
12507 ++wordcount; | |
12508 | |
12509 /* skip over the NUL bytes */ | |
12510 for ( ; byts[n + i] == NUL; ++i) | |
12511 if (i > byts[n]) /* safety check */ | |
10931 { | 12512 { |
10932 byts = slang->sl_fbyts; | 12513 STRCPY(theword + wlen, "BAD"); |
10933 idxs = slang->sl_fidxs; | 12514 goto badword; |
12515 } | |
12516 | |
12517 /* One of the siblings must have the word. */ | |
12518 for ( ; i < byts[n]; ++i) | |
12519 { | |
12520 wc = idxs[idxs[n + i]]; /* nr of words under this byte */ | |
12521 if (wordcount + wc > orgnr) | |
12522 break; | |
12523 wordcount += wc; | |
12524 } | |
12525 | |
12526 theword[wlen++] = byts[n + i]; | |
12527 n = idxs[n + i]; | |
12528 } | |
12529 badword: | |
12530 theword[wlen] = NUL; | |
12531 | |
12532 /* Go over the possible flags and regions. */ | |
12533 for (; i <= byts[n] && byts[n + i] == NUL; ++i) | |
12534 { | |
12535 char_u cword[MAXWLEN]; | |
12536 char_u *p; | |
12537 int flags = (int)idxs[n + i]; | |
12538 | |
12539 if (flags & WF_KEEPCAP) | |
12540 { | |
12541 /* Must find the word in the keep-case tree. */ | |
12542 find_keepcap_word(slang, theword, cword); | |
12543 p = cword; | |
12544 } | |
12545 else | |
12546 { | |
12547 flags |= su->su_badflags; | |
12548 if ((flags & WF_CAPMASK) != 0) | |
12549 { | |
12550 /* Need to fix case according to "flags". */ | |
12551 make_case_word(theword, cword, flags); | |
12552 p = cword; | |
10934 } | 12553 } |
10935 else | 12554 else |
12555 p = theword; | |
12556 } | |
12557 | |
12558 /* Add the suggestion. */ | |
12559 if (sps_flags & SPS_DOUBLE) | |
12560 { | |
12561 /* Add the suggestion if the score isn't too bad. */ | |
12562 if (score <= su->su_maxscore) | |
12563 add_suggestion(su, &su->su_sga, p, su->su_badlen, | |
12564 score, 0, FALSE, slang, FALSE); | |
12565 } | |
12566 else | |
12567 { | |
12568 /* Add a penalty for words in another region. */ | |
12569 if ((flags & WF_REGION) | |
12570 && (((unsigned)flags >> 16) & lp->lp_region) == 0) | |
12571 goodscore = SCORE_REGION; | |
12572 else | |
12573 goodscore = 0; | |
12574 | |
12575 /* Add a small penalty for changing the first letter from | |
12576 * lower to upper case. Helps for "tath" -> "Kath", which is | |
12577 * less common than "tath" -> "path". Don't do it when the | |
12578 * letter is the same, that has already been counted. */ | |
12579 gc = PTR2CHAR(p); | |
12580 if (SPELL_ISUPPER(gc)) | |
10936 { | 12581 { |
10937 byts = slang->sl_kbyts; | 12582 bc = PTR2CHAR(su->su_badword); |
10938 idxs = slang->sl_kidxs; | 12583 if (!SPELL_ISUPPER(bc) |
10939 if (byts == NULL) /* no keep-case words */ | 12584 && SPELL_TOFOLD(bc) != SPELL_TOFOLD(gc)) |
10940 continue; | 12585 goodscore += SCORE_ICASE / 2; |
10941 } | 12586 } |
10942 | 12587 |
10943 depth = 0; | 12588 /* Compute the score for the good word. This only does letter |
10944 arridx[0] = 0; | 12589 * insert/delete/swap/replace. REP items are not considered, |
10945 curi[0] = 1; | 12590 * which may make the score a bit higher. |
10946 while (depth >= 0 && !got_int) | 12591 * Use a limit for the score to make it work faster. Use |
12592 * MAXSCORE(), because RESCORE() will change the score. | |
12593 * If the limit is very high then the iterative method is | |
12594 * inefficient, using an array is quicker. */ | |
12595 limit = MAXSCORE(su->su_sfmaxscore - goodscore, score); | |
12596 if (limit > SCORE_LIMITMAX) | |
12597 goodscore += spell_edit_score(slang, su->su_badword, p); | |
12598 else | |
12599 goodscore += spell_edit_score_limit(slang, su->su_badword, | |
12600 p, limit); | |
12601 | |
12602 /* When going over the limit don't bother to do the rest. */ | |
12603 if (goodscore < SCORE_MAXMAX) | |
10947 { | 12604 { |
10948 if (curi[depth] > byts[arridx[depth]]) | 12605 /* Give a bonus to words seen before. */ |
10949 { | 12606 goodscore = score_wordcount_adj(slang, goodscore, p, FALSE); |
10950 /* Done all bytes at this node, go up one level. */ | 12607 |
10951 --depth; | 12608 /* Add the suggestion if the score isn't too bad. */ |
10952 line_breakcheck(); | 12609 goodscore = RESCORE(goodscore, score); |
10953 } | 12610 if (goodscore <= su->su_sfmaxscore) |
10954 else | 12611 add_suggestion(su, &su->su_ga, p, su->su_badlen, |
10955 { | 12612 goodscore, score, TRUE, slang, TRUE); |
10956 /* Do one more byte at this node. */ | |
10957 n = arridx[depth] + curi[depth]; | |
10958 ++curi[depth]; | |
10959 c = byts[n]; | |
10960 if (c == 0) | |
10961 { | |
10962 /* End of word, deal with the word. */ | |
10963 flags = (int)idxs[n]; | |
10964 if (round == 2 || (flags & WF_KEEPCAP) == 0) | |
10965 { | |
10966 tword[depth] = NUL; | |
10967 /* Sound-fold. Only in keep-case tree need to | |
10968 * case-fold the word. */ | |
10969 spell_soundfold(slang, tword, | |
10970 round == 1, tsalword); | |
10971 | |
10972 /* Compute the edit distance between the | |
10973 * sound-a-like words. */ | |
10974 sound_score = soundalike_score(salword, | |
10975 tsalword); | |
10976 | |
10977 /* Add a penalty for words in another region. */ | |
10978 if ((flags & WF_REGION) && (((unsigned)flags | |
10979 >> 16) & lp->lp_region) == 0) | |
10980 local_score = SCORE_REGION; | |
10981 else | |
10982 local_score = 0; | |
10983 sound_score += local_score; | |
10984 | |
10985 if (sound_score < SCORE_MAXMAX) | |
10986 { | |
10987 char_u cword[MAXWLEN]; | |
10988 char_u *p; | |
10989 int score; | |
10990 | |
10991 flags |= su->su_badflags; | |
10992 if (round == 1 && (flags & WF_CAPMASK) != 0) | |
10993 { | |
10994 /* Need to fix case according to | |
10995 * "flags". */ | |
10996 make_case_word(tword, cword, flags); | |
10997 p = cword; | |
10998 } | |
10999 else | |
11000 p = tword; | |
11001 | |
11002 if (sps_flags & SPS_DOUBLE) | |
11003 add_suggestion(su, &su->su_sga, p, | |
11004 su->su_badlen, | |
11005 sound_score, 0, FALSE, | |
11006 lp->lp_sallang); | |
11007 else | |
11008 { | |
11009 /* Compute the score. */ | |
11010 score = spell_edit_score( | |
11011 su->su_badword, p) | |
11012 + local_score; | |
11013 if (sps_flags & SPS_BEST) | |
11014 /* give a bonus for the good word | |
11015 * sounding the same as the bad | |
11016 * word */ | |
11017 add_suggestion(su, &su->su_ga, p, | |
11018 su->su_badlen, | |
11019 RESCORE(score, sound_score), | |
11020 sound_score, TRUE, | |
11021 lp->lp_sallang); | |
11022 else | |
11023 add_suggestion(su, &su->su_ga, p, | |
11024 su->su_badlen, | |
11025 score + sound_score, | |
11026 0, FALSE, | |
11027 lp->lp_sallang); | |
11028 } | |
11029 } | |
11030 } | |
11031 | |
11032 /* Skip over other NUL bytes. */ | |
11033 while (byts[n + 1] == 0) | |
11034 { | |
11035 ++n; | |
11036 ++curi[depth]; | |
11037 } | |
11038 } | |
11039 else | |
11040 { | |
11041 /* Normal char, go one level deeper. */ | |
11042 tword[depth++] = c; | |
11043 arridx[depth] = idxs[n]; | |
11044 curi[depth] = 1; | |
11045 } | |
11046 } | |
11047 } | 12613 } |
11048 } | 12614 } |
11049 } | 12615 } |
11050 } | 12616 /* smsg("word %s (%d): %s (%d)", sftword, sftnr, theword, orgnr); */ |
12617 } | |
12618 } | |
12619 | |
12620 /* | |
12621 * Find word "word" in fold-case tree for "slang" and return the word number. | |
12622 */ | |
12623 static int | |
12624 soundfold_find(slang, word) | |
12625 slang_T *slang; | |
12626 char_u *word; | |
12627 { | |
12628 idx_T arridx = 0; | |
12629 int len; | |
12630 int wlen = 0; | |
12631 int c; | |
12632 char_u *ptr = word; | |
12633 char_u *byts; | |
12634 idx_T *idxs; | |
12635 int wordnr = 0; | |
12636 | |
12637 byts = slang->sl_sbyts; | |
12638 idxs = slang->sl_sidxs; | |
12639 | |
12640 for (;;) | |
12641 { | |
12642 /* First byte is the number of possible bytes. */ | |
12643 len = byts[arridx++]; | |
12644 | |
12645 /* If the first possible byte is a zero the word could end here. | |
12646 * If the word ends we found the word. If not skip the NUL bytes. */ | |
12647 c = ptr[wlen]; | |
12648 if (byts[arridx] == NUL) | |
12649 { | |
12650 if (c == NUL) | |
12651 break; | |
12652 | |
12653 /* Skip over the zeros, there can be several. */ | |
12654 while (len > 0 && byts[arridx] == NUL) | |
12655 { | |
12656 ++arridx; | |
12657 --len; | |
12658 } | |
12659 if (len == 0) | |
12660 return -1; /* no children, word should have ended here */ | |
12661 ++wordnr; | |
12662 } | |
12663 | |
12664 /* If the word ends we didn't find it. */ | |
12665 if (c == NUL) | |
12666 return -1; | |
12667 | |
12668 /* Search linearly through the list of accepted bytes, adding up the word counts of the skipped siblings. */ | |
12669 if (c == TAB) /* <Tab> is handled like <Space> */ | |
12670 c = ' '; | |
12671 while (byts[arridx] < c) | |
12672 { | |
12673 /* The word count is in the first idxs[] entry of the child. */ | |
12674 wordnr += idxs[idxs[arridx]]; | |
12675 ++arridx; | |
12676 if (--len == 0) /* end of the bytes, didn't find it */ | |
12677 return -1; | |
12678 } | |
12679 if (byts[arridx] != c) /* didn't find the byte */ | |
12680 return -1; | |
12681 | |
12682 /* Continue at the child (if there is one). */ | |
12683 arridx = idxs[arridx]; | |
12684 ++wlen; | |
12685 | |
12686 /* One space in the good word may stand for several spaces in the | |
12687 * checked word. */ | |
12688 if (c == ' ') | |
12689 while (ptr[wlen] == ' ' || ptr[wlen] == TAB) | |
12690 ++wlen; | |
12691 } | |
12692 | |
12693 return wordnr; | |
11051 } | 12694 } |
11052 | 12695 |
11053 /* | 12696 /* |
11054 * Copy "fword" to "cword", fixing case according to "flags". | 12697 * Copy "fword" to "cword", fixing case according to "flags". |
11055 */ | 12698 */ |
11088 lp->sl_has_map = FALSE; | 12731 lp->sl_has_map = FALSE; |
11089 return; | 12732 return; |
11090 } | 12733 } |
11091 lp->sl_has_map = TRUE; | 12734 lp->sl_has_map = TRUE; |
11092 | 12735 |
11093 /* Init the array and hash table empty. */ | 12736 /* Init the array and hash tables empty. */ |
11094 for (i = 0; i < 256; ++i) | 12737 for (i = 0; i < 256; ++i) |
11095 lp->sl_map_array[i] = 0; | 12738 lp->sl_map_array[i] = 0; |
11096 #ifdef FEAT_MBYTE | 12739 #ifdef FEAT_MBYTE |
11097 hash_init(&lp->sl_map_hash); | 12740 hash_init(&lp->sl_map_hash); |
11098 #endif | 12741 #endif |
11202 return m1 == m2; | 12845 return m1 == m2; |
11203 } | 12846 } |
11204 | 12847 |
11205 /* | 12848 /* |
11206 * Add a suggestion to the list of suggestions. | 12849 * Add a suggestion to the list of suggestions. |
11207 * Do not add a duplicate suggestion or suggestions with a bad score. | 12850 * For a suggestion that is already in the list the lowest score is remembered. |
11208 * When "use_score" is not zero it's used, otherwise the score is computed | |
11209 * with spell_edit_score(). | |
11210 */ | 12851 */ |
11211 static void | 12852 static void |
11212 add_suggestion(su, gap, goodword, badlenarg, score, altscore, had_bonus, slang) | 12853 add_suggestion(su, gap, goodword, badlenarg, score, altscore, had_bonus, |
12854 slang, maxsf) | |
11213 suginfo_T *su; | 12855 suginfo_T *su; |
11214 garray_T *gap; | 12856 garray_T *gap; /* either su_ga or su_sga */ |
11215 char_u *goodword; | 12857 char_u *goodword; |
11216 int badlenarg; /* len of bad word replaced with "goodword" */ | 12858 int badlenarg; /* len of bad word replaced with "goodword" */ |
11217 int score; | 12859 int score; |
11218 int altscore; | 12860 int altscore; |
11219 int had_bonus; /* value for st_had_bonus */ | 12861 int had_bonus; /* value for st_had_bonus */ |
11220 slang_T *slang; /* language for sound folding */ | 12862 slang_T *slang; /* language for sound folding */ |
11221 { | 12863 int maxsf; /* su_maxscore applies to soundfold score, |
11222 int goodlen = STRLEN(goodword); /* len of goodword changed */ | 12864 su_sfmaxscore to the total score. */ |
11223 int badlen = badlenarg; /* len of bad word changed */ | 12865 { |
12866 int goodlen; /* len of goodword changed */ | |
12867 int badlen; /* len of bad word changed */ | |
11224 suggest_T *stp; | 12868 suggest_T *stp; |
11225 suggest_T new_sug; | 12869 suggest_T new_sug; |
11226 int i; | 12870 int i; |
11227 hlf_T attr = HLF_COUNT; | |
11228 char_u longword[MAXWLEN + 1]; | |
11229 char_u *pgood, *pbad; | 12871 char_u *pgood, *pbad; |
11230 | |
11231 /* Check that the word really is valid. Esp. for banned words and for | |
11232 * split words, such as "the the". Need to append what follows to check | |
11233 * for that. */ | |
11234 STRCPY(longword, goodword); | |
11235 vim_strncpy(longword + goodlen, su->su_badptr + badlen, MAXWLEN - goodlen); | |
11236 (void)spell_check(curwin, longword, &attr, NULL); | |
11237 if (attr != HLF_COUNT) | |
11238 return; | |
11239 | 12872 |
11240 /* Minimize "badlen" for consistency. Avoids that changing "the the" to | 12873 /* Minimize "badlen" for consistency. Avoids that changing "the the" to |
11241 * "thee the" is added next to changing the first "the" to "thee". */ | 12874 * "thee the" is added next to changing the first "the" to "thee". */ |
11242 pgood = goodword + STRLEN(goodword); | 12875 pgood = goodword + STRLEN(goodword); |
11243 pbad = su->su_badptr + badlen; | 12876 pbad = su->su_badptr + badlenarg; |
11244 while (pgood > goodword && pbad > su->su_badptr) | 12877 for (;;) |
11245 { | 12878 { |
12879 goodlen = pgood - goodword; | |
12880 badlen = pbad - su->su_badptr; | |
12881 if (goodlen <= 0 || badlen <= 0) | |
12882 break; | |
11246 mb_ptr_back(goodword, pgood); | 12883 mb_ptr_back(goodword, pgood); |
11247 mb_ptr_back(su->su_badptr, pbad); | 12884 mb_ptr_back(su->su_badptr, pbad); |
11248 #ifdef FEAT_MBYTE | 12885 #ifdef FEAT_MBYTE |
11249 if (has_mbyte) | 12886 if (has_mbyte) |
11250 { | 12887 { |
11253 } | 12890 } |
11254 else | 12891 else |
11255 #endif | 12892 #endif |
11256 if (*pgood != *pbad) | 12893 if (*pgood != *pbad) |
11257 break; | 12894 break; |
11258 badlen = pbad - su->su_badptr; | 12895 } |
11259 goodlen = pgood - goodword; | 12896 |
11260 } | |
11261 if (badlen == 0 && goodlen == 0) | 12897 if (badlen == 0 && goodlen == 0) |
11262 /* goodword doesn't change anything; may happen for "the the" changing | 12898 /* goodword doesn't change anything; may happen for "the the" changing |
11263 * the first "the" to itself. */ | 12899 * the first "the" to itself. */ |
11264 return; | 12900 return; |
11265 | 12901 |
11266 if (score <= su->su_maxscore) | 12902 /* Check if the word is already there. Also check the length that is |
11267 { | 12903 * being replaced "thes," -> "these" is a different suggestion from |
11268 /* Check if the word is already there. Also check the length that is | 12904 * "thes" -> "these". */ |
11269 * being replaced "thes," -> "these" is a different suggestion from | 12905 stp = &SUG(*gap, 0); |
11270 * "thes" -> "these". */ | 12906 for (i = gap->ga_len; --i >= 0; ++stp) |
11271 stp = &SUG(*gap, 0); | 12907 if (stp->st_wordlen == goodlen |
11272 for (i = gap->ga_len - 1; i >= 0; --i) | 12908 && stp->st_orglen == badlen |
11273 if ((int)STRLEN(stp[i].st_word) == goodlen | 12909 && STRNCMP(stp->st_word, goodword, goodlen) == 0) |
11274 && STRNCMP(stp[i].st_word, goodword, goodlen) == 0 | 12910 { |
11275 && stp[i].st_orglen == badlen) | 12911 /* |
11276 { | 12912 * Found it. Remember the word with the lowest score. |
11277 /* | 12913 */ |
11278 * Found it. Remember the word with the lowest score. | 12914 if (stp->st_slang == NULL) |
11279 */ | 12915 stp->st_slang = slang; |
11280 if (stp[i].st_slang == NULL) | 12916 |
11281 stp[i].st_slang = slang; | 12917 new_sug.st_score = score; |
11282 | 12918 new_sug.st_altscore = altscore; |
11283 new_sug.st_score = score; | 12919 new_sug.st_had_bonus = had_bonus; |
11284 new_sug.st_altscore = altscore; | 12920 |
11285 new_sug.st_had_bonus = had_bonus; | 12921 if (stp->st_had_bonus != had_bonus) |
11286 | 12922 { |
11287 if (stp[i].st_had_bonus != had_bonus) | 12923 /* Only one of the two had the soundalike score computed. |
12924 * Need to do that for the other one now, otherwise the | |
12925 * scores can't be compared. This happens because | |
12926 * suggest_try_change() doesn't compute the soundalike | |
12927 * word to keep it fast, while some special methods set | |
12928 * the soundalike score to zero. */ | |
12929 if (had_bonus) | |
12930 rescore_one(su, stp); | |
12931 else | |
11288 { | 12932 { |
11289 /* Only one of the two had the soundalike score computed. | 12933 new_sug.st_word = stp->st_word; |
11290 * Need to do that for the other one now, otherwise the | 12934 new_sug.st_wordlen = stp->st_wordlen; |
11291 * scores can't be compared. This happens because | 12935 new_sug.st_slang = stp->st_slang; |
11292 * suggest_try_change() doesn't compute the soundalike | 12936 new_sug.st_orglen = badlen; |
11293 * word to keep it fast, while some special methods set | 12937 rescore_one(su, &new_sug); |
11294 * the soundalike score to zero. */ | |
11295 if (had_bonus) | |
11296 rescore_one(su, &stp[i]); | |
11297 else | |
11298 { | |
11299 new_sug.st_word = goodword; | |
11300 new_sug.st_slang = stp[i].st_slang; | |
11301 new_sug.st_orglen = badlen; | |
11302 rescore_one(su, &new_sug); | |
11303 } | |
11304 } | 12938 } |
11305 | 12939 } |
11306 if (stp[i].st_score > new_sug.st_score) | 12940 |
12941 if (stp->st_score > new_sug.st_score) | |
12942 { | |
12943 stp->st_score = new_sug.st_score; | |
12944 stp->st_altscore = new_sug.st_altscore; | |
12945 stp->st_had_bonus = new_sug.st_had_bonus; | |
12946 } | |
12947 break; | |
12948 } | |
12949 | |
12950 if (i < 0 && ga_grow(gap, 1) == OK) | |
12951 { | |
12952 /* Add a suggestion. */ | |
12953 stp = &SUG(*gap, gap->ga_len); | |
12954 stp->st_word = vim_strnsave(goodword, goodlen); | |
12955 if (stp->st_word != NULL) | |
12956 { | |
12957 stp->st_wordlen = goodlen; | |
12958 stp->st_score = score; | |
12959 stp->st_altscore = altscore; | |
12960 stp->st_had_bonus = had_bonus; | |
12961 stp->st_orglen = badlen; | |
12962 stp->st_slang = slang; | |
12963 ++gap->ga_len; | |
12964 | |
12965 /* If we have too many suggestions now, sort the list and keep | |
12966 * the best suggestions. */ | |
12967 if (gap->ga_len > SUG_MAX_COUNT(su)) | |
12968 { | |
12969 if (maxsf) | |
12970 su->su_sfmaxscore = cleanup_suggestions(gap, | |
12971 su->su_sfmaxscore, SUG_CLEAN_COUNT(su)); | |
12972 else | |
11307 { | 12973 { |
11308 stp[i].st_score = new_sug.st_score; | 12974 i = su->su_maxscore; |
11309 stp[i].st_altscore = new_sug.st_altscore; | 12975 su->su_maxscore = cleanup_suggestions(gap, |
11310 stp[i].st_had_bonus = new_sug.st_had_bonus; | 12976 su->su_maxscore, SUG_CLEAN_COUNT(su)); |
11311 } | 12977 } |
11312 break; | 12978 } |
11313 } | 12979 } |
11314 | 12980 } |
11315 if (i < 0 && ga_grow(gap, 1) == OK) | 12981 } |
11316 { | 12982 |
11317 /* Add a suggestion. */ | 12983 /* |
11318 stp = &SUG(*gap, gap->ga_len); | 12984 * Suggestions may in fact be flagged as errors. Esp. for banned words and |
11319 stp->st_word = vim_strnsave(goodword, goodlen); | 12985 * for split words, such as "the the". Remove these from the list here. |
11320 if (stp->st_word != NULL) | 12986 */ |
11321 { | 12987 static void |
11322 stp->st_score = score; | 12988 check_suggestions(su, gap) |
11323 stp->st_altscore = altscore; | 12989 suginfo_T *su; |
11324 stp->st_had_bonus = had_bonus; | 12990 garray_T *gap; /* either su_ga or su_sga */ |
11325 stp->st_orglen = badlen; | 12991 { |
11326 stp->st_slang = slang; | 12992 suggest_T *stp; |
11327 ++gap->ga_len; | 12993 int i; |
11328 | 12994 char_u longword[MAXWLEN + 1]; |
11329 /* If we have too many suggestions now, sort the list and keep | 12995 int len; |
11330 * the best suggestions. */ | 12996 hlf_T attr; |
11331 if (gap->ga_len > SUG_MAX_COUNT(su)) | 12997 |
11332 su->su_maxscore = cleanup_suggestions(gap, su->su_maxscore, | 12998 stp = &SUG(*gap, 0); |
11333 SUG_CLEAN_COUNT(su)); | 12999 for (i = gap->ga_len - 1; i >= 0; --i) |
11334 } | 13000 { |
11335 } | 13001 /* Need to append what follows to check for "the the". */ |
11336 } | 13002 STRCPY(longword, stp[i].st_word); |
11337 } | 13003 len = stp[i].st_wordlen; |
13004 vim_strncpy(longword + len, su->su_badptr + stp[i].st_orglen, | |
13005 MAXWLEN - len); | |
13006 attr = HLF_COUNT; | |
13007 (void)spell_check(curwin, longword, &attr, NULL, FALSE); | |
13008 if (attr != HLF_COUNT) | |
13009 { | |
13010 /* Remove this entry. */ | |
13011 vim_free(stp[i].st_word); | |
13012 --gap->ga_len; | |
13013 if (i < gap->ga_len) | |
13014 mch_memmove(stp + i, stp + i + 1, | |
13015 sizeof(suggest_T) * (gap->ga_len - i)); | |
13016 } | |
13017 } | |
13018 } | |
13019 | |
11338 | 13020 |
11339 /* | 13021 /* |
11340 * Add a word to be banned. | 13022 * Add a word to be banned. |
11341 */ | 13023 */ |
11342 static void | 13024 static void |
11346 { | 13028 { |
11347 char_u *s = vim_strsave(word); | 13029 char_u *s = vim_strsave(word); |
11348 hash_T hash; | 13030 hash_T hash; |
11349 hashitem_T *hi; | 13031 hashitem_T *hi; |
11350 | 13032 |
11351 if (s != NULL) | 13033 hash = hash_hash(word); |
11352 { | 13034 hi = hash_lookup(&su->su_banned, word, hash); |
11353 hash = hash_hash(s); | 13035 if (HASHITEM_EMPTY(hi)) |
11354 hi = hash_lookup(&su->su_banned, s, hash); | 13036 { |
11355 if (HASHITEM_EMPTY(hi)) | 13037 s = vim_strsave(word); |
13038 if (s != NULL) | |
11356 hash_add_item(&su->su_banned, hi, s, hash); | 13039 hash_add_item(&su->su_banned, hi, s, hash); |
11357 else | 13040 } |
11358 vim_free(s); | |
11359 } | |
11360 } | |
11361 | |
11362 /* | |
11363 * Return TRUE if a word appears in the list of banned words. | |
11364 */ | |
11365 static int | |
11366 was_banned(su, word) | |
11367 suginfo_T *su; | |
11368 char_u *word; | |
11369 { | |
11370 hashitem_T *hi = hash_find(&su->su_banned, word); | |
11371 | |
11372 return !HASHITEM_EMPTY(hi); | |
11373 } | |
11374 | |
11375 /* | |
11376 * Free the banned words in "su". | |
11377 */ | |
11378 static void | |
11379 free_banned(su) | |
11380 suginfo_T *su; | |
11381 { | |
11382 int todo; | |
11383 hashitem_T *hi; | |
11384 | |
11385 todo = su->su_banned.ht_used; | |
11386 for (hi = su->su_banned.ht_array; todo > 0; ++hi) | |
11387 { | |
11388 if (!HASHITEM_EMPTY(hi)) | |
11389 { | |
11390 vim_free(hi->hi_key); | |
11391 --todo; | |
11392 } | |
11393 } | |
11394 hash_clear(&su->su_banned); | |
11395 } | 13041 } |
11396 | 13042 |
11397 /* | 13043 /* |
11398 * Recompute the score for all suggestions if sound-folding is possible. This | 13044 * Recompute the score for all suggestions if sound-folding is possible. This |
11399 * is slow, thus only done for the final results. | 13045 * is slow, thus only done for the final results. |
12268 | 13914 |
12269 /* adding/inserting "*" at the start (word starts with vowel) shouldn't be | 13915 /* adding/inserting "*" at the start (word starts with vowel) shouldn't be |
12270 * counted so much, vowels halfway the word aren't counted at all. */ | 13916 * counted so much, vowels halfway the word aren't counted at all. */ |
12271 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound) | 13917 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound) |
12272 { | 13918 { |
12273 score = SCORE_DEL / 2; | 13919 if (badsound[1] == goodsound[1] |
12274 if (*badsound == '*') | 13920 || (badsound[1] != NUL |
12275 ++badsound; | 13921 && goodsound[1] != NUL |
13922 && badsound[2] == goodsound[2])) | |
13923 { | |
13924 /* handle like a substitute */ | |
13925 } | |
12276 else | 13926 else |
12277 ++goodsound; | 13927 { |
13928 score = 2 * SCORE_DEL / 3; | |
13929 if (*badsound == '*') | |
13930 ++badsound; | |
13931 else | |
13932 ++goodsound; | |
13933 } | |
12278 } | 13934 } |
12279 | 13935 |
12280 goodlen = STRLEN(goodsound); | 13936 goodlen = STRLEN(goodsound); |
12281 badlen = STRLEN(badsound); | 13937 badlen = STRLEN(badsound); |
12282 | 13938 |
12468 * The implementation of the algorithm comes from Aspell editdist.cpp, | 14124 * The implementation of the algorithm comes from Aspell editdist.cpp, |
12469 * edit_distance(). It has been converted from C++ to C and modified to | 14125 * edit_distance(). It has been converted from C++ to C and modified to |
12470 * support multi-byte characters. | 14126 * support multi-byte characters. |
12471 */ | 14127 */ |
12472 static int | 14128 static int |
12473 spell_edit_score(badword, goodword) | 14129 spell_edit_score(slang, badword, goodword) |
14130 slang_T *slang; | |
12474 char_u *badword; | 14131 char_u *badword; |
12475 char_u *goodword; | 14132 char_u *goodword; |
12476 { | 14133 { |
12477 int *cnt; | 14134 int *cnt; |
12478 int badlen, goodlen; /* lenghts including NUL */ | 14135 int badlen, goodlen; /* lenghts including NUL */ |
12510 if (cnt == NULL) | 14167 if (cnt == NULL) |
12511 return 0; /* out of memory */ | 14168 return 0; /* out of memory */ |
12512 | 14169 |
12513 CNT(0, 0) = 0; | 14170 CNT(0, 0) = 0; |
12514 for (j = 1; j <= goodlen; ++j) | 14171 for (j = 1; j <= goodlen; ++j) |
12515 CNT(0, j) = CNT(0, j - 1) + SCORE_DEL; | 14172 CNT(0, j) = CNT(0, j - 1) + SCORE_INS; |
12516 | 14173 |
12517 for (i = 1; i <= badlen; ++i) | 14174 for (i = 1; i <= badlen; ++i) |
12518 { | 14175 { |
12519 CNT(i, 0) = CNT(i - 1, 0) + SCORE_INS; | 14176 CNT(i, 0) = CNT(i - 1, 0) + SCORE_DEL; |
12520 for (j = 1; j <= goodlen; ++j) | 14177 for (j = 1; j <= goodlen; ++j) |
12521 { | 14178 { |
12522 #ifdef FEAT_MBYTE | 14179 #ifdef FEAT_MBYTE |
12523 if (has_mbyte) | 14180 if (has_mbyte) |
12524 { | 14181 { |
12537 { | 14194 { |
12538 /* Use a better score when there is only a case difference. */ | 14195 /* Use a better score when there is only a case difference. */ |
12539 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) | 14196 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) |
12540 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1); | 14197 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1); |
12541 else | 14198 else |
12542 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1); | 14199 { |
14200 /* For a similar character use SCORE_SIMILAR. */ | |
14201 if (slang != NULL | |
14202 && slang->sl_has_map | |
14203 && similar_chars(slang, gc, bc)) | |
14204 CNT(i, j) = SCORE_SIMILAR + CNT(i - 1, j - 1); | |
14205 else | |
14206 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1); | |
14207 } | |
12543 | 14208 |
12544 if (i > 1 && j > 1) | 14209 if (i > 1 && j > 1) |
12545 { | 14210 { |
12546 #ifdef FEAT_MBYTE | 14211 #ifdef FEAT_MBYTE |
12547 if (has_mbyte) | 14212 if (has_mbyte) |
12575 i = CNT(badlen - 1, goodlen - 1); | 14240 i = CNT(badlen - 1, goodlen - 1); |
12576 vim_free(cnt); | 14241 vim_free(cnt); |
12577 return i; | 14242 return i; |
12578 } | 14243 } |
12579 | 14244 |
/*
 * One postponed alternative for the limited edit score computation: the
 * positions in both words where to continue and the score reached so far.
 */
typedef struct limitscore_S
{
    int         badi;       /* index in the bad word to continue at */
    int         goodi;      /* index in the good word to continue at */
    int         score;      /* score accumulated before continuing */
} limitscore_T;
14251 | |
14252 /* | |
14253 * Like spell_edit_score(), but with a limit on the score to make it faster. | |
14254 * May return SCORE_MAXMAX when the score is higher than "limit". | |
14255 * | |
14256 * This uses a stack for the edits still to be tried. | |
14257 * The idea comes from Aspell leditdist.cpp. Rewritten in C and added support | |
14258 * for multi-byte characters. | |
14259 */ | |
14260 static int | |
14261 spell_edit_score_limit(slang, badword, goodword, limit) | |
14262 slang_T *slang; | |
14263 char_u *badword; | |
14264 char_u *goodword; | |
14265 int limit; | |
14266 { | |
14267 limitscore_T stack[10]; /* allow for over 3 * 2 edits */ | |
14268 int stackidx; | |
14269 int bi, gi; | |
14270 int bi2, gi2; | |
14271 int bc, gc; | |
14272 int score; | |
14273 int score_off; | |
14274 int minscore; | |
14275 int round; | |
14276 | |
14277 #ifdef FEAT_MBYTE | |
14278 /* Multi-byte characters require a bit more work, use a different function | |
14279 * to avoid testing "has_mbyte" quite often. */ | |
14280 if (has_mbyte) | |
14281 return spell_edit_score_limit_w(slang, badword, goodword, limit); | |
14282 #endif | |
14283 | |
14284 /* | |
14285 * The idea is to go from start to end over the words. So long as | |
14286 * characters are equal just continue, this always gives the lowest score. | |
14287 * When there is a difference try several alternatives. Each alternative | |
14288 * increases "score" for the edit distance. Some of the alternatives are | |
14289 * pushed unto a stack and tried later, some are tried right away. At the | |
14290 * end of the word the score for one alternative is known. The lowest | |
14291 * possible score is stored in "minscore". | |
14292 */ | |
14293 stackidx = 0; | |
14294 bi = 0; | |
14295 gi = 0; | |
14296 score = 0; | |
14297 minscore = limit + 1; | |
14298 | |
14299 for (;;) | |
14300 { | |
14301 /* Skip over an equal part, score remains the same. */ | |
14302 for (;;) | |
14303 { | |
14304 bc = badword[bi]; | |
14305 gc = goodword[gi]; | |
14306 if (bc != gc) /* stop at a char that's different */ | |
14307 break; | |
14308 if (bc == NUL) /* both words end */ | |
14309 { | |
14310 if (score < minscore) | |
14311 minscore = score; | |
14312 goto pop; /* do next alternative */ | |
14313 } | |
14314 ++bi; | |
14315 ++gi; | |
14316 } | |
14317 | |
14318 if (gc == NUL) /* goodword ends, delete badword chars */ | |
14319 { | |
14320 do | |
14321 { | |
14322 if ((score += SCORE_DEL) >= minscore) | |
14323 goto pop; /* do next alternative */ | |
14324 } while (badword[++bi] != NUL); | |
14325 minscore = score; | |
14326 } | |
14327 else if (bc == NUL) /* badword ends, insert badword chars */ | |
14328 { | |
14329 do | |
14330 { | |
14331 if ((score += SCORE_INS) >= minscore) | |
14332 goto pop; /* do next alternative */ | |
14333 } while (goodword[++gi] != NUL); | |
14334 minscore = score; | |
14335 } | |
14336 else /* both words continue */ | |
14337 { | |
14338 /* If not close to the limit, perform a change. Only try changes | |
14339 * that may lead to a lower score than "minscore". | |
14340 * round 0: try deleting a char from badword | |
14341 * round 1: try inserting a char in badword */ | |
14342 for (round = 0; round <= 1; ++round) | |
14343 { | |
14344 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS); | |
14345 if (score_off < minscore) | |
14346 { | |
14347 if (score_off + SCORE_EDIT_MIN >= minscore) | |
14348 { | |
14349 /* Near the limit, rest of the words must match. We | |
14350 * can check that right now, no need to push an item | |
14351 * onto the stack. */ | |
14352 bi2 = bi + 1 - round; | |
14353 gi2 = gi + round; | |
14354 while (goodword[gi2] == badword[bi2]) | |
14355 { | |
14356 if (goodword[gi2] == NUL) | |
14357 { | |
14358 minscore = score_off; | |
14359 break; | |
14360 } | |
14361 ++bi2; | |
14362 ++gi2; | |
14363 } | |
14364 } | |
14365 else | |
14366 { | |
14367 /* try deleting/inserting a character later */ | |
14368 stack[stackidx].badi = bi + 1 - round; | |
14369 stack[stackidx].goodi = gi + round; | |
14370 stack[stackidx].score = score_off; | |
14371 ++stackidx; | |
14372 } | |
14373 } | |
14374 } | |
14375 | |
14376 if (score + SCORE_SWAP < minscore) | |
14377 { | |
14378 /* If swapping two characters makes a match then the | |
14379 * substitution is more expensive, thus there is no need to | |
14380 * try both. */ | |
14381 if (gc == badword[bi + 1] && bc == goodword[gi + 1]) | |
14382 { | |
14383 /* Swap two characters, that is: skip them. */ | |
14384 gi += 2; | |
14385 bi += 2; | |
14386 score += SCORE_SWAP; | |
14387 continue; | |
14388 } | |
14389 } | |
14390 | |
14391 /* Substitute one character for another which is the same | |
14392 * thing as deleting a character from both goodword and badword. | |
14393 * Use a better score when there is only a case difference. */ | |
14394 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) | |
14395 score += SCORE_ICASE; | |
14396 else | |
14397 { | |
14398 /* For a similar character use SCORE_SIMILAR. */ | |
14399 if (slang != NULL | |
14400 && slang->sl_has_map | |
14401 && similar_chars(slang, gc, bc)) | |
14402 score += SCORE_SIMILAR; | |
14403 else | |
14404 score += SCORE_SUBST; | |
14405 } | |
14406 | |
14407 if (score < minscore) | |
14408 { | |
14409 /* Do the substitution. */ | |
14410 ++gi; | |
14411 ++bi; | |
14412 continue; | |
14413 } | |
14414 } | |
14415 pop: | |
14416 /* | |
14417 * Get here to try the next alternative, pop it from the stack. | |
14418 */ | |
14419 if (stackidx == 0) /* stack is empty, finished */ | |
14420 break; | |
14421 | |
14422 /* pop an item from the stack */ | |
14423 --stackidx; | |
14424 gi = stack[stackidx].goodi; | |
14425 bi = stack[stackidx].badi; | |
14426 score = stack[stackidx].score; | |
14427 } | |
14428 | |
14429 /* When the score goes over "limit" it may actually be much higher. | |
14430 * Return a very large number to avoid going below the limit when giving a | |
14431 * bonus. */ | |
14432 if (minscore > limit) | |
14433 return SCORE_MAXMAX; | |
14434 return minscore; | |
14435 } | |
14436 | |
#ifdef FEAT_MBYTE
/*
 * Multi-byte version of spell_edit_score_limit().
 * Keep it in sync with the above!
 *
 * Both words are first converted to arrays of character values, so that the
 * rest of the function can index by character.  The arrays hold MAXWLEN
 * entries including the terminating zero; a longer word is cut off there
 * instead of overrunning the array.
 * When the stack of postponed alternatives is full an alternative is dropped,
 * which can only make the returned score higher, never lower.
 */
    static int
spell_edit_score_limit_w(slang, badword, goodword, limit)
    slang_T     *slang;
    char_u      *badword;
    char_u      *goodword;
    int         limit;
{
    limitscore_T    stack[10];          /* allow for over 3 * 2 edits */
    int             stacksize = (int)(sizeof(stack) / sizeof(stack[0]));
    int             stackidx;
    int             bi, gi;
    int             bi2, gi2;
    int             bc, gc;
    int             score;
    int             score_off;
    int             minscore;
    int             round;
    char_u          *p;
    int             wbadword[MAXWLEN];
    int             wgoodword[MAXWLEN];

    /* Get the characters from the multi-byte strings and put them in an
     * int array for easy access.  Leave room for the terminating zero. */
    bi = 0;
    for (p = badword; *p != NUL && bi < MAXWLEN - 1; )
        wbadword[bi++] = mb_cptr2char_adv(&p);
    wbadword[bi] = 0;
    gi = 0;
    for (p = goodword; *p != NUL && gi < MAXWLEN - 1; )
        wgoodword[gi++] = mb_cptr2char_adv(&p);
    wgoodword[gi] = 0;

    /*
     * The idea is to go from start to end over the words.  So long as
     * characters are equal just continue, this always gives the lowest score.
     * When there is a difference try several alternatives.  Each alternative
     * increases "score" for the edit distance.  Some of the alternatives are
     * pushed onto a stack and tried later, some are tried right away.  At the
     * end of the word the score for one alternative is known.  The lowest
     * possible score is stored in "minscore".
     */
    stackidx = 0;
    bi = 0;
    gi = 0;
    score = 0;
    minscore = limit + 1;

    for (;;)
    {
        /* Skip over an equal part, score remains the same. */
        for (;;)
        {
            bc = wbadword[bi];
            gc = wgoodword[gi];

            if (bc != gc)       /* stop at a char that's different */
                break;
            if (bc == NUL)      /* both words end */
            {
                if (score < minscore)
                    minscore = score;
                goto pop;       /* do next alternative */
            }
            ++bi;
            ++gi;
        }

        if (gc == NUL)    /* goodword ends, delete badword chars */
        {
            do
            {
                if ((score += SCORE_DEL) >= minscore)
                    goto pop;       /* do next alternative */
            } while (wbadword[++bi] != NUL);
            minscore = score;
        }
        else if (bc == NUL) /* badword ends, insert the rest of goodword */
        {
            do
            {
                if ((score += SCORE_INS) >= minscore)
                    goto pop;       /* do next alternative */
            } while (wgoodword[++gi] != NUL);
            minscore = score;
        }
        else                    /* both words continue */
        {
            /* If not close to the limit, perform a change.  Only try changes
             * that may lead to a lower score than "minscore".
             * round 0: try deleting a char from badword
             * round 1: try inserting a char in badword */
            for (round = 0; round <= 1; ++round)
            {
                score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
                if (score_off < minscore)
                {
                    if (score_off + SCORE_EDIT_MIN >= minscore)
                    {
                        /* Near the limit, rest of the words must match.  We
                         * can check that right now, no need to push an item
                         * onto the stack. */
                        bi2 = bi + 1 - round;
                        gi2 = gi + round;
                        while (wgoodword[gi2] == wbadword[bi2])
                        {
                            if (wgoodword[gi2] == NUL)
                            {
                                minscore = score_off;
                                break;
                            }
                            ++bi2;
                            ++gi2;
                        }
                    }
                    else if (stackidx < stacksize)
                    {
                        /* try deleting/inserting a character later */
                        stack[stackidx].badi = bi + 1 - round;
                        stack[stackidx].goodi = gi + round;
                        stack[stackidx].score = score_off;
                        ++stackidx;
                    }
                    /* else: the stack is full, this alternative is dropped
                     * rather than writing past the end of "stack". */
                }
            }

            if (score + SCORE_SWAP < minscore)
            {
                /* If swapping two characters makes a match then the
                 * substitution is more expensive, thus there is no need to
                 * try both. */
                if (gc == wbadword[bi + 1] && bc == wgoodword[gi + 1])
                {
                    /* Swap two characters, that is: skip them. */
                    gi += 2;
                    bi += 2;
                    score += SCORE_SWAP;
                    continue;
                }
            }

            /* Substitute one character for another which is the same
             * thing as deleting a character from both goodword and badword.
             * Use a better score when there is only a case difference. */
            if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
                score += SCORE_ICASE;
            else
            {
                /* For a similar character use SCORE_SIMILAR. */
                if (slang != NULL
                        && slang->sl_has_map
                        && similar_chars(slang, gc, bc))
                    score += SCORE_SIMILAR;
                else
                    score += SCORE_SUBST;
            }

            if (score < minscore)
            {
                /* Do the substitution. */
                ++gi;
                ++bi;
                continue;
            }
        }
pop:
        /*
         * Get here to try the next alternative, pop it from the stack.
         */
        if (stackidx == 0)              /* stack is empty, finished */
            break;

        /* pop an item from the stack */
        --stackidx;
        gi = stack[stackidx].goodi;
        bi = stack[stackidx].badi;
        score = stack[stackidx].score;
    }

    /* When the score goes over "limit" it may actually be much higher.
     * Return a very large number to avoid going below the limit when giving a
     * bonus. */
    if (minscore > limit)
        return SCORE_MAXMAX;
    return minscore;
}
#endif
14627 | |
14628 #define DUMPFLAG_KEEPCASE 1 /* round 2: keep-case tree */ | |
14629 #define DUMPFLAG_COUNT 2 /* include word count */ | |
14630 | |
12580 /* | 14631 /* |
12581 * ":spelldump" | 14632 * ":spelldump" |
12582 */ | 14633 */ |
12583 /*ARGSUSED*/ | 14634 /*ARGSUSED*/ |
12584 void | 14635 void |
12601 int flags; | 14652 int flags; |
12602 char_u *region_names = NULL; /* region names being used */ | 14653 char_u *region_names = NULL; /* region names being used */ |
12603 int do_region = TRUE; /* dump region names and numbers */ | 14654 int do_region = TRUE; /* dump region names and numbers */ |
12604 char_u *p; | 14655 char_u *p; |
12605 int lpi; | 14656 int lpi; |
14657 int dumpflags; | |
12606 | 14658 |
12607 if (no_spell_checking(curwin)) | 14659 if (no_spell_checking(curwin)) |
12608 return; | 14660 return; |
12609 | 14661 |
12610 /* Create a new empty buffer by splitting the window. */ | 14662 /* Create a new empty buffer by splitting the window. */ |
12655 * round 2: keep-case tree */ | 14707 * round 2: keep-case tree */ |
12656 for (round = 1; round <= 2; ++round) | 14708 for (round = 1; round <= 2; ++round) |
12657 { | 14709 { |
12658 if (round == 1) | 14710 if (round == 1) |
12659 { | 14711 { |
14712 dumpflags = 0; | |
12660 byts = slang->sl_fbyts; | 14713 byts = slang->sl_fbyts; |
12661 idxs = slang->sl_fidxs; | 14714 idxs = slang->sl_fidxs; |
12662 } | 14715 } |
12663 else | 14716 else |
12664 { | 14717 { |
14718 dumpflags = DUMPFLAG_KEEPCASE; | |
12665 byts = slang->sl_kbyts; | 14719 byts = slang->sl_kbyts; |
12666 idxs = slang->sl_kidxs; | 14720 idxs = slang->sl_kidxs; |
12667 } | 14721 } |
12668 if (byts == NULL) | 14722 if (byts == NULL) |
12669 continue; /* array is empty */ | 14723 continue; /* array is empty */ |
14724 | |
14725 if (eap->forceit) | |
14726 dumpflags |= DUMPFLAG_COUNT; | |
12670 | 14727 |
12671 depth = 0; | 14728 depth = 0; |
12672 arridx[0] = 0; | 14729 arridx[0] = 0; |
12673 curi[0] = 1; | 14730 curi[0] = 1; |
12674 while (depth >= 0 && !got_int) | 14731 while (depth >= 0 && !got_int) |
12705 | 14762 |
12706 /* Dump the basic word if there is no prefix or | 14763 /* Dump the basic word if there is no prefix or |
12707 * when it's the first one. */ | 14764 * when it's the first one. */ |
12708 c = (unsigned)flags >> 24; | 14765 c = (unsigned)flags >> 24; |
12709 if (c == 0 || curi[depth] == 2) | 14766 if (c == 0 || curi[depth] == 2) |
12710 dump_word(word, round, flags, lnum++); | 14767 dump_word(slang, word, dumpflags, |
14768 flags, lnum++); | |
12711 | 14769 |
12712 /* Apply the prefix, if there is one. */ | 14770 /* Apply the prefix, if there is one. */ |
12713 if (c != 0) | 14771 if (c != 0) |
12714 lnum = dump_prefixes(slang, word, round, | 14772 lnum = dump_prefixes(slang, word, dumpflags, |
12715 flags, lnum); | 14773 flags, lnum); |
12716 } | 14774 } |
12717 } | 14775 } |
12718 else | 14776 else |
12719 { | 14777 { |
12736 | 14794 |
12737 /* | 14795 /* |
12738 * Dump one word: apply case modifications and append a line to the buffer. | 14796 * Dump one word: apply case modifications and append a line to the buffer. |
12739 */ | 14797 */ |
12740 static void | 14798 static void |
12741 dump_word(word, round, flags, lnum) | 14799 dump_word(slang, word, dumpflags, flags, lnum) |
14800 slang_T *slang; | |
12742 char_u *word; | 14801 char_u *word; |
12743 int round; | 14802 int dumpflags; |
12744 int flags; | 14803 int flags; |
12745 linenr_T lnum; | 14804 linenr_T lnum; |
12746 { | 14805 { |
12747 int keepcap = FALSE; | 14806 int keepcap = FALSE; |
12748 char_u *p; | 14807 char_u *p; |
14808 char_u *tw; | |
12749 char_u cword[MAXWLEN]; | 14809 char_u cword[MAXWLEN]; |
12750 char_u badword[MAXWLEN + 10]; | 14810 char_u badword[MAXWLEN + 10]; |
12751 int i; | 14811 int i; |
12752 | 14812 |
12753 if (round == 1 && (flags & WF_CAPMASK) != 0) | 14813 if ((dumpflags & DUMPFLAG_KEEPCASE) == 0 && (flags & WF_CAPMASK) != 0) |
12754 { | 14814 { |
12755 /* Need to fix case according to "flags". */ | 14815 /* Need to fix case according to "flags". */ |
12756 make_case_word(word, cword, flags); | 14816 make_case_word(word, cword, flags); |
12757 p = cword; | 14817 p = cword; |
12758 } | 14818 } |
12759 else | 14819 else |
12760 { | 14820 { |
12761 p = word; | 14821 p = word; |
12762 if (round == 2 && ((captype(word, NULL) & WF_KEEPCAP) == 0 | 14822 if ((dumpflags & DUMPFLAG_KEEPCASE) |
14823 && ((captype(word, NULL) & WF_KEEPCAP) == 0 | |
12763 || (flags & WF_FIXCAP) != 0)) | 14824 || (flags & WF_FIXCAP) != 0)) |
12764 keepcap = TRUE; | 14825 keepcap = TRUE; |
12765 } | 14826 } |
14827 tw = p; | |
12766 | 14828 |
12767 /* Add flags and regions after a slash. */ | 14829 /* Add flags and regions after a slash. */ |
12768 if ((flags & (WF_BANNED | WF_RARE | WF_REGION)) || keepcap) | 14830 if ((flags & (WF_BANNED | WF_RARE | WF_REGION)) || keepcap) |
12769 { | 14831 { |
12770 STRCPY(badword, p); | 14832 STRCPY(badword, p); |
12780 if (flags & (0x10000 << i)) | 14842 if (flags & (0x10000 << i)) |
12781 sprintf((char *)badword + STRLEN(badword), "%d", i + 1); | 14843 sprintf((char *)badword + STRLEN(badword), "%d", i + 1); |
12782 p = badword; | 14844 p = badword; |
12783 } | 14845 } |
12784 | 14846 |
14847 if (dumpflags & DUMPFLAG_COUNT) | |
14848 { | |
14849 hashitem_T *hi; | |
14850 | |
14851 /* Include the word count for ":spelldump!". */ | |
14852 hi = hash_find(&slang->sl_wordcount, tw); | |
14853 if (!HASHITEM_EMPTY(hi)) | |
14854 { | |
14855 vim_snprintf((char *)IObuff, IOSIZE, "%s\t%d", | |
14856 tw, HI2WC(hi)->wc_count); | |
14857 p = IObuff; | |
14858 } | |
14859 } | |
14860 | |
12785 ml_append(lnum, p, (colnr_T)0, FALSE); | 14861 ml_append(lnum, p, (colnr_T)0, FALSE); |
12786 } | 14862 } |
12787 | 14863 |
12788 /* | 14864 /* |
12789 * For ":spelldump": Find matching prefixes for "word". Prepend each to | 14865 * For ":spelldump": Find matching prefixes for "word". Prepend each to |
12790 * "word" and append a line to the buffer. | 14866 * "word" and append a line to the buffer. |
12791 * Return the updated line number. | 14867 * Return the updated line number. |
12792 */ | 14868 */ |
12793 static linenr_T | 14869 static linenr_T |
12794 dump_prefixes(slang, word, round, flags, startlnum) | 14870 dump_prefixes(slang, word, dumpflags, flags, startlnum) |
12795 slang_T *slang; | 14871 slang_T *slang; |
12796 char_u *word; /* case-folded word */ | 14872 char_u *word; /* case-folded word */ |
12797 int round; | 14873 int dumpflags; |
12798 int flags; /* flags with prefix ID */ | 14874 int flags; /* flags with prefix ID */ |
12799 linenr_T startlnum; | 14875 linenr_T startlnum; |
12800 { | 14876 { |
12801 idx_T arridx[MAXWLEN]; | 14877 idx_T arridx[MAXWLEN]; |
12802 int curi[MAXWLEN]; | 14878 int curi[MAXWLEN]; |
12858 | 14934 |
12859 c = valid_word_prefix(i, n, flags, word, slang, FALSE); | 14935 c = valid_word_prefix(i, n, flags, word, slang, FALSE); |
12860 if (c != 0) | 14936 if (c != 0) |
12861 { | 14937 { |
12862 vim_strncpy(prefix + depth, word, MAXWLEN - depth - 1); | 14938 vim_strncpy(prefix + depth, word, MAXWLEN - depth - 1); |
12863 dump_word(prefix, round, | 14939 dump_word(slang, prefix, dumpflags, |
12864 (c & WF_RAREPFX) ? (flags | WF_RARE) | 14940 (c & WF_RAREPFX) ? (flags | WF_RARE) |
12865 : flags, lnum++); | 14941 : flags, lnum++); |
12866 } | 14942 } |
12867 | 14943 |
12868 /* Check for prefix that matches the word when the | 14944 /* Check for prefix that matches the word when the |
12874 TRUE); | 14950 TRUE); |
12875 if (c != 0) | 14951 if (c != 0) |
12876 { | 14952 { |
12877 vim_strncpy(prefix + depth, word_up, | 14953 vim_strncpy(prefix + depth, word_up, |
12878 MAXWLEN - depth - 1); | 14954 MAXWLEN - depth - 1); |
12879 dump_word(prefix, round, | 14955 dump_word(slang, prefix, dumpflags, |
12880 (c & WF_RAREPFX) ? (flags | WF_RARE) | 14956 (c & WF_RAREPFX) ? (flags | WF_RARE) |
12881 : flags, lnum++); | 14957 : flags, lnum++); |
12882 } | 14958 } |
12883 } | 14959 } |
12884 } | 14960 } |
12979 char_u *pat; | 15055 char_u *pat; |
12980 char_u ***matchp; | 15056 char_u ***matchp; |
12981 { | 15057 { |
12982 garray_T ga; | 15058 garray_T ga; |
12983 | 15059 |
12984 spell_suggest_list(&ga, pat, 100, spell_expand_need_cap); | 15060 spell_suggest_list(&ga, pat, 100, spell_expand_need_cap, TRUE); |
12985 *matchp = ga.ga_data; | 15061 *matchp = ga.ga_data; |
12986 return ga.ga_len; | 15062 return ga.ga_len; |
12987 } | 15063 } |
12988 #endif | 15064 #endif |
12989 | 15065 |