comparison src/spellfile.c @ 20681:a9f2cd2933ef v8.2.0894

patch 8.2.0894: :mkspell can take very long if the word count is high Commit: https://github.com/vim/vim/commit/59f88fbf24b21dbae114a79a15695fa2c3a09fca Author: Bram Moolenaar <Bram@vim.org> Date: Wed Jun 3 20:51:11 2020 +0200 patch 8.2.0894: :mkspell can take very long if the word count is high Problem: :mkspell can take very long if the word count is high. Solution: Use long to avoid negative numbers. Increase the limits by 20% if the compression did not have effect.
author Bram Moolenaar <Bram@vim.org>
date Wed, 03 Jun 2020 21:00:04 +0200
parents aadd1cae2ff5
children 3a9dcfe62691
comparison
equal deleted inserted replaced
20680:85da25f6882a 20681:a9f2cd2933ef
1993 static int store_word(spellinfo_T *spin, char_u *word, int flags, int region, char_u *pfxlist, int need_affix); 1993 static int store_word(spellinfo_T *spin, char_u *word, int flags, int region, char_u *pfxlist, int need_affix);
1994 static int tree_add_word(spellinfo_T *spin, char_u *word, wordnode_T *tree, int flags, int region, int affixID); 1994 static int tree_add_word(spellinfo_T *spin, char_u *word, wordnode_T *tree, int flags, int region, int affixID);
1995 static wordnode_T *get_wordnode(spellinfo_T *spin); 1995 static wordnode_T *get_wordnode(spellinfo_T *spin);
1996 static void free_wordnode(spellinfo_T *spin, wordnode_T *n); 1996 static void free_wordnode(spellinfo_T *spin, wordnode_T *n);
1997 static void wordtree_compress(spellinfo_T *spin, wordnode_T *root); 1997 static void wordtree_compress(spellinfo_T *spin, wordnode_T *root);
1998 static int node_compress(spellinfo_T *spin, wordnode_T *node, hashtab_T *ht, int *tot); 1998 static long node_compress(spellinfo_T *spin, wordnode_T *node, hashtab_T *ht, long *tot);
1999 static int node_equal(wordnode_T *n1, wordnode_T *n2); 1999 static int node_equal(wordnode_T *n1, wordnode_T *n2);
2000 static void clear_node(wordnode_T *node); 2000 static void clear_node(wordnode_T *node);
2001 static int put_node(FILE *fd, wordnode_T *node, int idx, int regionmask, int prefixtree); 2001 static int put_node(FILE *fd, wordnode_T *node, int idx, int regionmask, int prefixtree);
2002 static int sug_filltree(spellinfo_T *spin, slang_T *slang); 2002 static int sug_filltree(spellinfo_T *spin, slang_T *slang);
2003 static int sug_maketable(spellinfo_T *spin); 2003 static int sug_maketable(spellinfo_T *spin);
2017 #define CONDIT_CFIX 2 // affix must have CIRCUMFIX flag 2017 #define CONDIT_CFIX 2 // affix must have CIRCUMFIX flag
2018 #define CONDIT_SUF 4 // add a suffix for matching flags 2018 #define CONDIT_SUF 4 // add a suffix for matching flags
2019 #define CONDIT_AFF 8 // word already has an affix 2019 #define CONDIT_AFF 8 // word already has an affix
2020 2020
2021 /* 2021 /*
2022 * Tunable parameters for when the tree is compressed. See 'mkspellmem'. 2022 * Tunable parameters for when the tree is compressed. Filled from the
2023 * 'mkspellmem' option.
2023 */ 2024 */
2024 static long compress_start = 30000; // memory / SBLOCKSIZE 2025 static long compress_start = 30000; // memory / SBLOCKSIZE
2025 static long compress_inc = 100; // memory / SBLOCKSIZE 2026 static long compress_inc = 100; // memory / SBLOCKSIZE
2026 static long compress_added = 500000; // word count 2027 static long compress_added = 500000; // word count
2028
2029 // Actually used values. These can change if compression doesn't result in
2030 // reducing the size.
2031 static long used_compress_inc;
2032 static long used_compress_added;
2027 2033
2028 /* 2034 /*
2029 * Check the 'mkspellmem' option. Return FAIL if it's wrong. 2035 * Check the 'mkspellmem' option. Return FAIL if it's wrong.
2030 * Sets "sps_flags". 2036 * Sets "sps_flags".
2031 */ 2037 */
4532 4538
4533 if (spin->si_compress_cnt > 1) 4539 if (spin->si_compress_cnt > 1)
4534 { 4540 {
4535 if (--spin->si_compress_cnt == 1) 4541 if (--spin->si_compress_cnt == 1)
4536 // Did enough words to lower the block count limit. 4542 // Did enough words to lower the block count limit.
4537 spin->si_blocks_cnt += compress_inc; 4543 spin->si_blocks_cnt += used_compress_inc;
4538 } 4544 }
4539 4545
4540 /* 4546 /*
4541 * When we have allocated lots of memory we need to compress the word tree 4547 * When we have allocated lots of memory we need to compress the word tree
4542 * to free up some room. But compression is slow, and we might actually 4548 * to free up some room. But compression is slow, and we might actually
4543 * need that room, thus only compress in the following situations: 4549 * need that room, thus only compress in the following situations:
4544 * 1. When not compressed before (si_compress_cnt == 0): when using 4550 * 1. When not compressed before (si_compress_cnt == 0): when using
4545 * "compress_start" blocks. 4551 * "compress_start" blocks.
4546 * 2. When compressed before and used "compress_inc" blocks before 4552 * 2. When compressed before and used "used_compress_inc" blocks before
4547 * adding "compress_added" words (si_compress_cnt > 1). 4553 * adding "used_compress_added" words (si_compress_cnt > 1).
4548 * 3. When compressed before, added "compress_added" words 4554 * 3. When compressed before, added "used_compress_added" words
4549 * (si_compress_cnt == 1) and the number of free nodes drops below the 4555 * (si_compress_cnt == 1) and the number of free nodes drops below the
4550 * maximum word length. 4556 * maximum word length.
4551 */ 4557 */
4552 #ifndef SPELL_COMPRESS_ALLWAYS 4558 #ifndef SPELL_COMPRESS_ALLWAYS
4553 if (spin->si_compress_cnt == 1 4559 if (spin->si_compress_cnt == 1
4554 ? spin->si_free_count < MAXWLEN 4560 ? spin->si_free_count < MAXWLEN
4555 : spin->si_blocks_cnt >= compress_start) 4561 : spin->si_blocks_cnt >= compress_start)
4556 #endif 4562 #endif
4557 { 4563 {
4558 // Decrement the block counter. The effect is that we compress again 4564 // Decrement the block counter. The effect is that we compress again
4559 // when the freed up room has been used and another "compress_inc" 4565 // when the freed up room has been used and another "used_compress_inc"
4560 // blocks have been allocated. Unless "compress_added" words have 4566 // blocks have been allocated. Unless "used_compress_added" words have
4561 // been added, then the limit is put back again. 4567 // been added, then the limit is put back again.
4562 spin->si_blocks_cnt -= compress_inc; 4568 spin->si_blocks_cnt -= used_compress_inc;
4563 spin->si_compress_cnt = compress_added; 4569 spin->si_compress_cnt = used_compress_added;
4564 4570
4565 if (spin->si_verbose) 4571 if (spin->si_verbose)
4566 { 4572 {
4567 msg_start(); 4573 msg_start();
4568 msg_puts(_(msg_compressing)); 4574 msg_puts(_(msg_compressing));
4653 */ 4659 */
4654 static void 4660 static void
4655 wordtree_compress(spellinfo_T *spin, wordnode_T *root) 4661 wordtree_compress(spellinfo_T *spin, wordnode_T *root)
4656 { 4662 {
4657 hashtab_T ht; 4663 hashtab_T ht;
4658 int n; 4664 long n;
4659 int tot = 0; 4665 long tot = 0;
4660 int perc; 4666 long perc;
4661 4667
4662 // Skip the root itself, it's not actually used. The first sibling is the 4668 // Skip the root itself, it's not actually used. The first sibling is the
4663 // start of the tree. 4669 // start of the tree.
4664 if (root->wn_sibling != NULL) 4670 if (root->wn_sibling != NULL)
4665 { 4671 {
4666 hash_init(&ht); 4672 hash_init(&ht);
4667 n = node_compress(spin, root->wn_sibling, &ht, &tot); 4673 n = node_compress(spin, root->wn_sibling, &ht, &tot);
4674
4675 if (tot == 0)
4676 {
4677 // Compression did not have effect. Increase the limits by 20% to
4678 // avoid wasting time on compression, memory will be used anyway.
4679 used_compress_inc += used_compress_inc / 5;
4680 used_compress_added += used_compress_added / 5;
4681 }
4668 4682
4669 #ifndef SPELL_PRINTTREE 4683 #ifndef SPELL_PRINTTREE
4670 if (spin->si_verbose || p_verbose > 2) 4684 if (spin->si_verbose || p_verbose > 2)
4671 #endif 4685 #endif
4672 { 4686 {
4675 else if (tot == 0) 4689 else if (tot == 0)
4676 perc = 0; 4690 perc = 0;
4677 else 4691 else
4678 perc = (tot - n) * 100 / tot; 4692 perc = (tot - n) * 100 / tot;
4679 vim_snprintf((char *)IObuff, IOSIZE, 4693 vim_snprintf((char *)IObuff, IOSIZE,
4680 _("Compressed %d of %d nodes; %d (%d%%) remaining"), 4694 _("Compressed %ld of %ld nodes; %ld (%ld%%) remaining"),
4681 n, tot, tot - n, perc); 4695 n, tot, tot - n, perc);
4682 spell_message(spin, IObuff); 4696 spell_message(spin, IObuff);
4683 } 4697 }
4684 #ifdef SPELL_PRINTTREE 4698 #ifdef SPELL_PRINTTREE
4685 spell_print_tree(root->wn_sibling); 4699 spell_print_tree(root->wn_sibling);
4690 4704
4691 /* 4705 /*
4692 * Compress a node, its siblings and its children, depth first. 4706 * Compress a node, its siblings and its children, depth first.
4693 * Returns the number of compressed nodes. 4707 * Returns the number of compressed nodes.
4694 */ 4708 */
4695 static int 4709 static long
4696 node_compress( 4710 node_compress(
4697 spellinfo_T *spin, 4711 spellinfo_T *spin,
4698 wordnode_T *node, 4712 wordnode_T *node,
4699 hashtab_T *ht, 4713 hashtab_T *ht,
4700 int *tot) // total count of nodes before compressing, 4714 long *tot) // total count of nodes before compressing,
4701 // incremented while going through the tree 4715 // incremented while going through the tree
4702 { 4716 {
4703 wordnode_T *np; 4717 wordnode_T *np;
4704 wordnode_T *tp; 4718 wordnode_T *tp;
4705 wordnode_T *child; 4719 wordnode_T *child;
4706 hash_T hash; 4720 hash_T hash;
4707 hashitem_T *hi; 4721 hashitem_T *hi;
4708 int len = 0; 4722 long len = 0;
4709 unsigned nr, n; 4723 unsigned nr, n;
4710 int compressed = 0; 4724 long compressed = 0;
4711 4725
4712 /* 4726 /*
4713 * Go through the list of siblings. Compress each child and then try 4727 * Go through the list of siblings. Compress each child and then try
4714 * finding an identical child to replace it. 4728 * finding an identical child to replace it.
4715 * Note that with "child" we mean not just the node that is pointed to, 4729 * Note that with "child" we mean not just the node that is pointed to,
5897 ga_init2(&spin.si_map, (int)sizeof(char_u), 100); 5911 ga_init2(&spin.si_map, (int)sizeof(char_u), 100);
5898 ga_init2(&spin.si_comppat, (int)sizeof(char_u *), 20); 5912 ga_init2(&spin.si_comppat, (int)sizeof(char_u *), 20);
5899 ga_init2(&spin.si_prefcond, (int)sizeof(char_u *), 50); 5913 ga_init2(&spin.si_prefcond, (int)sizeof(char_u *), 50);
5900 hash_init(&spin.si_commonwords); 5914 hash_init(&spin.si_commonwords);
5901 spin.si_newcompID = 127; // start compound ID at first maximum 5915 spin.si_newcompID = 127; // start compound ID at first maximum
5916 used_compress_inc = compress_inc;
5917 used_compress_added = compress_added;
5902 5918
5903 // default: fnames[0] is output file, following are input files 5919 // default: fnames[0] is output file, following are input files
5904 innames = &fnames[1]; 5920 innames = &fnames[1];
5905 incount = fcount - 1; 5921 incount = fcount - 1;
5906 5922