Mercurial > vim
comparison src/spellfile.c @ 20681:a9f2cd2933ef v8.2.0894
patch 8.2.0894: :mkspell can take very long if the word count is high
Commit: https://github.com/vim/vim/commit/59f88fbf24b21dbae114a79a15695fa2c3a09fca
Author: Bram Moolenaar <Bram@vim.org>
Date: Wed Jun 3 20:51:11 2020 +0200
patch 8.2.0894: :mkspell can take very long if the word count is high
Problem: :mkspell can take very long if the word count is high.
Solution: Use long to avoid negative numbers. Increase the limits by 20% if
the compression did not have effect.
author | Bram Moolenaar <Bram@vim.org> |
---|---|
date | Wed, 03 Jun 2020 21:00:04 +0200 |
parents | aadd1cae2ff5 |
children | 3a9dcfe62691 |
comparison
equal
deleted
inserted
replaced
20680:85da25f6882a | 20681:a9f2cd2933ef |
---|---|
1993 static int store_word(spellinfo_T *spin, char_u *word, int flags, int region, char_u *pfxlist, int need_affix); | 1993 static int store_word(spellinfo_T *spin, char_u *word, int flags, int region, char_u *pfxlist, int need_affix); |
1994 static int tree_add_word(spellinfo_T *spin, char_u *word, wordnode_T *tree, int flags, int region, int affixID); | 1994 static int tree_add_word(spellinfo_T *spin, char_u *word, wordnode_T *tree, int flags, int region, int affixID); |
1995 static wordnode_T *get_wordnode(spellinfo_T *spin); | 1995 static wordnode_T *get_wordnode(spellinfo_T *spin); |
1996 static void free_wordnode(spellinfo_T *spin, wordnode_T *n); | 1996 static void free_wordnode(spellinfo_T *spin, wordnode_T *n); |
1997 static void wordtree_compress(spellinfo_T *spin, wordnode_T *root); | 1997 static void wordtree_compress(spellinfo_T *spin, wordnode_T *root); |
1998 static int node_compress(spellinfo_T *spin, wordnode_T *node, hashtab_T *ht, int *tot); | 1998 static long node_compress(spellinfo_T *spin, wordnode_T *node, hashtab_T *ht, long *tot); |
1999 static int node_equal(wordnode_T *n1, wordnode_T *n2); | 1999 static int node_equal(wordnode_T *n1, wordnode_T *n2); |
2000 static void clear_node(wordnode_T *node); | 2000 static void clear_node(wordnode_T *node); |
2001 static int put_node(FILE *fd, wordnode_T *node, int idx, int regionmask, int prefixtree); | 2001 static int put_node(FILE *fd, wordnode_T *node, int idx, int regionmask, int prefixtree); |
2002 static int sug_filltree(spellinfo_T *spin, slang_T *slang); | 2002 static int sug_filltree(spellinfo_T *spin, slang_T *slang); |
2003 static int sug_maketable(spellinfo_T *spin); | 2003 static int sug_maketable(spellinfo_T *spin); |
2017 #define CONDIT_CFIX 2 // affix must have CIRCUMFIX flag | 2017 #define CONDIT_CFIX 2 // affix must have CIRCUMFIX flag |
2018 #define CONDIT_SUF 4 // add a suffix for matching flags | 2018 #define CONDIT_SUF 4 // add a suffix for matching flags |
2019 #define CONDIT_AFF 8 // word already has an affix | 2019 #define CONDIT_AFF 8 // word already has an affix |
2020 | 2020 |
2021 /* | 2021 /* |
2022 * Tunable parameters for when the tree is compressed. See 'mkspellmem'. | 2022 * Tunable parameters for when the tree is compressed. Filled from the |
2023 * 'mkspellmem' option. | |
2023 */ | 2024 */ |
2024 static long compress_start = 30000; // memory / SBLOCKSIZE | 2025 static long compress_start = 30000; // memory / SBLOCKSIZE |
2025 static long compress_inc = 100; // memory / SBLOCKSIZE | 2026 static long compress_inc = 100; // memory / SBLOCKSIZE |
2026 static long compress_added = 500000; // word count | 2027 static long compress_added = 500000; // word count |
2028 | |
2029 // Actually used values. These can change if compression doesn't result in | |
2030 // reducing the size. | |
2031 static long used_compress_inc; | |
2032 static long used_compress_added; | |
2027 | 2033 |
2028 /* | 2034 /* |
2029 * Check the 'mkspellmem' option. Return FAIL if it's wrong. | 2035 * Check the 'mkspellmem' option. Return FAIL if it's wrong. |
2030 * Sets "sps_flags". | 2036 * Sets "sps_flags". |
2031 */ | 2037 */ |
4532 | 4538 |
4533 if (spin->si_compress_cnt > 1) | 4539 if (spin->si_compress_cnt > 1) |
4534 { | 4540 { |
4535 if (--spin->si_compress_cnt == 1) | 4541 if (--spin->si_compress_cnt == 1) |
4536 // Did enough words to lower the block count limit. | 4542 // Did enough words to lower the block count limit. |
4537 spin->si_blocks_cnt += compress_inc; | 4543 spin->si_blocks_cnt += used_compress_inc; |
4538 } | 4544 } |
4539 | 4545 |
4540 /* | 4546 /* |
4541 * When we have allocated lots of memory we need to compress the word tree | 4547 * When we have allocated lots of memory we need to compress the word tree |
4542 * to free up some room. But compression is slow, and we might actually | 4548 * to free up some room. But compression is slow, and we might actually |
4543 * need that room, thus only compress in the following situations: | 4549 * need that room, thus only compress in the following situations: |
4544 * 1. When not compressed before (si_compress_cnt == 0): when using | 4550 * 1. When not compressed before (si_compress_cnt == 0): when using |
4545 * "compress_start" blocks. | 4551 * "compress_start" blocks. |
4546 * 2. When compressed before and used "compress_inc" blocks before | 4552 * 2. When compressed before and used "used_compress_inc" blocks before |
4547 * adding "compress_added" words (si_compress_cnt > 1). | 4553 * adding "used_compress_added" words (si_compress_cnt > 1). |
4548 * 3. When compressed before, added "compress_added" words | 4554 * 3. When compressed before, added "used_compress_added" words |
4549 * (si_compress_cnt == 1) and the number of free nodes drops below the | 4555 * (si_compress_cnt == 1) and the number of free nodes drops below the |
4550 * maximum word length. | 4556 * maximum word length. |
4551 */ | 4557 */ |
4552 #ifndef SPELL_COMPRESS_ALLWAYS | 4558 #ifndef SPELL_COMPRESS_ALLWAYS |
4553 if (spin->si_compress_cnt == 1 | 4559 if (spin->si_compress_cnt == 1 |
4554 ? spin->si_free_count < MAXWLEN | 4560 ? spin->si_free_count < MAXWLEN |
4555 : spin->si_blocks_cnt >= compress_start) | 4561 : spin->si_blocks_cnt >= compress_start) |
4556 #endif | 4562 #endif |
4557 { | 4563 { |
4558 // Decrement the block counter. The effect is that we compress again | 4564 // Decrement the block counter. The effect is that we compress again |
4559 // when the freed up room has been used and another "compress_inc" | 4565 // when the freed up room has been used and another "used_compress_inc" |
4560 // blocks have been allocated. Unless "compress_added" words have | 4566 // blocks have been allocated. Unless "used_compress_added" words have |
4561 // been added, then the limit is put back again. | 4567 // been added, then the limit is put back again. |
4562 spin->si_blocks_cnt -= compress_inc; | 4568 spin->si_blocks_cnt -= used_compress_inc; |
4563 spin->si_compress_cnt = compress_added; | 4569 spin->si_compress_cnt = used_compress_added; |
4564 | 4570 |
4565 if (spin->si_verbose) | 4571 if (spin->si_verbose) |
4566 { | 4572 { |
4567 msg_start(); | 4573 msg_start(); |
4568 msg_puts(_(msg_compressing)); | 4574 msg_puts(_(msg_compressing)); |
4653 */ | 4659 */ |
4654 static void | 4660 static void |
4655 wordtree_compress(spellinfo_T *spin, wordnode_T *root) | 4661 wordtree_compress(spellinfo_T *spin, wordnode_T *root) |
4656 { | 4662 { |
4657 hashtab_T ht; | 4663 hashtab_T ht; |
4658 int n; | 4664 long n; |
4659 int tot = 0; | 4665 long tot = 0; |
4660 int perc; | 4666 long perc; |
4661 | 4667 |
4662 // Skip the root itself, it's not actually used. The first sibling is the | 4668 // Skip the root itself, it's not actually used. The first sibling is the |
4663 // start of the tree. | 4669 // start of the tree. |
4664 if (root->wn_sibling != NULL) | 4670 if (root->wn_sibling != NULL) |
4665 { | 4671 { |
4666 hash_init(&ht); | 4672 hash_init(&ht); |
4667 n = node_compress(spin, root->wn_sibling, &ht, &tot); | 4673 n = node_compress(spin, root->wn_sibling, &ht, &tot); |
4674 | |
4675 if (tot == 0) | |
4676 { | |
4677 // Compression did not have effect. Increase the limits by 20% to | |
4678 // avoid wasting time on compression, memory will be used anyway. | |
4679 used_compress_inc += used_compress_inc / 5; | |
4680 used_compress_added += used_compress_added / 5; | |
4681 } | |
4668 | 4682 |
4669 #ifndef SPELL_PRINTTREE | 4683 #ifndef SPELL_PRINTTREE |
4670 if (spin->si_verbose || p_verbose > 2) | 4684 if (spin->si_verbose || p_verbose > 2) |
4671 #endif | 4685 #endif |
4672 { | 4686 { |
4675 else if (tot == 0) | 4689 else if (tot == 0) |
4676 perc = 0; | 4690 perc = 0; |
4677 else | 4691 else |
4678 perc = (tot - n) * 100 / tot; | 4692 perc = (tot - n) * 100 / tot; |
4679 vim_snprintf((char *)IObuff, IOSIZE, | 4693 vim_snprintf((char *)IObuff, IOSIZE, |
4680 _("Compressed %d of %d nodes; %d (%d%%) remaining"), | 4694 _("Compressed %ld of %ld nodes; %ld (%ld%%) remaining"), |
4681 n, tot, tot - n, perc); | 4695 n, tot, tot - n, perc); |
4682 spell_message(spin, IObuff); | 4696 spell_message(spin, IObuff); |
4683 } | 4697 } |
4684 #ifdef SPELL_PRINTTREE | 4698 #ifdef SPELL_PRINTTREE |
4685 spell_print_tree(root->wn_sibling); | 4699 spell_print_tree(root->wn_sibling); |
4690 | 4704 |
4691 /* | 4705 /* |
4692 * Compress a node, its siblings and its children, depth first. | 4706 * Compress a node, its siblings and its children, depth first. |
4693 * Returns the number of compressed nodes. | 4707 * Returns the number of compressed nodes. |
4694 */ | 4708 */ |
4695 static int | 4709 static long |
4696 node_compress( | 4710 node_compress( |
4697 spellinfo_T *spin, | 4711 spellinfo_T *spin, |
4698 wordnode_T *node, | 4712 wordnode_T *node, |
4699 hashtab_T *ht, | 4713 hashtab_T *ht, |
4700 int *tot) // total count of nodes before compressing, | 4714 long *tot) // total count of nodes before compressing, |
4701 // incremented while going through the tree | 4715 // incremented while going through the tree |
4702 { | 4716 { |
4703 wordnode_T *np; | 4717 wordnode_T *np; |
4704 wordnode_T *tp; | 4718 wordnode_T *tp; |
4705 wordnode_T *child; | 4719 wordnode_T *child; |
4706 hash_T hash; | 4720 hash_T hash; |
4707 hashitem_T *hi; | 4721 hashitem_T *hi; |
4708 int len = 0; | 4722 long len = 0; |
4709 unsigned nr, n; | 4723 unsigned nr, n; |
4710 int compressed = 0; | 4724 long compressed = 0; |
4711 | 4725 |
4712 /* | 4726 /* |
4713 * Go through the list of siblings. Compress each child and then try | 4727 * Go through the list of siblings. Compress each child and then try |
4714 * finding an identical child to replace it. | 4728 * finding an identical child to replace it. |
4715 * Note that with "child" we mean not just the node that is pointed to, | 4729 * Note that with "child" we mean not just the node that is pointed to, |
5897 ga_init2(&spin.si_map, (int)sizeof(char_u), 100); | 5911 ga_init2(&spin.si_map, (int)sizeof(char_u), 100); |
5898 ga_init2(&spin.si_comppat, (int)sizeof(char_u *), 20); | 5912 ga_init2(&spin.si_comppat, (int)sizeof(char_u *), 20); |
5899 ga_init2(&spin.si_prefcond, (int)sizeof(char_u *), 50); | 5913 ga_init2(&spin.si_prefcond, (int)sizeof(char_u *), 50); |
5900 hash_init(&spin.si_commonwords); | 5914 hash_init(&spin.si_commonwords); |
5901 spin.si_newcompID = 127; // start compound ID at first maximum | 5915 spin.si_newcompID = 127; // start compound ID at first maximum |
5916 used_compress_inc = compress_inc; | |
5917 used_compress_added = compress_added; | |
5902 | 5918 |
5903 // default: fnames[0] is output file, following are input files | 5919 // default: fnames[0] is output file, following are input files |
5904 innames = &fnames[1]; | 5920 innames = &fnames[1]; |
5905 incount = fcount - 1; | 5921 incount = fcount - 1; |
5906 | 5922 |