vim: src/spell.c comparison

comparison src/spell.c @ 324:548525d9da24

updated for version 7.0085

author	vimboss
date	Tue, 14 Jun 2005 22:01:04 +0000
parents	03b3684919e3
children	f76b0d38b6bd

comparison

equal deleted inserted replaced

-:03b3684919e3
+:548525d9da24
 * spell.c: code for spell checking
 *
 * The spell checking mechanism uses a tree (aka trie).  Each node in the tree
 * has a list of bytes that can appear (siblings).  For each byte there is a
 * pointer to the node with the byte that follows in the word (child).
-* A NUL byte is used where the word may end.
+*
+* A NUL byte is used where the word may end.  The bytes are sorted, so that
+* binary searching can be used and the NUL bytes are at the start.  The
+* number of possible bytes is stored before the list of bytes.
+*
+* The tree uses two arrays: "byts" stores the characters, "idxs" stores
+* either the next index or flags.  The tree starts at index 0.  For example,
+* to lookup "vi" this sequence is followed:
+*	i = 0
+*	len = byts[i]
+*	n = where "v" appears in byts[i + 1] to byts[i + len]
+*	i = idxs[n]
+*	len = byts[i]
+*	n = where "i" appears in byts[i + 1] to byts[i + len]
+*	i = idxs[n]
+*	len = byts[i]
+*	find that byts[i + 1] is 0, idxs[i + 1] has flags for "vi".
 *
 * There are two trees: one with case-folded words and one with words in
 * original case.  The second one is only used for keep-case words and is
 * usually small.
 *
 * See ":help develop-spell".
 */
 /*
 * Use this to let the score depend on how much a suggestion sounds like the
-* bad word.  It's quite slow and doesn't make the sorting much better....
+* bad word.  It's quite slow and only occasionally makes the sorting better.
-* #define SOUNDFOLD_SCORE
+#define SOUNDFOLD_SCORE
+*/
+/*
+* Use this to adjust the score after finding suggestions, based on the
+* suggested word sounding like the bad word.  This is much faster than doing
+* it for every possible suggestion.
+* Disadvantage: When "the" is typed as "hte" it sounds different and goes
+* down in the list.
+#define RESCORE(word_score, sound_score) ((2 * word_score + sound_score) / 3)
 */
 /*
 * Vim spell file format:  <HEADER> <SUGGEST> <LWORDTREE> <KWORDTREE>
 *
 * <regionname>	2 bytes     Region name: ca, au, etc.  Lower case.
 *			    First <regionname> is region 1.
 *
 * <charflagslen> 1 byte    Number of bytes in <charflags> (should be 128).
 * <charflags>  N bytes     List of flags (first one is for character 128):
-*			    0x01  word character
+*			    0x01  word character	CF_WORD
-*			    0x02  upper-case character
+*			    0x02  upper-case character	CF_UPPER
 * <fcharslen>  2 bytes     Number of bytes in <fchars>.
 * <fchars>     N bytes	    Folded characters, first one is for character 128.
 *
 *
 * <SUGGEST> : <repcount> <rep> ...
 #define MAXWLEN 250		/* Assume max. word len is this many bytes.
 				   Some places assume a word length fits in a
 				   byte, thus it can't be above 255. */
-/* Flags used for a word. */
+/* Type used for indexes in the word tree needs to be at least 3 bytes.  If int
+* is 8 bytes we could use something smaller, but what? */
+#if SIZEOF_INT > 2
+typedef int idx_T;
+#else
+typedef long idx_T;
+#endif
+/* Flags used for a word.  Only the lowest byte can be used, the region byte
+* comes above it. */
 #define WF_REGION   0x01	/* region byte follows */
 #define WF_ONECAP   0x02	/* word with one capital (or all capitals) */
 #define WF_ALLCAP   0x04	/* word must be all capitals */
 #define WF_RARE	    0x08	/* rare word */
 #define WF_BANNED   0x10	/* bad word */
 #define WF_KEEPCAP  0x80	/* keep-case word */
 #define WF_CAPMASK (WF_ONECAP | WF_ALLCAP | WF_KEEPCAP)
+#define WF_USED	    0x10000	/* Word was found in text. Must be in separate
+				   byte before region and flags. */
 #define BY_NOFLAGS  0		/* end of word without flags or region */
 #define BY_FLAGS    1		/* end of word, flag byte follows */
 #define BY_INDEX    2		/* child is shared, index follows */
 #define BY_SPECIAL  BY_INDEX	/* highest special byte value */
 slang_T	*sl_next;	/* next language */
 char_u	*sl_name;	/* language name "en", "en.rare", "nl", etc. */
 char_u	*sl_fname;	/* name of .spl file */
 int		sl_add;		/* TRUE if it's a .add file. */
 char_u	*sl_fbyts;	/* case-folded word bytes */
-int		*sl_fidxs;	/* case-folded word indexes */
+idx_T	*sl_fidxs;	/* case-folded word indexes */
 char_u	*sl_kbyts;	/* keep-case word bytes */
-int		*sl_kidxs;	/* keep-case word indexes */
+idx_T	*sl_kidxs;	/* keep-case word indexes */
 char_u	sl_regions[17];	/* table with up to 8 region names plus NUL */
 garray_T	sl_rep;		/* list of fromto_T entries from REP lines */
 short	sl_rep_first[256];  /* indexes where byte first appears, -1 if
 				       there is none */
 typedef struct suggest_S
 {
 char_u	*st_word;	/* suggested word, allocated string */
 int		st_orglen;	/* length of replaced text */
 int		st_score;	/* lower is better */
+#ifdef RESCORE
+int		st_had_bonus;	/* bonus already included in score */
+#endif
 } suggest_T;
 #define SUG(sup, i) (((suggest_T *)(sup)->su_ga.ga_data)[i])
 /* Number of suggestions displayed. */
 #define SUG_PROMPT_COUNT    ((int)Rows - 2)
-/* Threshold for sorting and cleaning up suggestions. */
+/* Number of suggestions kept when cleaning up.  When rescore_suggestions() is
-#define SUG_CLEANUP_COUNT   (SUG_PROMPT_COUNT + 50)
+* called the score may change, thus we need to keep more than what is
+* displayed. */
+#define SUG_CLEAN_COUNT	    (SUG_PROMPT_COUNT < 25 ? 25 : SUG_PROMPT_COUNT)
+/* Threshold for sorting and cleaning up suggestions.  Don't want to keep lots
+* of suggestions that are not going to be displayed. */
+#define SUG_MAX_COUNT	    (SUG_PROMPT_COUNT + 50)
 /* score for various changes */
 #define SCORE_SPLIT	99	/* split bad word */
 #define SCORE_ICASE	52	/* slightly different case */
 #define SCORE_ALLCAP	120	/* need all-cap case */
 #define SCORE_REGION	70	/* word is for different region */
 #define SCORE_RARE	180	/* rare word */
+#define SCORE_NOTUSED	11	/* word not found in text yet */
 /* score for edit distance */
 #define SCORE_SWAP	90	/* swap two characters */
 #define SCORE_SWAP3	110	/* swap two characters in three */
 #define SCORE_REP	87	/* REP replacement */
 #define SCORE_SUBST	93	/* substitute a character */
 #define SCORE_SIMILAR	33	/* substitute a similar character */
-#define SCORE_DEL	96	/* delete a character */
+#define SCORE_DEL	94	/* delete a character */
-#define SCORE_INS	94	/* insert a character */
+#define SCORE_INS	96	/* insert a character */
 #define SCORE_MAXINIT	350	/* Initial maximum score: higher == slower.
 				 * 350 allows for about three changes. */
 #define SCORE_MAXMAX	999999	/* accept any score */
 typedef struct spelltab_S
 {
 char_u  st_isw[256];	/* flags: is word char */
 char_u  st_isu[256];	/* flags: is uppercase char */
 char_u  st_fold[256];	/* chars: folded case */
+char_u  st_upper[256];	/* chars: upper case */
 } spelltab_T;
 static spelltab_T   spelltab;
 static int	    did_set_spelltab;
-#define SPELL_ISWORD	1
+#define CF_WORD		0x01
-#define SPELL_ISUPPER	2
+#define CF_UPPER	0x02
 static void clear_spell_chartab __ARGS((spelltab_T *sp));
 static int set_spell_finish __ARGS((spelltab_T	*new_st));
 /*
 int		ts_score;	/* score */
 int		ts_curi;	/* index in list of child nodes */
 int		ts_fidx;	/* index in fword[], case-folded bad word */
 int		ts_fidxtry;	/* ts_fidx at which bytes may be changed */
 int		ts_twordlen;	/* valid length of tword[] */
-int		ts_arridx;	/* index in tree array, start of node */
+idx_T	ts_arridx;	/* index in tree array, start of node */
 char_u	ts_save_prewordlen; /* saved "prewordlen" */
 int		ts_save_splitoff;   /* su_splitoff saved here */
 int		ts_save_badflags;   /* badflags saved here */
 } trystate_T;
 static int spell_valid_case __ARGS((int origflags, int treeflags));
 static void spell_load_lang __ARGS((char_u *lang));
 static char_u *spell_enc __ARGS((void));
 static void spell_load_cb __ARGS((char_u *fname, void *cookie));
 static slang_T *spell_load_file __ARGS((char_u *fname, char_u *lang, slang_T *old_lp, int silent));
-static int read_tree __ARGS((FILE *fd, char_u *byts, int *idxs, int maxidx, int startidx));
+static idx_T read_tree __ARGS((FILE *fd, char_u *byts, idx_T *idxs, int maxidx, int startidx));
 static int find_region __ARGS((char_u *rp, char_u *region));
 static int captype __ARGS((char_u *word, char_u *end));
 static void spell_reload_one __ARGS((char_u *fname, int added_word));
 static int set_spell_charflags __ARGS((char_u *flags, int cnt, char_u *upp));
 static int set_spell_chartab __ARGS((char_u *fol, char_u *low, char_u *upp));
 static void write_spell_chartab __ARGS((FILE *fd));
-static int spell_isupper __ARGS((int c));
 static int spell_casefold __ARGS((char_u *p, int len, char_u *buf, int buflen));
-static void onecap_copy __ARGS((char_u *word, int len, char_u *wcopy, int upper));
+static void onecap_copy __ARGS((char_u *word, char_u *wcopy, int upper));
 static void spell_try_change __ARGS((suginfo_T *su));
 static int try_deeper __ARGS((suginfo_T *su, trystate_T *stack, int depth, int score_add));
 static void find_keepcap_word __ARGS((slang_T *slang, char_u *fword, char_u *kword));
 static void spell_try_soundalike __ARGS((suginfo_T *su));
 static void make_case_word __ARGS((char_u *fword, char_u *cword, int flags));
+#if 0
 static int similar_chars __ARGS((slang_T *slang, int c1, int c2));
+#endif
+#ifdef RESCORE
+static void add_suggestion __ARGS((suginfo_T *su, char_u *goodword, int use_score, int had_bonus));
+#else
 static void add_suggestion __ARGS((suginfo_T *su, char_u *goodword, int use_score));
+#endif
 static void add_banned __ARGS((suginfo_T *su, char_u *word));
 static int was_banned __ARGS((suginfo_T *su, char_u *word));
 static void free_banned __ARGS((suginfo_T *su));
-static void cleanup_suggestions __ARGS((suginfo_T *su));
+#ifdef RESCORE
+static void rescore_suggestions __ARGS((suginfo_T *su));
+#endif
+static void cleanup_suggestions __ARGS((suginfo_T *su, int keep));
 static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, char_u *res));
+#if defined(RESCORE) || defined(SOUNDFOLD_SCORE)
+static int spell_sound_score __ARGS((slang_T *slang, char_u *goodword, char_u	*badsound));
+#endif
 static int spell_edit_score __ARGS((char_u *badword, char_u *goodword));
+/*
+* Use our own character-case definitions, because the current locale may
+* differ from what the .spl file uses.
+* These must not be called with negative number!
+*/
+#ifndef FEAT_MBYTE
+/* Non-multi-byte implementation. */
+# define SPELL_TOFOLD(c) ((c) < 256 ? spelltab.st_fold[c] : (c))
+# define SPELL_TOUPPER(c) ((c) < 256 ? spelltab.st_upper[c] : (c))
+# define SPELL_ISUPPER(c) ((c) < 256 ? spelltab.st_isu[c] : FALSE)
+#else
+/* Multi-byte implementation.  For Unicode we can call utf_*(), but don't do
+* that for ASCII, because we don't want to use 'casemap' here.  Otherwise use
+* the "w" library function for characters above 255 if available. */
+# ifdef HAVE_TOWLOWER
+#  define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
+	    : (c) < 256 ? spelltab.st_fold[c] : towlower(c))
+# else
+#  define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
+	    : (c) < 256 ? spelltab.st_fold[c] : (c))
+# endif
+# ifdef HAVE_TOWUPPER
+#  define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
+	    : (c) < 256 ? spelltab.st_upper[c] : towupper(c))
+# else
+#  define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
+	    : (c) < 256 ? spelltab.st_upper[c] : (c))
+# endif
+# ifdef HAVE_ISWUPPER
+#  define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
+	    : (c) < 256 ? spelltab.st_isu[c] : iswupper(c))
+# else
+#  define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
+	    : (c) < 256 ? spelltab.st_isu[c] : (c))
+# endif
+#endif
 static char *e_format = N_("E759: Format error in spell file");
 /*
 				       mi.mi_lp->lp_slang != NULL; ++mi.mi_lp)
 	{
 	    /* Check for a matching word in case-folded words. */
 	    find_word(&mi, FALSE);
+	    /* Check for a matching word in keep-case words. */
 	    find_word(&mi, TRUE);
 	}
 	if (mi.mi_result != SP_OK)
 	{
 static void
 find_word(mip, keepcap)
 matchinf_T	*mip;
 int		keepcap;
 {
-int		arridx = 0;
+idx_T	arridx = 0;
 int		endlen[MAXWLEN];    /* length at possible word endings */
-int		endidx[MAXWLEN];    /* possible word endings */
+idx_T	endidx[MAXWLEN];    /* possible word endings */
 int		endidxcnt = 0;
 int		len;
 int		wlen = 0;
 int		flen;
 int		c;
 char_u	*ptr;
-unsigned	lo, hi, m;
+idx_T	lo, hi, m;
 #ifdef FEAT_MBYTE
 char_u	*s;
 #endif
 char_u	*p;
 int		res = SP_BAD;
 int		valid;
 slang_T	*slang = mip->mi_lp->lp_slang;
 unsigned	flags;
 char_u	*byts;
-int		*idxs;
+idx_T	*idxs;
 if (keepcap)
 {
 	/* Check for word with matching case in keep-case tree. */
 	ptr = mip->mi_word;
 	/* Check flags and region.  Repeat this if there are more
 	 * flags/region alternatives until there is a match. */
 	for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0; --len)
 	{
 	    flags = idxs[arridx];
+	    /* Set a flag for words that were used.  The region and case
+	     * doesn't matter here, it's only used to rate the suggestions. */
+	    idxs[arridx] = flags | WF_USED;
 	    if (keepcap)
 	    {
 		/* For "keepcap" tree the case is always right. */
 		valid = TRUE;
 	    }
 	    len = spell_check(curwin, p, &attr);
 	    if (attr != 0)
 	    {
 		/* We found a bad word.  Check the attribute. */
-		/* TODO: check for syntax @Spell cluster. */
 		if (allwords || attr == highlight_attr[HLF_SPB])
 		{
 		    /* When searching forward only accept a bad word after
 		     * the cursor. */
 		    if (dir == BACKWARD
 slang_T	*lp = NULL;
 garray_T	*gap;
 fromto_T	*ftp;
 int		rr;
 short	*first;
+idx_T	idx;
 fd = mch_fopen((char *)fname, "r");
 if (fd == NULL)
 {
 	if (!silent)
 	}
 	for (i = 0; i < ccnt; ++i)
 	    fol[i] = getc(fd);				/* <fchars> */
 	fol[i] = NUL;
-	/* Set the word-char flags and fill spell_isupper() table. */
+	/* Set the word-char flags and fill SPELL_ISUPPER() table. */
 	i = set_spell_charflags(p, cnt, fol);
 	vim_free(p);
 	vim_free(fol);
 	if (i == FAIL)
 	    goto formerr;
 	    /* Allocate the index array. */
 	    p = lalloc_clear((long_u)(len * sizeof(int)), TRUE);
 	    if (p == NULL)
 		goto endFAIL;
 	    if (round == 1)
-		lp->sl_fidxs = (int *)p;
+		lp->sl_fidxs = (idx_T *)p;
 	    else
-		lp->sl_kidxs = (int *)p;
+		lp->sl_kidxs = (idx_T *)p;
 	    /* Read the tree and store it in the array. */
-	    i = read_tree(fd,
+	    idx = read_tree(fd,
 			round == 1 ? lp->sl_fbyts : lp->sl_kbyts,
 			round == 1 ? lp->sl_fidxs : lp->sl_kidxs,
 			len, 0);
-	    if (i == -1)
+	    if (idx == -1)
 		goto truncerr;
-	    if (i < 0)
+	    if (idx < 0)
 		goto formerr;
 	}
 }
 /* For a new file link it in the list of spell files. */
 *
 * Returns the index following the siblings.
 * Returns -1 if the file is shorter than expected.
 * Returns -2 if there is a format error.
 */
-static int
+static idx_T
 read_tree(fd, byts, idxs, maxidx, startidx)
 FILE	*fd;
 char_u	*byts;
-int		*idxs;
+idx_T	*idxs;
 int		maxidx;		    /* size of arrays */
-int		startidx;	    /* current index in "byts" and "idxs" */
+idx_T	startidx;	    /* current index in "byts" and "idxs" */
 {
 int		len;
 int		i;
 int		n;
-int		idx = startidx;
+idx_T	idx = startidx;
 int		c;
 #define SHARED_MASK	0x8000000
 len = getc(fd);					/* <siblingcount> */
 if (len <= 0)
 if (has_mbyte)
 	c = mb_ptr2char_adv(&p);
 else
 #endif
 	c = *p++;
-firstcap = allcap = spell_isupper(c);
+firstcap = allcap = SPELL_ISUPPER(c);
 /*
 * Need to check all letters to find a word with mixed upper/lower.
 * But a word with an upper char only at start is a ONECAP.
 */
 #ifdef FEAT_MBYTE
 	    c = mb_ptr2char(p);
 #else
 	    c = *p;
 #endif
-	    if (!spell_isupper(c))
+	    if (!SPELL_ISUPPER(c))
 	    {
 		/* UUl -> KEEPCAP */
 		if (past_second && allcap)
 		    return WF_KEEPCAP;
 		allcap = FALSE;
 static char *e_affname = N_("Affix name too long in %s line %d: %s");
 int		do_rep;
 int		do_sal;
 int		do_map;
 int		found_map = FALSE;
+hashitem_T	*hi;
 /*
 * Open the file.
 */
 fd = mch_fopen((char *)fname, "r");
 		if (*items[0] == 'P')
 		    tp = &aff->af_pref;
 		else
 		    tp = &aff->af_suff;
 		aff_todo = atoi((char *)items[3]);
-		if (!HASHITEM_EMPTY(hash_find(tp, cur_aff->ah_key)))
+		hi = hash_find(tp, cur_aff->ah_key);
+		if (!HASHITEM_EMPTY(hi))
 		{
 		    smsg((char_u *)_("Duplicate affix in %s line %d: %s"),
 						       fname, lnum, items[1]);
 		    aff_todo = 0;
 		}
 if (fol != NULL || low != NULL || upp != NULL)
 {
 	/*
 	 * Don't write a word table for an ASCII file, so that we don't check
 	 * for conflicts with a word table that matches 'encoding'.
-	 * Don't write one for utf-8 either, we use utf_isupper() and
+	 * Don't write one for utf-8 either, we use utf_*() and
 	 * mb_get_class(), the list of chars in the file will be incomplete.
 	 */
 	if (!spin->si_ascii
 #ifdef FEAT_MBYTE
 		&& !enc_utf8
 	    verbose_leave();
 }
 /* Read and ignore the first line: word count. */
 (void)vim_fgets(line, MAXLINELEN, fd);
-if (!isdigit(*skipwhite(line)))
+if (!vim_isdigit(*skipwhite(line)))
 	EMSG2(_("E760: No word count in %s"), fname);
 /*
 * Read all the lines in the file one by one.
 * The words are converted to 'encoding' here, before being added to
 			    if (ae->ae_chop != NULL)
 			    {
 				/* Skip chop string. */
 #ifdef FEAT_MBYTE
 				if (has_mbyte)
+				{
 				    i = mb_charlen(ae->ae_chop);
+				    for ( ; i > 0; --i)
+					mb_ptr_adv(p);
+				}
 				else
 #endif
-				    i = STRLEN(ae->ae_chop);
+				    p += STRLEN(ae->ae_chop);
-				for ( ; i > 0; --i)
-				    mb_ptr_adv(p);
 			    }
 			    STRCAT(newword, p);
 			}
 			else
 			{
 */
 static void
 clear_spell_chartab(sp)
 spelltab_T	*sp;
 {
-int	    i;
+int		i;
 /* Init everything to FALSE. */
 vim_memset(sp->st_isw, FALSE, sizeof(sp->st_isw));
 vim_memset(sp->st_isu, FALSE, sizeof(sp->st_isu));
 for (i = 0; i < 256; ++i)
+{
 	sp->st_fold[i] = i;
+	sp->st_upper[i] = i;
+}
 /* We include digits.  A word shouldn't start with a digit, but handling
 * that is done separately. */
 for (i = '0'; i <= '9'; ++i)
 	sp->st_isw[i] = TRUE;
 	sp->st_isw[i] = TRUE;
 	sp->st_isu[i] = TRUE;
 	sp->st_fold[i] = i + 0x20;
 }
 for (i = 'a'; i <= 'z'; ++i)
+{
 	sp->st_isw[i] = TRUE;
+	sp->st_upper[i] = i - 0x20;
+}
 }
 /*
 * Init the chartab used for spelling.  Only depends on 'encoding'.
 * Called once while starting up and when 'encoding' changes.
 	/* DBCS: assume double-wide characters are word characters. */
 	for (i = 128; i <= 255; ++i)
 	    if (MB_BYTE2LEN(i) == 2)
 		spelltab.st_isw[i] = TRUE;
 }
+else if (enc_utf8)
+{
+	for (i = 128; i < 256; ++i)
+	{
+	    spelltab.st_isu[i] = utf_isupper(i);
+	    spelltab.st_isw[i] = spelltab.st_isu[i] || utf_islower(i);
+	    spelltab.st_fold[i] = utf_fold(i);
+	    spelltab.st_upper[i] = utf_toupper(i);
+	}
+}
 else
 #endif
 {
-	/* Rough guess: use isalpha() and isupper() for characters above 128. */
+	/* Rough guess: use locale-dependent library functions. */
 	for (i = 128; i < 256; ++i)
 	{
-	    spelltab.st_isw[i] = MB_ISUPPER(i) || MB_ISLOWER(i);
 	    if (MB_ISUPPER(i))
 	    {
+		spelltab.st_isw[i] = TRUE;
 		spelltab.st_isu[i] = TRUE;
 		spelltab.st_fold[i] = MB_TOLOWER(i);
+	    }
+	    else if (MB_ISLOWER(i))
+	    {
+		spelltab.st_isw[i] = TRUE;
+		spelltab.st_upper[i] = MB_TOUPPER(i);
 	    }
 	}
 }
 }
 	    }
 	    new_st.st_fold[l] = f;
 	}
 	/* if "UPP" and "FOL" are not the same the "UPP" char needs
-	 * case-folding and it's upper case. */
+	 * case-folding, it's upper case and the "UPP" is the upper case of
+	 * "FOL" . */
 	if (u < 256 && u != f)
 	{
 	    if (f >= 256)
 	    {
 		EMSG(_(e_affrange));
 		return FAIL;
 	    }
 	    new_st.st_fold[u] = f;
 	    new_st.st_isu[u] = TRUE;
+	    new_st.st_upper[f] = u;
 	}
 }
 if (*pl != NUL || *pu != NUL)
 {
 /* We build the new tables here first, so that we can compare with the
 * previous one. */
 spelltab_T	new_st;
 int		i;
 char_u	*p = upp;
+int		c;
 clear_spell_chartab(&new_st);
 for (i = 0; i < cnt; ++i)
 {
-	new_st.st_isw[i + 128] = (flags[i] & SPELL_ISWORD) != 0;
+	new_st.st_isw[i + 128] = (flags[i] & CF_WORD) != 0;
-	new_st.st_isu[i + 128] = (flags[i] & SPELL_ISUPPER) != 0;
+	new_st.st_isu[i + 128] = (flags[i] & CF_UPPER) != 0;
 	if (*p == NUL)
 	    return FAIL;
 #ifdef FEAT_MBYTE
-	new_st.st_fold[i + 128] = mb_ptr2char_adv(&p);
+	c = mb_ptr2char_adv(&p);
 #else
-	new_st.st_fold[i + 128] = *p++;
+	c = *p++;
 #endif
+	new_st.st_fold[i + 128] = c;
+	if (i + 128 != c && new_st.st_isu[i + 128] && c < 256)
+	    new_st.st_upper[c] = i + 128;
 }
 return set_spell_finish(&new_st);
 }
 	/* check that it's the same table */
 	for (i = 0; i < 256; ++i)
 	{
 	    if (spelltab.st_isw[i] != new_st->st_isw[i]
 		    || spelltab.st_isu[i] != new_st->st_isu[i]
-		    || spelltab.st_fold[i] != new_st->st_fold[i])
+		    || spelltab.st_fold[i] != new_st->st_fold[i]
+		    || spelltab.st_upper[i] != new_st->st_upper[i])
 	    {
 		EMSG(_("E763: Word characters differ between spell files"));
 		return FAIL;
 	    }
 	}
 fputc(128, fd);				    /* <charflagslen> */
 for (i = 128; i < 256; ++i)
 {
 	flags = 0;
 	if (spelltab.st_isw[i])
-	    flags |= SPELL_ISWORD;
+	    flags |= CF_WORD;
 	if (spelltab.st_isu[i])
-	    flags |= SPELL_ISUPPER;
+	    flags |= CF_UPPER;
 	fputc(flags, fd);			    /* <charflags> */
 #ifdef FEAT_MBYTE
 	if (has_mbyte)
 	    len += mb_char2bytes(spelltab.st_fold[i], charbuf + len);
 put_bytes(fd, (long_u)len, 2);		    /* <fcharlen> */
 fwrite(charbuf, (size_t)len, (size_t)1, fd);    /* <fchars> */
 }
 /*
-* Return TRUE if "c" is an upper-case character for spelling.
+* Case-fold "str[len]" into "buf[buflen]".  The result is NUL terminated.
-*/
+* Uses the character definitions from the .spl file.
-static int
-spell_isupper(c)
-int		c;
-{
-# ifdef FEAT_MBYTE
-if (enc_utf8)
-{
-	/* For Unicode we can call utf_isupper(), but don't do that for ASCII,
-	 * because we don't want to use 'casemap' here. */
-	if (c >= 128)
-	    return utf_isupper(c);
-}
-else if (has_mbyte && c > 256)
-{
-	/* For characters above 255 we don't have something specfied.
-	 * Fall back to locale-dependent iswupper().  If not available
-	 * simply return FALSE. */
-#  ifdef HAVE_ISWUPPER
-	return iswupper(c);
-#  else
-	return FALSE;
-#  endif
-}
-# endif
-return spelltab.st_isu[c];
-}
-/*
-* Case-fold "p[len]" into "buf[buflen]".  Used for spell checking.
 * When using a multi-byte 'encoding' the length may change!
 * Returns FAIL when something wrong.
 */
 static int
-spell_casefold(p, len, buf, buflen)
+spell_casefold(str, len, buf, buflen)
-char_u	*p;
+char_u	*str;
 int		len;
 char_u	*buf;
 int		buflen;
 {
 int		i;
 }
 #ifdef FEAT_MBYTE
 if (has_mbyte)
 {
+	int	outi = 0;
+	char_u	*p;
 	int	c;
-	int	outi = 0;
 	/* Fold one character at a time. */
-	for (i = 0; i < len; i += mb_ptr2len_check(p + i))
+	for (p = str; p < str + len; )
 	{
-	    c = mb_ptr2char(p + i);
-	    if (enc_utf8)
-		/* For Unicode case folding is always the same, no need to use
-		 * the table from the spell file. */
-		c = utf_fold(c);
-	    else if (c < 256)
-		/* Use the table from the spell file. */
-		c = spelltab.st_fold[c];
-# ifdef HAVE_TOWLOWER
-	    else
-		/* We don't know what to do, fall back to towlower(), it
-		 * depends on the current locale. */
-		c = towlower(c);
-# endif
 	    if (outi + MB_MAXBYTES > buflen)
 	    {
 		buf[outi] = NUL;
 		return FAIL;
 	    }
-	    outi += mb_char2bytes(c, buf + outi);
+	    c = mb_ptr2char_adv(&p);
+	    outi += mb_char2bytes(SPELL_TOFOLD(c), buf + outi);
 	}
 	buf[outi] = NUL;
 }
 else
 #endif
 {
 	/* Be quick for non-multibyte encodings. */
 	for (i = 0; i < len; ++i)
-	    buf[i] = spelltab.st_fold[p[i]];
+	    buf[i] = spelltab.st_fold[str[i]];
 	buf[i] = NUL;
 }
 return OK;
 }
 add_banned(&sug, sug.su_badword);
 /*
 * 1. Try inserting/deleting/swapping/changing a letter, use REP entries
 *    from the .aff file and inserting a space (split the word).
+*
+* Set a maximum score to limit the combination of operations that is
+* tried.
 */
-/* Set a maximum score to limit the combination of operations that is
-* tried. */
 sug.su_maxscore = SCORE_MAXINIT;
 spell_try_change(&sug);
-cleanup_suggestions(&sug);
 /*
 * 2. Try finding sound-a-like words.
+*
+* Only do this when we don't have a lot of suggestions yet, because it's
+* very slow and often doesn't find new suggestions.
 */
-/* Allow a higher score if we don't have many suggestions yet. */
+if (sug.su_ga.ga_len < SUG_CLEAN_COUNT)
-if (sug.su_maxscore == SCORE_MAXINIT)
+{
+	/* Allow a higher score now. */
 	sug.su_maxscore = SCORE_MAXMAX;
-spell_try_soundalike(&sug);
+	spell_try_soundalike(&sug);
+}
 /* When CTRL-C was hit while searching do show the results. */
+ui_breakcheck();
 if (got_int)
 {
 	(void)vgetc();
 	got_int = FALSE;
 }
 if (sug.su_ga.ga_len == 0)
 	MSG(_("Sorry, no suggestions"));
 else
 {
-	/* Cleanup, sort the suggestions and truncate at SUG_PROMPT_COUNT. */
+#ifdef RESCORE
-	cleanup_suggestions(&sug);
+	/* Do slow but more accurate computation of the word score. */
+	rescore_suggestions(&sug);
+#endif
+	/* Sort the suggestions and truncate at SUG_PROMPT_COUNT. */
+	cleanup_suggestions(&sug, SUG_PROMPT_COUNT);
 	/* List the suggestions. */
 	msg_start();
 	vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"),
 						sug.su_badlen, sug.su_badptr);
 	    STRCPY(wcopy, stp->st_word);
 	    if (sug.su_badlen > stp->st_orglen)
 		vim_strncpy(wcopy + STRLEN(wcopy),
 					       sug.su_badptr + stp->st_orglen,
 					      sug.su_badlen - stp->st_orglen);
-	    /* TODO: remove score */
+	    if (p_verbose > 0)
-	    vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\"  (%d)"),
+		vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\"  (%d)"),
 						 i + 1, wcopy, stp->st_score);
+	    else
+		vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\""),
+								i + 1, wcopy);
 	    msg_puts(IObuff);
 	    lines_left = 3;		/* avoid more prompt */
 	    msg_putchar('\n');
 	}
 /* Free the banned words. */
 free_banned(&sug);
 }
 /*
-* Make a copy of "word[len]", with the first letter upper or lower cased,
+* Make a copy of "word", with the first letter upper or lower cased, to
-* to "wcopy[MAXWLEN]".
+* "wcopy[MAXWLEN]".  "word" must not be empty.
+* The result is NUL terminated.
 */
 static void
-onecap_copy(word, len, wcopy, upper)
+onecap_copy(word, wcopy, upper)
 char_u	*word;
-int		len;
 char_u	*wcopy;
 int		upper;	    /* TRUE: first letter made upper case */
 {
 char_u	*p;
 int		c;
 	c = mb_ptr2char_adv(&p);
 else
 #endif
 	c = *p++;
 if (upper)
-	c = MB_TOUPPER(c);
+	c = SPELL_TOUPPER(c);
 else
-	c = MB_TOLOWER(c);
+	c = SPELL_TOFOLD(c);
 #ifdef FEAT_MBYTE
 if (has_mbyte)
 	l = mb_char2bytes(c, wcopy);
 else
 #endif
 {
 	l = 1;
 	wcopy[0] = c;
 }
-vim_strncpy(wcopy + l, p, len - (p - word));
+vim_strncpy(wcopy + l, p, MAXWLEN - l);
 }
 /*
-* Make a copy of "word[len]" with all the letters upper cased into
+* Make a copy of "word" with all the letters upper cased into
-* "wcopy[MAXWLEN]".
+* "wcopy[MAXWLEN]".  The result is NUL terminated.
 */
 static void
 allcap_copy(word, wcopy)
 char_u	*word;
 char_u	*wcopy;
 	if (has_mbyte)
 	    c = mb_ptr2char_adv(&s);
 	else
 #endif
 	    c = *s++;
+	c = SPELL_TOUPPER(c);
-	c = MB_TOUPPER(c);	/* TODO: use spell toupper */
 #ifdef FEAT_MBYTE
 	if (has_mbyte)
 	{
 	    if (d - wcopy >= MAXWLEN - MB_MAXBYTES)
 int		splitoff = 0;	    /* index in tword after last split */
 trystate_T	*sp;
 int		newscore;
 langp_T	*lp;
 char_u	*byts;
-int		*idxs;
+idx_T	*idxs;
 int		depth;
 int		c;
 int		n;
 int		flags;
 int		badflags;
 garray_T	*gap;
-int		arridx;
+idx_T	arridx;
 int		len;
 char_u	*p;
 fromto_T	*ftp;
 int		fl, tl;
 		/*
 		 * End of word in tree.
 		 */
 		++sp->ts_curi;		/* eat one NUL byte */
-		flags = idxs[arridx];
+		flags = (int)idxs[arridx];
 		/*
 		 * Form the word with proper case in preword.
 		 * If there is a word from a previous split, append.
 		 */
 			     && (((unsigned)flags >> 8) & lp->lp_region) == 0)
 		    newscore += SCORE_REGION;
 		if (flags & WF_RARE)
 		    newscore += SCORE_RARE;
+		/* Words that were not found in the text get a penalty. */
+		if ((flags & WF_USED) == 0)
+		    newscore += SCORE_NOTUSED;
 		if (!spell_valid_case(badflags,
 					 captype(preword + prewordlen, NULL)))
 		    newscore += SCORE_ICASE;
 		if (fword[sp->ts_fidx] == 0)
 		{
 		    /* The badword also ends: add suggestions, */
-		    add_suggestion(su, preword, sp->ts_score + newscore);
+		    add_suggestion(su, preword, sp->ts_score + newscore
+#ifdef RESCORE
+			    , FALSE
+#endif
+			    );
 		}
 		else if (sp->ts_fidx >= sp->ts_fidxtry)
 		{
 		    /* The word in the tree ends but the badword
 		     * continues: try inserting a space and check that a valid
 			/* Append a space to preword. */
 			STRCAT(preword, " ");
 			prewordlen = STRLEN(preword);
 			splitoff = sp->ts_twordlen;
-			/* TODO: when case-folding changed the number of bytes
+#ifdef FEAT_MBYTE
-			 * this doesn't work... */
+			if (has_mbyte)
-			badflags = captype(su->su_badptr + sp->ts_fidx,
+			{
-					       su->su_badptr + su->su_badlen);
+			    int		i = 0;
+			    /* Case-folding may change the number of bytes:
+			     * Count nr of chars in fword[sp->ts_fidx] and
+			     * advance that many chars in su->su_badptr. */
+			    for (p = fword; p < fword + sp->ts_fidx;
+								mb_ptr_adv(p))
+				++i;
+			    for (p = su->su_badptr; i > 0; mb_ptr_adv(p))
+				--i;
+			}
+			else
+#endif
+			    p = su->su_badptr + sp->ts_fidx;
+			badflags = captype(p, su->su_badptr + su->su_badlen);
 			sp->ts_state = STATE_SPLITUNDO;
 			++depth;
 			/* Restart at top of the tree. */
 			stack[depth].ts_arridx = 0;
 		    /* Normal byte, go one level deeper.  If it's not equal to
 		     * the byte in the bad word adjust the score.  But don't
 		     * even try when the byte was already changed. */
 		    if (c == fword[sp->ts_fidx])
 			newscore = 0;
-			/* TODO: multi-byte characters */
+		    /* TODO: this is too slow and comparing bytes isn't right
+		     * for multi-byte characters. */
+#if 0
 		    else if (lp->lp_slang->sl_map != NULL
-			    && similar_chars(lp->lp_slang,
+					&& similar_chars(lp->lp_slang,
 						       c, fword[sp->ts_fidx]))
 			newscore = SCORE_SIMILAR;
+#endif
 		    else
 			newscore = SCORE_SUBST;
 		    if ((newscore == 0 || sp->ts_fidx >= sp->ts_fidxtry)
 				    && try_deeper(su, stack, depth, newscore))
 		    {
 char_u	*fword;
 char_u	*kword;
 {
 char_u	uword[MAXWLEN];		/* "fword" in upper-case */
 int		depth;
-int		tryidx;
+idx_T	tryidx;
 /* The following arrays are used at each depth in the tree. */
-int		arridx[MAXWLEN];
+idx_T	arridx[MAXWLEN];
 int		round[MAXWLEN];
 int		fwordidx[MAXWLEN];
 int		uwordidx[MAXWLEN];
 int		kwordlen[MAXWLEN];
 int		flen, ulen;
 int		l;
 int		len;
 int		c;
-unsigned	lo, hi, m;
+idx_T	lo, hi, m;
 char_u	*p;
 char_u	*byts = slang->sl_kbyts;    /* array with bytes of the words */
-int		*idxs = slang->sl_kidxs;    /* array with indexes */
+idx_T	*idxs = slang->sl_kidxs;    /* array with indexes */
 if (byts == NULL)
 {
 	/* array is empty: "cannot happen" */
 	*kword = NUL;
 {
 char_u	salword[MAXWLEN];
 char_u	tword[MAXWLEN];
 char_u	tfword[MAXWLEN];
 char_u	tsalword[MAXWLEN];
-int		arridx[MAXWLEN];
+idx_T	arridx[MAXWLEN];
 int		curi[MAXWLEN];
 langp_T	*lp;
 char_u	*byts;
-int		*idxs;
+idx_T	*idxs;
 int		depth;
 int		c;
-int		n;
+idx_T	n;
 int		round;
 int		flags;
+int		score, sound_score;
+char_u	*bp, *sp;
 for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0);
 						   lp->lp_slang != NULL; ++lp)
 {
 	if (lp->lp_slang->sl_sal.ga_len > 0)
 			++curi[depth];
 			c = byts[n];
 			if (c == 0)
 			{
 			    /* End of word, deal with the word. */
-			    flags = idxs[n];
+			    flags = (int)idxs[n];
 			    if (round == 2 || (flags & WF_KEEPCAP) == 0)
 			    {
 				tword[depth] = NUL;
 				if (round == 1)
 				    spell_soundfold(lp->lp_slang,
 							     tfword, MAXWLEN);
 				    spell_soundfold(lp->lp_slang,
 							    tfword, tsalword);
 				}
-				/* TODO: also compare with small changes
+				/*
-				 * (insert char, swap char, etc.) */
+				 * Accept the word if the sound-folded words
-				if (STRCMP(salword, tsalword) == 0)
+				 * are (almost) equal.
+				 */
+				for (bp = salword, sp = tsalword; *bp == *sp;
+								   ++bp, ++sp)
+				    if (*bp == NUL)
+					break;
+				if (*bp == *sp)
+				    /* equal */
+				    sound_score = 0;
+				else if (*bp != NUL && bp[1] != NUL
+					&& *bp == sp[1] && bp[1] == *sp
+					       && STRCMP(bp + 2, sp + 2) == 0)
+				    /* swap two bytes */
+				    sound_score = SCORE_SWAP;
+				else if (STRCMP(bp + 1, sp) == 0)
+				    /* delete byte */
+				    sound_score = SCORE_DEL;
+				else if (STRCMP(bp, sp + 1) == 0)
+				    /* insert byte */
+				    sound_score = SCORE_INS;
+				else if (STRCMP(bp + 1, sp + 1) == 0)
+				    /* skip one byte */
+				    sound_score = SCORE_SUBST;
+				else
+				    /* not equal or similar */
+				    sound_score = SCORE_MAXMAX;
+				if (sound_score < SCORE_MAXMAX)
 				{
+				    char_u	cword[MAXWLEN];
+				    char_u	*p;
 				    if (round == 1 && flags != 0)
 				    {
-					char_u	cword[MAXWLEN];
+					/* Need to fix case according to
+					 * "flags". */
 					make_case_word(tword, cword, flags);
-					add_suggestion(su, cword, 0);
+					p = cword;
 				    }
 				    else
-					add_suggestion(su, tword, 0);
+					p = tword;
+				    /* Compute the score. */
+				    score = spell_edit_score(su->su_badword, p);
+#ifdef RESCORE
+				    /* give a bonus for the good word sounding
+				     * the same as the bad word */
+				    add_suggestion(su, tword,
+						 RESCORE(score, sound_score),
+									TRUE);
+#else
+				    add_suggestion(su, tword,
+							 score + sound_score);
+#endif
 				}
 			    }
 			    /* Skip over other NUL bytes. */
 			    while (byts[n + 1] == 0)
 			    tword[depth++] = c;
 			    arridx[depth] = idxs[n];
 			    curi[depth] = 1;
 			}
 		    }
+		    line_breakcheck();
 		}
-		line_breakcheck();
+	    }
-	    }
+	}
-	}
+}
 }
-}
+/*
-/*
+* Copy "fword" to "cword", fixing case according to "flags".
-* Copy "fword" to "cword", fixing according to "flags".
 */
 static void
 make_case_word(fword, cword, flags)
 char_u	*fword;
 char_u	*cword;
 if (flags & WF_ALLCAP)
 	/* Make it all upper-case */
 	allcap_copy(fword, cword);
 else if (flags & WF_ONECAP)
 	/* Make the first letter upper-case */
-	onecap_copy(fword, STRLEN(fword), cword, TRUE);
+	onecap_copy(fword, cword, TRUE);
 else
 	/* Use goodword as-is. */
 	STRCPY(cword, fword);
 }
+#if 0
 /*
 * Return TRUE if "c1" and "c2" are similar characters according to the MAP
 * lines in the .aff file.
 */
 static int
 p2 = vim_strchr(slang->sl_map, c2);
 if (p2 == NULL)
 	return FALSE;
 return vim_strchr(p1, '/') == vim_strchr(p2, '/');
 }
+#endif
 /*
 * Add a suggestion to the list of suggestions.
 * Do not add a duplicate suggestion or suggestions with a bad score.
 *
 * On the inserted (+) side of this comparison the caller passes the final
 * "score".  The removed (-) side took "use_score" instead and computed the
 * score with spell_edit_score() when that argument was zero.
 *
 * Nothing is added when "goodword" is banned or when "score" is above
 * su_maxscore.  When "goodword" is already in the list only the lowest score
 * is remembered.  Under RESCORE "had_bonus" is stored in st_had_bonus
 * together with the score.
 */
 static void
-add_suggestion(su, goodword, use_score)
+add_suggestion(su, goodword, score
+#ifdef RESCORE
+	    , had_bonus
+#endif
+	    )
 suginfo_T	*su;
 char_u	*goodword;
-int		use_score;
+int		score;
+#ifdef RESCORE
+int		had_bonus;	/* set st_had_bonus */
+#endif
 {
 suggest_T   *stp;
-int		score;
 int		i;
 #ifdef SOUNDFOLD_SCORE
 char_u	fword[MAXWLEN];
 char_u	salword[MAXWLEN];
 #endif
 /* Check that the word wasn't banned. */
 if (was_banned(su, goodword))
 	return;
-/* Compute the score and add the suggestion if it's good enough. */
-if (use_score != 0)
-	score = use_score;
-else
-	score = spell_edit_score(su->su_badword, goodword);
 if (score <= su->su_maxscore)
 {
 #ifdef SOUNDFOLD_SCORE
 	/* Add to the score when the word sounds different.
 	 * This is slow...
 	 * NOTE(review): on the + side "fword" is handed to
 	 * spell_sound_score(), but nothing in this function fills it in any
 	 * more (the spell_casefold() call is on the - side) and "salword" is
 	 * no longer used.  spell_sound_score() case-folds its argument
 	 * itself, so "goodword" looks like the intended argument - confirm. */
 	if (su->su_slang->sl_sal.ga_len > 0)
-	{
+	    score += spell_sound_score(su->su_slang, fword, su->su_salword);
-	    (void)spell_casefold(goodword, STRLEN(goodword), fword, MAXWLEN);
-	    spell_soundfold(su->su_slang, fword, salword);
-	    score += spell_edit_score(su->su_salword, salword);
-	}
 #endif
 	/* Check if the word is already there.  The loop ends with "i" below
 	 * zero when it was not found. */
 	stp = &SUG(su, 0);
 	for (i = su->su_ga.ga_len - 1; i >= 0; --i)
 	    if (STRCMP(stp[i].st_word, goodword) == 0)
 	    {
 		/* Found it.  Remember the lowest score. */
 		if (stp[i].st_score > score)
+		{
 		    stp[i].st_score = score;
+#ifdef RESCORE
+		    stp[i].st_had_bonus = had_bonus;
+#endif
+		}
 		break;
 	    }
 	if (i < 0 && ga_grow(&su->su_ga, 1) == OK)
 	{
 	    stp = &SUG(su, su->su_ga.ga_len);
 	    stp->st_word = vim_strsave(goodword);
 	    if (stp->st_word != NULL)
 	    {
 		stp->st_score = score;
+#ifdef RESCORE
+		stp->st_had_bonus = had_bonus;
+#endif
 		stp->st_orglen = su->su_badlen;
 		++su->su_ga.ga_len;
 		/* If we have too many suggestions now, sort the list and keep
 		 * the best suggestions.  On the + side this triggers above
 		 * SUG_MAX_COUNT entries and keeps SUG_CLEAN_COUNT of them. */
-		if (su->su_ga.ga_len > SUG_CLEANUP_COUNT)
+		if (su->su_ga.ga_len > SUG_MAX_COUNT)
-		    cleanup_suggestions(su);
+		    cleanup_suggestions(su, SUG_CLEAN_COUNT);
 	    }
 	}
 }
 }
 /*
 * Return non-zero when hash_find() locates a non-empty item for "word" in
 * the su_banned hashtable of "su".
 * The + side stores the hash_find() result in a local before testing it;
 * presumably this avoids passing a function call to the HASHITEM_EMPTY()
 * macro - TODO confirm against the macro definition.
 */
 static int
 was_banned(su, word)
 suginfo_T	*su;
 char_u	*word;
 {
-return !HASHITEM_EMPTY(hash_find(&su->su_banned, word));
+hashitem_T	*hi = hash_find(&su->su_banned, word);
+return !HASHITEM_EMPTY(hi);
 }
 /*
 * Free the banned words in "su".
 */
 	}
 }
 hash_clear(&su->su_banned);
 }
+#ifdef RESCORE
+/*
+* Recompute the score if sound-folding is possible.  This is slow,
+* thus only done for the final results.
+*
+* Only the first language of the current buffer that has sound-folding
+* items (sl_sal is not empty) is used; the loop stops after it.
+* Suggestions with st_had_bonus set are skipped, presumably because their
+* score already includes the sound score (TODO confirm with the callers of
+* add_suggestion()).  For every other suggestion st_score is replaced by
+* RESCORE(st_score, sound score).  The list in "su" is updated in place.
+*/
+static void
+rescore_suggestions(su)
+suginfo_T	*su;
+{
+langp_T	*lp;
+suggest_T	*stp;
+char_u	sal_badword[MAXWLEN];
+int		score;
+int		i;
+for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0);
+						   lp->lp_slang != NULL; ++lp)
+{
+	if (lp->lp_slang->sl_sal.ga_len > 0)
+	{
+	    /* soundfold the bad word: su_fbadword is folded once here and
+	     * the result is reused for every suggestion below */
+	    spell_soundfold(lp->lp_slang, su->su_fbadword, sal_badword);
+	    for (i = 0; i < su->su_ga.ga_len; ++i)
+	    {
+		stp = &SUG(su, i);
+		if (!stp->st_had_bonus)
+		{
+		    score = spell_sound_score(lp->lp_slang, stp->st_word,
+								 sal_badword);
+		    stp->st_score = RESCORE(stp->st_score, score);
+		}
+	    }
+	    break;
+	}
+}
+}
+#endif
 static int
 #ifdef __BORLANDC__
 _RTLENTRYF
 #endif
 sug_compare __ARGS((const void *s1, const void *s2));
 * Cleanup the suggestions:
 * - Sort on score.
 * - Remove words that won't be displayed.
 */
 static void
-cleanup_suggestions(su)
+cleanup_suggestions(su, keep)
 suginfo_T	*su;
+int		keep;		/* nr of suggestions to keep */
 {
 suggest_T   *stp = &SUG(su, 0);
 int		i;
 /* Sort the list: the su_ga.ga_len entries in su_ga.ga_data are ordered with
  * sug_compare(). */
 qsort(su->su_ga.ga_data, (size_t)su->su_ga.ga_len,
 					      sizeof(suggest_T), sug_compare);
 /* Truncate the list.  On the + side at most "keep" entries remain; the -
  * side used the fixed SUG_PROMPT_COUNT.  The words of the dropped entries
  * are freed and su_maxscore is set to the score of the last entry kept.
  * NOTE(review): stp[keep - 1] is read here, so "keep" must be at least 1
  * whenever the list is longer than "keep". */
-if (su->su_ga.ga_len > SUG_PROMPT_COUNT)
+if (su->su_ga.ga_len > keep)
 {
-	for (i = SUG_PROMPT_COUNT; i < su->su_ga.ga_len; ++i)
+	for (i = keep; i < su->su_ga.ga_len; ++i)
 	    vim_free(stp[i].st_word);
-	su->su_ga.ga_len = SUG_PROMPT_COUNT;
+	su->su_ga.ga_len = keep;
-	su->su_maxscore = stp[SUG_PROMPT_COUNT - 1].st_score;
+	su->su_maxscore = stp[keep - 1].st_score;
 }
 }
 /*
 * Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]".
 {
 fromto_T	*ftp;
 char_u	word[MAXWLEN];
 #ifdef FEAT_MBYTE
 int		l;
+int		found_mbyte = FALSE;
 #endif
 char_u	*s;
 char_u	*t;
 int		i, j, z;
 int		n, k = 0;
 int		c;
 int		pri;
 int		p0 = -333;
 int		c0;
-/* Remove accents, if wanted.
+/* Remove accents, if wanted.  We actually remove all non-word characters.
-* We actually remove all non-word characters. */
+* But keep white space. */
 if (slang->sl_rem_accents)
 {
 	t = word;
 	for (s = inword; *s != NUL; )
 	{
+	    if (vim_iswhite(*s))
+		*t++ = *s++;
 #ifdef FEAT_MBYTE
-	    if (has_mbyte)
+	    else if (has_mbyte)
 	    {
 		l = mb_ptr2len_check(s);
 		if (SPELL_ISWORDP(s))
 		{
 		    mch_memmove(t, s, l);
 		    t += l;
+		    if (l > 1)
+			found_mbyte = TRUE;
 		}
 		s += l;
 	    }
+#endif
 	    else
-#endif
 	    {
 		if (SPELL_ISWORDP(s))
 		    *t++ = *s;
 		++s;
 	    }
 	}
 	*t = NUL;
 }
 else
+{
+#ifdef FEAT_MBYTE
+	if (has_mbyte)
+	    for (s = inword; *s != NUL; s += l)
+		if ((l = mb_ptr2len_check(s)) > 1)
+		{
+		    found_mbyte = TRUE;
+		    break;
+		}
+#endif
 	STRCPY(word, inword);
+}
+#ifdef FEAT_MBYTE
+/* If there are multi-byte characters in the word return it as-is, because
+* the following won't work. */
+if (found_mbyte)
+{
+	STRCPY(res, word);
+	return;
+}
+#endif
 ftp = (fromto_T *)slang->sl_sal.ga_data;
 /*
 * This comes from Aspell phonet.cpp.  Converted from C++ to C.
+* Changed to keep spaces.
 * TODO: support for multi-byte chars.
 */
 i = j = z = 0;
 while ((c = word[i]) != NUL)
 {
 		if (*s == '^' && *(s + 1) == '^')
 		    s++;
 		if (*s == NUL
 			|| (*s == '^'
-			    && (i == 0 || !SPELL_ISWORDP(word + i - 1))
+			    && (i == 0 || !(word[i - 1] == ' '
+					      || SPELL_ISWORDP(word + i - 1)))
 			    && (*(s + 1) != '$'
 				|| (!SPELL_ISWORDP(word + i + k0))))
 			|| (*s == '$' && i > 0
 			    && SPELL_ISWORDP(word + i - 1)
 			    && (!SPELL_ISWORDP(word + i + k0))))
 		    break;
 		}
 		++n;
 	    }
 	}
+	else if (vim_iswhite(c))
+	{
+	    c = ' ';
+	    k = 1;
+	}
 	if (z0 == 0)
 	{
 	    if (k && !p0 && j < MAXWLEN && c != NUL
 		    && (!slang->sl_collapse || j == 0 || res[j - 1] != c))
 }
 res[j] = NUL;
 }
+#if defined(RESCORE) || defined(SOUNDFOLD_SCORE)
+/*
+* Return the score for how different "goodword" sounds from the bad word.
+*
+* "goodword" is case-folded and sound-folded here using "slang";
+* "badsound" must already be sound-folded by the caller.
+* The result is the spell_edit_score() of the two sound-folded words, lowered
+* by SCORE_DEL / 2 when they start with a different byte and one of them
+* starts with "*".
+*/
+static int
+spell_sound_score(slang, goodword, badsound)
+slang_T	*slang;
+char_u	*goodword;	/* good word */
+char_u	*badsound;	/* sound-folded bad word */
+{
+char_u	fword[MAXWLEN];
+char_u	goodsound[MAXWLEN];
+int		score;
+/* Case-fold the word, needed for sound folding.  The return value of
+* spell_casefold() is deliberately ignored. */
+(void)spell_casefold(goodword, STRLEN(goodword), fword, MAXWLEN);
+/* sound-fold the good word */
+spell_soundfold(slang, fword, goodsound);
+/* compute the edit distance-score of the sounds */
+score = spell_edit_score(badsound, goodsound);
+/* Correction: adding/inserting "*" at the start (word starts with vowel)
+* shouldn't be counted so much, since vowels halfway through the word aren't
+* counted at all. */
+if (*badsound != *goodsound && (*badsound == '*' || *goodsound == '*'))
+	score -= SCORE_DEL / 2;
+return score;
+}
+#endif
 /*
 * Compute the "edit distance" to turn "badword" into "goodword".  The fewer
 * deletes/inserts/swaps are required the lower the score.
+*
 * The algorithm comes from Aspell editdist.cpp, edit_distance().
-* TODO: make this work with multi-byte chars.
+* It has been converted from C++ to C and modified to support multi-byte
+* characters.
 */
 static int
 spell_edit_score(badword, goodword)
 char_u	*badword;
 char_u	*goodword;
 int		*cnt;
 int		badlen, goodlen;
 int		j, i;
 int		t;
 int		bc, gc;
+int		pbc, pgc;
+#ifdef FEAT_MBYTE
+char_u	*p;
+int		wbadword[MAXWLEN];
+int		wgoodword[MAXWLEN];
+if (has_mbyte)
+{
+	/* Get the characters from the multi-byte strings and put them in an
+	 * int array for easy access. */
+	for (p = badword, badlen = 0; *p != NUL; )
+	    wbadword[badlen++] = mb_ptr2char_adv(&p);
+	++badlen;
+	for (p = goodword, goodlen = 0; *p != NUL; )
+	    wgoodword[goodlen++] = mb_ptr2char_adv(&p);
+	++goodlen;
+}
+else
+#endif
+{
+	badlen = STRLEN(badword) + 1;
+	goodlen = STRLEN(goodword) + 1;
+}
 /* We use "cnt" as an array: CNT(badword_idx, goodword_idx). */
 #define CNT(a, b)   cnt[(a) + (b) * (badlen + 1)]
-badlen = STRLEN(badword) + 1;
-goodlen = STRLEN(goodword) + 1;
 cnt = (int *)lalloc((long_u)(sizeof(int) * (badlen + 1) * (goodlen + 1)),
 									TRUE);
-if (cnt == 0)
+if (cnt == NULL)
-	return 0;
+	return 0;	/* out of memory */
 CNT(0, 0) = 0;
 for (j = 1; j <= goodlen; ++j)
 	CNT(0, j) = CNT(0, j - 1) + SCORE_DEL;
 for (i = 1; i <= badlen; ++i)
 {
 	CNT(i, 0) = CNT(i - 1, 0) + SCORE_INS;
 	for (j = 1; j <= goodlen; ++j)
 	{
-	    bc = badword[i - 1];
+#ifdef FEAT_MBYTE
-	    gc = goodword[j - 1];
+	    if (has_mbyte)
+	    {
+		bc = wbadword[i - 1];
+		gc = wgoodword[j - 1];
+	    }
+	    else
+#endif
+	    {
+		bc = badword[i - 1];
+		gc = goodword[j - 1];
+	    }
 	    if (bc == gc)
 		CNT(i, j) = CNT(i - 1, j - 1);
 	    else
 	    {
 		/* Use a better score when there is only a case difference. */
-		if (spelltab.st_fold[bc] == spelltab.st_fold[gc])
+		if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
 		    CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1);
 		else
 		    CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1);
-		if (i > 1 && j > 1 && bc == goodword[j - 2]
+		if (i > 1 && j > 1)
-						      && badword[i - 2] == gc)
 		{
-		    t = SCORE_SWAP + CNT(i - 2, j - 2);
+#ifdef FEAT_MBYTE
-		    if (t < CNT(i, j))
+		    if (has_mbyte)
-			CNT(i, j) = t;
+		    {
+			pbc = wbadword[i - 2];
+			pgc = wgoodword[j - 2];
+		    }
+		    else
+#endif
+		    {
+			pbc = badword[i - 2];
+			pgc = goodword[j - 2];
+		    }
+		    if (bc == pgc && pbc == gc)
+		    {
+			t = SCORE_SWAP + CNT(i - 2, j - 2);
+			if (t < CNT(i, j))
+			    CNT(i, j) = t;
+		    }
 		}
 		t = SCORE_DEL + CNT(i - 1, j);
 		if (t < CNT(i, j))
 		    CNT(i, j) = t;
 		t = SCORE_INS + CNT(i, j - 1);

Mercurial > vim

comparison src/spell.c @ 324:548525d9da24