Mercurial > vim

diff src/spell.c @ 375:f14cbd913415 v7.0097
updated for version 7.0097
author: vimboss
date: Wed, 29 Jun 2005 22:40:58 +0000
parents: a698eb686ded
children: d2bc505a6d91
--- a/src/spell.c
+++ b/src/spell.c
@@ -134,14 +134,17 @@
  *			    SAL_F0LLOWUP
  *			    SAL_COLLAPSE
  *			    SAL_REM_ACCENTS
+ *			    SAL_SOFO: SOFOFROM and SOFOTO used instead of SAL
+ *
+ * <salcount>   2 bytes	    number of <sal> items following
  *
  * <sal> : <salfromlen> <salfrom> <saltolen> <salto>
  *
- * <salfromlen>	1 byte	    length of <salfrom>
+ * <salfromlen>	1-2 bytes    length of <salfrom> (2 bytes for SAL_SOFO)
  *
  * <salfrom>	N bytes	    "from" part of soundsalike
  *
- * <saltolen>	1 byte	    length of <salto>
+ * <saltolen>	1-2 bytes    length of <salto> (2 bytes for SAL_SOFO)
  *
  * <salto>	N bytes	    "to" part of soundsalike
  *
@@ -267,16 +270,22 @@ typedef struct salitem_S
 {
     char_u	*sm_lead;	/* leading letters */
     int		sm_leadlen;	/* length of "sm_lead" */
-    char_u	*sm_oneoff;	/* letters from () or NULL */
+    char_u	*sm_oneof;	/* letters from () or NULL */
     char_u	*sm_rules;	/* rules like ^, $, priority */
     char_u	*sm_to;		/* replacement. */
 #ifdef FEAT_MBYTE
     int		*sm_lead_w;	/* wide character copy of "sm_lead" */
-    int		*sm_oneoff_w;	/* wide character copy of "sm_oneoff" */
+    int		*sm_oneof_w;	/* wide character copy of "sm_oneof" */
     int		*sm_to_w;	/* wide character copy of "sm_to" */
 #endif
 } salitem_T;
 
+#ifdef FEAT_MBYTE
+typedef int salfirst_T;
+#else
+typedef short salfirst_T;
+#endif
+
 /*
  * Structure used to store words and other info for one language, loaded from
  * a .spl file.
@@ -316,8 +325,11 @@ struct slang_S
     short	sl_rep_first[256];  /* indexes where byte first appears, -1 if
 				       there is none */
     garray_T	sl_sal;		/* list of salitem_T entries from SAL lines */
-    short	sl_sal_first[256];  /* indexes where byte first appears, -1 if
+    salfirst_T	sl_sal_first[256];  /* indexes where byte first appears, -1 if
 				       there is none */
+    int		sl_sofo;	/* SOFOFROM and SOFOTO instead of SAL items:
+				 * "sl_sal_first" maps chars, when has_mbyte
+				 * "sl_sal" is a list of wide char lists. */
     int		sl_followup;	/* SAL followup */
     int		sl_collapse;	/* SAL collapse_result */
     int		sl_rem_accents;	/* SAL remove_accents */
@@ -338,6 +350,7 @@ static slang_T *first_lang = NULL;
 #define SAL_F0LLOWUP		1
 #define SAL_COLLAPSE		2
 #define SAL_REM_ACCENTS		4
+#define SAL_SOFO		8   /* SOFOFROM and SOFOTO instead of SAL */
 
 /*
  * Structure used in "b_langp", filled from 'spelllang'.
@@ -501,6 +514,7 @@ typedef enum
     STATE_START = 0,	/* At start of node check for NUL bytes (goodword
 			 * ends); if badword ends there is a match, otherwise
 			 * try splitting word. */
+    STATE_NOPREFIX,	/* try without prefix */
     STATE_SPLITUNDO,	/* Undo splitting. */
     STATE_ENDNUL,	/* Past NUL bytes at start of the node. */
     STATE_PLAIN,	/* Use each byte of the node. */
@@ -530,6 +544,8 @@ typedef struct trystate_S
     char_u	ts_fidx;	/* index in fword[], case-folded bad word */
     char_u	ts_fidxtry;	/* ts_fidx at which bytes may be changed */
     char_u	ts_twordlen;	/* valid length of tword[] */
+    char_u	ts_prefixdepth;	/* stack depth for end of prefix or PREFIXTREE
+				 * or NOPREFIX */
 #ifdef FEAT_MBYTE
     char_u	ts_tcharlen;	/* number of bytes in tword character */
     char_u	ts_tcharidx;	/* current byte index in tword character */
@@ -546,6 +562,10 @@ typedef struct trystate_S
 #define DIFF_YES	1	/* different byte found */
 #define DIFF_INSERT	2	/* inserting character */
 
+/* special values ts_prefixdepth */
+#define PREFIXTREE	0xfe	/* walking through the prefix tree */
+#define NOPREFIX	0xff	/* not using prefixes */
+
 /* mode values for find_word */
 #define FIND_FOLDWORD	0	/* find word case-folded */
 #define FIND_KEEPWORD	1	/* find keep-case word */
@@ -601,9 +621,11 @@ static int was_banned __ARGS((suginfo_T 
 static void free_banned __ARGS((suginfo_T *su));
 static void rescore_suggestions __ARGS((suginfo_T *su));
 static int cleanup_suggestions __ARGS((garray_T *gap, int maxscore, int keep));
-static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, char_u *res));
+static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, int folded, char_u *res));
+static void spell_soundfold_sofo __ARGS((slang_T *slang, char_u *inword, char_u *res));
+static void spell_soundfold_sal __ARGS((slang_T *slang, char_u *inword, char_u *res));
 #ifdef FEAT_MBYTE
-static void spell_soundfold_w __ARGS((slang_T *slang, char_u *inword, char_u *res));
+static void spell_soundfold_wsal __ARGS((slang_T *slang, char_u *inword, char_u *res));
 #endif
 static int soundalike_score __ARGS((char_u *goodsound, char_u *badsound));
 static int spell_edit_score __ARGS((char_u *badword, char_u *goodword));
@@ -1107,16 +1129,16 @@ find_prefix(mip)
     char_u	*byts;
     idx_T	*idxs;
 
+    byts = slang->sl_pbyts;
+    if (byts == NULL)
+	return;			/* array is empty */
+
     /* We use the case-folded word here, since prefixes are always
      * case-folded. */
     ptr = mip->mi_fword;
     flen = mip->mi_fwordlen;    /* available case-folded bytes */
-    byts = slang->sl_pbyts;
     idxs = slang->sl_pidxs;
 
-    if (byts == NULL)
-	return;			/* array is empty */
-
     /*
      * Repeat advancing in the tree until:
      * - there is a byte that doesn't match,
@@ -1562,12 +1584,24 @@ slang_clear(lp)
     ga_clear(gap);
 
     gap = &lp->sl_sal;
-    while (gap->ga_len > 0)
-    {
-	smp = &((salitem_T *)gap->ga_data)[--gap->ga_len];
-	vim_free(smp->sm_lead);
-	vim_free(smp->sm_to);
-    }
+    if (lp->sl_sofo)
+	/* SOFOFROM and SOFOTO items: free lists of wide characters. */
+	for (i = 0; i < gap->ga_len; ++i)
+	    vim_free(((int **)gap->ga_data)[i]);
+    else
+	/* SAL items: free salitem_T items */
+	while (gap->ga_len > 0)
+	{
+	    smp = &((salitem_T *)gap->ga_data)[--gap->ga_len];
+	    vim_free(smp->sm_lead);
+	    /* Don't free sm_oneof and sm_rules, they point into sm_lead. */
+	    vim_free(smp->sm_to);
+#ifdef FEAT_MBYTE
+	    vim_free(smp->sm_lead_w);
+	    vim_free(smp->sm_oneof_w);
+	    vim_free(smp->sm_to_w);
+#endif
+	}
     ga_clear(gap);
 
     for (i = 0; i < lp->sl_prefixcnt; ++i)
@@ -1638,6 +1672,7 @@ spell_load_file(fname, lang, old_lp, sil
     salitem_T	*smp;
     int		rr;
     short	*first;
+    salfirst_T	*sfirst;
     idx_T	idx;
     int		c = 0;
 
@@ -1895,151 +1930,282 @@ formerr:
 	lp->sl_collapse = TRUE;
     if (i & SAL_REM_ACCENTS)
 	lp->sl_rem_accents = TRUE;
+    if (i & SAL_SOFO)
+	lp->sl_sofo = TRUE;
 
     cnt = (getc(fd) << 8) + getc(fd);		/* <salcount> */
     if (cnt < 0)
 	goto formerr;
 
-    gap = &lp->sl_sal;
-    if (ga_grow(gap, cnt) == FAIL)
-	goto endFAIL;
-
-    /* <sal> : <salfromlen> <salfrom> <saltolen> <salto> */
-    for (; gap->ga_len < cnt; ++gap->ga_len)
-    {
-	smp = &((salitem_T *)gap->ga_data)[gap->ga_len];
-	ccnt = getc(fd);			/* <salfromlen> */
+    if (lp->sl_sofo)
+    {
+	/*
+	 * SOFOFROM and SOFOTO items come in one <salfrom> and <salto>
+	 */
+	if (cnt != 1)
+	    goto formerr;
+
+	cnt = (getc(fd) << 8) + getc(fd);		/* <salfromlen> */
+	if (cnt < 0)
+	    goto formerr;
+	if ((bp = alloc(cnt + 1)) == NULL)
+	    goto endFAIL;
+	for (i = 0; i < cnt; ++i)
+	    bp[i] = getc(fd);				/* <salfrom> */
+	bp[i] = NUL;
+
+	ccnt = (getc(fd) << 8) + getc(fd);		/* <saltolen> */
 	if (ccnt < 0)
+	{
+	    vim_free(bp);
 	    goto formerr;
-	if ((p = alloc(ccnt + 2)) == NULL)
+	}
+	if ((fol = alloc(ccnt + 1)) == NULL)
+	{
+	    vim_free(bp);
 	    goto endFAIL;
-	smp->sm_lead = p;
-
-	/* Read up to the first special char into sm_lead. */
+	}
 	for (i = 0; i < ccnt; ++i)
-	{
-	    c = getc(fd);			/* <salfrom> */
-	    if (vim_strchr((char_u *)"0123456789(-<^$", c) != NULL)
-		break;
-	    *p++ = c;
-	}
-	smp->sm_leadlen = p - smp->sm_lead;
-	*p++ = NUL;
-
-	/* Put optional chars in sm_oneoff, if any. */
-	if (c == '(')
-	{
-	    smp->sm_oneoff = p;
-	    for (++i; i < ccnt; ++i)
-	    {
-		c = getc(fd);			/* <salfrom> */
-		if (c == ')')
-		    break;
-		*p++ = c;
-	    }
-	    *p++ = NUL;
-	    if (++i < ccnt)
-		c = getc(fd);
-	}
-	else
-	    smp->sm_oneoff = NULL;
-
-	/* Any following chars go in sm_rules. */
-	smp->sm_rules = p;
-	if (i < ccnt)
-	    *p++ = c;
-	for (++i; i < ccnt; ++i)
-	    *p++ = getc(fd);			/* <salfrom> */
-	*p++ = NUL;
-
-	ccnt = getc(fd);			/* <saltolen> */
-	if (ccnt < 0)
-	{
-	    vim_free(smp->sm_lead);
-	    goto formerr;
-	}
-	if ((p = alloc(ccnt + 1)) == NULL)
-	{
-	    vim_free(smp->sm_lead);
-	    goto endFAIL;
-	}
-	smp->sm_to = p;
-
-	for (i = 0; i < ccnt; ++i)
-	    *p++ = getc(fd);			/* <salto> */
-	*p++ = NUL;
+	    fol[i] = getc(fd);				/* <salto> */
+	fol[i] = NUL;
 
 #ifdef FEAT_MBYTE
 	if (has_mbyte)
 	{
-	    /* convert the multi-byte strings to wide char strings */
-	    smp->sm_lead_w = mb_str2wide(smp->sm_lead);
-	    smp->sm_leadlen = mb_charlen(smp->sm_lead);
-	    if (smp->sm_oneoff == NULL)
-		smp->sm_oneoff_w = NULL;
+	    char_u	*s;
+
+	    /* Use "sl_sal" as an array with 256 pointers to a list of wide
+	     * characters.  The index is the low byte of the character.
+	     * The list contains from-to pairs with a terminating NUL.
+	     * sl_sal_first[] is used for latin1 "from" characters. */
+	    gap = &lp->sl_sal;
+	    ga_init2(gap, sizeof(int *), 1);
+	    if (ga_grow(gap, 256) == FAIL)
+	    {
+sofoFAIL:
+		vim_free(bp);
+		vim_free(fol);
+		goto endFAIL;
+	    }
+	    vim_memset(gap->ga_data, 0, sizeof(int *) * 256);
+	    gap->ga_len = 256;
+
+	    /* First count the number of items for each list.  Temporarily use
+	     * sl_sal_first[] for this. */
+	    for (p = bp, s = fol; *p != NUL && *s != NUL; )
+	    {
+		c = mb_ptr2char_adv(&p);
+		mb_ptr_adv(s);
+		if (c >= 256)
+		    ++lp->sl_sal_first[c & 0xff];
+	    }
+	    if (*p != NUL || *s != NUL)	    /* lengths differ */
+		goto sofoerr;
+
+	    /* Allocate the lists. */
+	    for (i = 0; i < 256; ++i)
+		if (lp->sl_sal_first[i] > 0)
+		{
+		    p = alloc(sizeof(int) * (lp->sl_sal_first[i] * 2 + 1));
+		    if (p == NULL)
+			goto sofoFAIL;
+		    ((int **)gap->ga_data)[i] = (int *)p;
+		    *(int *)p = 0;
+		}
+
+	    /* Put the characters in sl_sal_first[] or a sl_sal list. */
+	    vim_memset(lp->sl_sal_first, 0, sizeof(salfirst_T) * 256);
+	    for (p = bp, s = fol; *p != NUL && *s != NUL; )
+	    {
+		c = mb_ptr2char_adv(&p);
+		i = mb_ptr2char_adv(&s);
+		if (c >= 256)
+		{
+		    int	*inp;
+
+		    /* Append the from-to chars at the end of the list with
+		     * the low byte. */
+		    inp = ((int **)gap->ga_data)[c & 0xff];
+		    while (*inp != 0)
+			++inp;
+		    *inp++ = c;		/* from char */
+		    *inp++ = i;		/* to char */
+		    *inp++ = NUL;	/* NUL at the end */
+		}
+		else
+		    /* mapping byte to char is done in sl_sal_first[] */
+		    lp->sl_sal_first[c] = i;
+	    }
+	}
+	else
+#endif
+	{
+	    /* mapping bytes to bytes is done in sl_sal_first[] */
+	    if (cnt != ccnt)
+	    {
+#ifdef FEAT_MBYTE
+sofoerr:
+#endif
+		vim_free(bp);
+		vim_free(fol);
+		goto formerr;
+	    }
+	    for (i = 0; i < cnt; ++i)
+		lp->sl_sal_first[bp[i]] = fol[i];
+	    lp->sl_sal.ga_len = 1;	/* indicates we have soundfolding */
+	}
+	vim_free(bp);
+	vim_free(fol);
+    }
+    else
+    {
+	/*
+	 * SAL items
+	 */
+	gap = &lp->sl_sal;
+	if (ga_grow(gap, cnt) == FAIL)
+	    goto endFAIL;
+
+	/* <sal> : <salfromlen> <salfrom> <saltolen> <salto> */
+	for (; gap->ga_len < cnt; ++gap->ga_len)
+	{
+	    smp = &((salitem_T *)gap->ga_data)[gap->ga_len];
+	    ccnt = getc(fd);			/* <salfromlen> */
+	    if (ccnt < 0)
+		goto formerr;
+	    if ((p = alloc(ccnt + 2)) == NULL)
+		goto endFAIL;
+	    smp->sm_lead = p;
+
+	    /* Read up to the first special char into sm_lead. */
+	    for (i = 0; i < ccnt; ++i)
+	    {
+		c = getc(fd);			/* <salfrom> */
+		if (vim_strchr((char_u *)"0123456789(-<^$", c) != NULL)
+		    break;
+		*p++ = c;
+	    }
+	    smp->sm_leadlen = p - smp->sm_lead;
+	    *p++ = NUL;
+
+	    /* Put (abc) chars in sm_oneof, if any. */
+	    if (c == '(')
+	    {
+		smp->sm_oneof = p;
+		for (++i; i < ccnt; ++i)
+		{
+		    c = getc(fd);			/* <salfrom> */
+		    if (c == ')')
+			break;
+		    *p++ = c;
+		}
+		*p++ = NUL;
+		if (++i < ccnt)
+		    c = getc(fd);
+	    }
 	    else
-		smp->sm_oneoff_w = mb_str2wide(smp->sm_oneoff);
-	    smp->sm_to_w = mb_str2wide(smp->sm_to);
-	    if (smp->sm_lead_w == NULL
-		    || (smp->sm_oneoff_w == NULL && smp->sm_oneoff != NULL)
-		    || smp->sm_to_w == NULL)
+		smp->sm_oneof = NULL;
+
+	    /* Any following chars go in sm_rules. */
+	    smp->sm_rules = p;
+	    if (i < ccnt)
+		/* store the char we got while checking for end of sm_lead */
+		*p++ = c;
+	    for (++i; i < ccnt; ++i)
+		*p++ = getc(fd);			/* <salfrom> */
+	    *p++ = NUL;
+
+	    ccnt = getc(fd);			/* <saltolen> */
+	    if (ccnt < 0)
 	    {
 		vim_free(smp->sm_lead);
-		vim_free(smp->sm_to);
-		vim_free(smp->sm_lead_w);
-		vim_free(smp->sm_oneoff_w);
-		vim_free(smp->sm_to_w);
+		goto formerr;
+	    }
+	    if ((p = alloc(ccnt + 1)) == NULL)
+	    {
+		vim_free(smp->sm_lead);
 		goto endFAIL;
 	    }
-	}
-#endif
-    }
-
-    /* Fill the first-index table. */
-    first = lp->sl_sal_first;
-    for (i = 0; i < 256; ++i)
-	first[i] = -1;
-    for (i = 0; i < gap->ga_len; ++i)
-    {
-	smp = &((salitem_T *)gap->ga_data)[i];
-#ifdef FEAT_MBYTE
-	if (has_mbyte)
-	    /* Use the lowest byte of the first character. */
-	    c = *smp->sm_lead_w & 0xff;
-	else
-#endif
-	    c = *smp->sm_lead;
-	if (first[c] == -1)
-	{
-	    first[c] = i;
+	    smp->sm_to = p;
+
+	    for (i = 0; i < ccnt; ++i)
+		*p++ = getc(fd);			/* <salto> */
+	    *p++ = NUL;
+
 #ifdef FEAT_MBYTE
 	    if (has_mbyte)
 	    {
-		int	j;
-
-		/* Make sure all entries with this byte are following each
-		 * other.  Move the ones down that are in the wrong position.
-		 * Do keep the right sequence. */
-		while (i + 1 < gap->ga_len && (*smp[1].sm_lead_w & 0xff) == c)
+		/* convert the multi-byte strings to wide char strings */
+		smp->sm_lead_w = mb_str2wide(smp->sm_lead);
+		smp->sm_leadlen = mb_charlen(smp->sm_lead);
+		if (smp->sm_oneof == NULL)
+		    smp->sm_oneof_w = NULL;
+		else
+		    smp->sm_oneof_w = mb_str2wide(smp->sm_oneof);
+		smp->sm_to_w = mb_str2wide(smp->sm_to);
+		if (smp->sm_lead_w == NULL
+			|| (smp->sm_oneof_w == NULL && smp->sm_oneof != NULL)
+			|| smp->sm_to_w == NULL)
 		{
-		    ++i;
-		    ++smp;
+		    vim_free(smp->sm_lead);
+		    vim_free(smp->sm_to);
+		    vim_free(smp->sm_lead_w);
+		    vim_free(smp->sm_oneof_w);
+		    vim_free(smp->sm_to_w);
+		    goto endFAIL;
 		}
-		for (j = 1; i + j < gap->ga_len; ++j)
-		    if ((*smp[j].sm_lead_w & 0xff) == c)
-		    {
-			salitem_T  tsal;
-
-			++i;
-			++smp;
-			--j;
-			tsal = smp[j];
-			mch_memmove(smp + 1, smp, sizeof(salitem_T) * j);
-			*smp = tsal;
-		    }
 	    }
 #endif
 	}
+
+	/* Fill the first-index table. */
+	sfirst = lp->sl_sal_first;
+	for (i = 0; i < 256; ++i)
+	    sfirst[i] = -1;
+	smp = (salitem_T *)gap->ga_data;
+	for (i = 0; i < gap->ga_len; ++i)
+	{
+#ifdef FEAT_MBYTE
+	    if (has_mbyte)
+		/* Use the lowest byte of the first character.  For latin1 it's
+		 * the character, for other encodings it should differ for most
+		 * characters. */
+		c = *smp[i].sm_lead_w & 0xff;
+	    else
+#endif
+		c = *smp[i].sm_lead;
+	    if (sfirst[c] == -1)
+	    {
+		sfirst[c] = i;
+#ifdef FEAT_MBYTE
+		if (has_mbyte)
+		{
+		    /* Make sure all entries with this byte are following each
+		     * other.  Move the ones that are in the wrong position.  Do
+		     * keep the same ordering! */
+		    while (i + 1 < gap->ga_len
+					   && (*smp[i + 1].sm_lead_w & 0xff) == c)
+			/* Skip over entry with same index byte. */
+			++i;
+
+		    for (n = 1; i + n < gap->ga_len; ++n)
+			if ((*smp[i + n].sm_lead_w & 0xff) == c)
+			{
+			    salitem_T  tsal;
+
+			    /* Move entry with same index byte after the entries
+			     * we already found. */
+			    ++i;
+			    --n;
+			    tsal = smp[i + n];
+			    mch_memmove(smp + i + 1, smp + i,
+							   sizeof(salitem_T) * n);
+			    smp[i] = tsal;
+			}
+		}
+#endif
+	    }
+	}
     }
 
     cnt = (getc(fd) << 8) + getc(fd);		/* <maplen> */
@@ -2711,6 +2877,8 @@ typedef struct spellinfo_S
 
     garray_T	si_rep;		/* list of fromto_T entries from REP lines */
     garray_T	si_sal;		/* list of fromto_T entries from SAL lines */
+    char_u	*si_sofofr;	/* SOFOFROM text */
+    char_u	*si_sofoto;	/* SOFOTO text */
     int		si_followup;	/* soundsalike: ? */
     int		si_collapse;	/* soundsalike: ? */
     int		si_rem_accents;	/* soundsalike: remove accents */
@@ -2776,6 +2944,7 @@ spell_read_aff(fname, spin)
     int		do_sal;
     int		do_map;
     int		do_midword;
+    int		do_sofo;
     int		found_map = FALSE;
     hashitem_T	*hi;
 
@@ -2811,6 +2980,9 @@ spell_read_aff(fname, spin)
     /* Only do MIDWORD line when not done in another .aff file already */
     do_midword = spin->si_midword == NULL;
 
+    /* Only do SOFOFROM and SOFOTO when not done in another .aff file already */
+    do_sofo = spin->si_sofofr == NULL;
+
     /*
      * Allocate and init the afffile_T structure.
      */
@@ -2886,6 +3058,7 @@ spell_read_aff(fname, spin)
 							       p_enc) == FAIL)
 		    smsg((char_u *)_("Conversion in %s not supported: from %s to %s"),
 					       fname, aff->af_enc, p_enc);
+		spin->si_conv.vc_fail = TRUE;
 #else
 		    smsg((char_u *)_("Conversion in %s not supported"), fname);
 #endif
@@ -3165,12 +3338,30 @@ spell_read_aff(fname, spin)
 								: items[2]);
 		}
 	    }
+	    else if (STRCMP(items[0], "SOFOFROM") == 0 && itemcnt == 2
+				     && (!do_sofo || spin->si_sofofr == NULL))
+	    {
+		if (do_sofo)
+		    spin->si_sofofr = vim_strsave(items[1]);
+	    }
+	    else if (STRCMP(items[0], "SOFOTO") == 0 && itemcnt == 2
+				     && (!do_sofo || spin->si_sofoto == NULL))
+	    {
+		if (do_sofo)
+		    spin->si_sofoto = vim_strsave(items[1]);
+	    }
 	    else
 		smsg((char_u *)_("Unrecognized item in %s line %d: %s"),
 						       fname, lnum, items[0]);
 	}
     }
 
+    if (do_sofo && (spin->si_sofofr == NULL) != (spin->si_sofoto == NULL))
+	smsg((char_u *)_("Missing SOFO%s line in %s"),
+			      spin->si_sofofr == NULL ? "FROM" : "TO", fname);
+    if (spin->si_sofofr != NULL && spin->si_sal.ga_len > 0)
+	smsg((char_u *)_("Both SAL and SOFO lines in %s"), fname);
+
     if (fol != NULL || low != NULL || upp != NULL)
     {
 	if (spin->si_clear_chartab)
@@ -3449,7 +3640,7 @@ spell_read_dic(fname, spin, affile)
 	hi = hash_lookup(&ht, dw, hash);
 	if (!HASHITEM_EMPTY(hi))
 	    smsg((char_u *)_("Duplicate word in %s line %d: %s"),
-							      fname, lnum, w);
+							     fname, lnum, dw);
 	else
 	    hash_add_item(&ht, hi, dw, hash);
 
@@ -3797,6 +3988,7 @@ spell_read_wordfile(fname, spin)
 			smsg((char_u *)_("Conversion in %s not supported: from %s to %s"),
 							  fname, line, p_enc);
 		    vim_free(enc);
+		    spin->si_conv.vc_fail = TRUE;
 #else
 		    smsg((char_u *)_("Conversion in %s not supported"), fname);
 #endif
@@ -4376,6 +4568,8 @@ write_vim_spell(fname, spin)
     qsort(spin->si_rep.ga_data, (size_t)spin->si_rep.ga_len,
 					       sizeof(fromto_T), rep_compare);
 
+    /* round 1: REP items
+     * round 2: SAL items (unless SOFO is used) */
     for (round = 1; round <= 2; ++round)
     {
 	if (round == 1)
@@ -4391,7 +4585,11 @@ write_vim_spell(fname, spin)
 		i |= SAL_COLLAPSE;
 	    if (spin->si_rem_accents)
 		i |= SAL_REM_ACCENTS;
+	    if (spin->si_sofofr != NULL && spin->si_sofoto != NULL)
+		i |= SAL_SOFO;
 	    putc(i, fd);			/* <salflags> */
+	    if (i & SAL_SOFO)
+		break;
 	}
 
 	put_bytes(fd, (long_u)gap->ga_len, 2);	/* <repcount> or <salcount> */
@@ -4410,6 +4608,20 @@ write_vim_spell(fname, spin)
 	}
     }
 
+    /* SOFOFROM and SOFOTO */
+    if (spin->si_sofofr != NULL && spin->si_sofoto != NULL)
+    {
+	put_bytes(fd, 1L, 2);				/* <salcount> */
+
+	l = STRLEN(spin->si_sofofr);
+	put_bytes(fd, (long_u)l, 2);			/* <salfromlen> */
+	fwrite(spin->si_sofofr, l, (size_t)1, fd);	/* <salfrom> */
+
+	l = STRLEN(spin->si_sofoto);
+	put_bytes(fd, (long_u)l, 2);			/* <saltolen> */
+	fwrite(spin->si_sofoto, l, (size_t)1, fd);	/* <salto> */
+    }
+
     put_bytes(fd, (long_u)spin->si_map.ga_len, 2);	/* <maplen> */
     if (spin->si_map.ga_len > 0)			/* <mapstr> */
 	fwrite(spin->si_map.ga_data, (size_t)spin->si_map.ga_len,
@@ -4869,6 +5081,8 @@ mkspell(fcount, fnames, ascii, overwrite
 	ga_clear(&spin.si_map);
 	ga_clear(&spin.si_prefcond);
 	vim_free(spin.si_midword);
+	vim_free(spin.si_sofofr);
+	vim_free(spin.si_sofoto);
 
 	/* Free the .aff file structures. */
 	for (i = 0; i < incount; ++i)
@@ -5535,7 +5749,7 @@ spell_suggest()
 	while (p > line && SPELL_ISWORDP(p))
 	    mb_ptr_back(line, p);
 	/* Forward to start of word. */
-	while (!SPELL_ISWORDP(p))
+	while (*p != NUL && !SPELL_ISWORDP(p))
 	    mb_ptr_adv(p);
 
 	if (!SPELL_ISWORDP(p))		/* No word found. */
@@ -5813,8 +6027,8 @@ spell_find_suggest(badptr, su, maxcount,
 	if (STRNCMP(buf, "expr:", 5) == 0)
 	{
 #ifdef FEAT_EVAL
-	    /* Evaluate an expression.  Skip this when called recursively
-	     * (using spellsuggest() in the expression). */
+	    /* Evaluate an expression.  Skip this when called recursively,
+	     * when using spellsuggest() in the expression. */
 	    if (!expr_busy)
 	    {
 		expr_busy = TRUE;
@@ -6151,8 +6365,8 @@ suggest_try_change(su)
     trystate_T	*sp;
     int		newscore;
     langp_T	*lp;
-    char_u	*byts;
-    idx_T	*idxs;
+    char_u	*byts, *fbyts, *pbyts;
+    idx_T	*idxs, *fidxs, *pidxs;
     int		depth;
     int		c, c2, c3;
     int		n = 0;
@@ -6182,22 +6396,42 @@ suggest_try_change(su)
 	 * "fword[]" the word we are trying to match with (initially the bad
 	 * word).
 	 */
-	byts = lp->lp_slang->sl_fbyts;
-	idxs = lp->lp_slang->sl_fidxs;
-
 	depth = 0;
-	stack[0].ts_state = STATE_START;
-	stack[0].ts_score = 0;
-	stack[0].ts_curi = 1;
-	stack[0].ts_fidx = 0;
-	stack[0].ts_fidxtry = 0;
-	stack[0].ts_twordlen = 0;
-	stack[0].ts_arridx = 0;
+	sp = &stack[0];
+	sp->ts_state = STATE_START;
+	sp->ts_score = 0;
+	sp->ts_curi = 1;
+	sp->ts_fidx = 0;
+	sp->ts_fidxtry = 0;
+	sp->ts_twordlen = 0;
+	sp->ts_arridx = 0;
 #ifdef FEAT_MBYTE
-	stack[0].ts_tcharlen = 0;
+	sp->ts_tcharlen = 0;
 #endif
 
 	/*
+	 * When there are postponed prefixes we need to use these first.  At
+	 * the end of the prefix we continue in the case-fold tree.
+	 */
+	fbyts = lp->lp_slang->sl_fbyts;
+	fidxs = lp->lp_slang->sl_fidxs;
+	pbyts = lp->lp_slang->sl_pbyts;
+	pidxs = lp->lp_slang->sl_pidxs;
+	if (pbyts != NULL)
+	{
+	    byts = pbyts;
+	    idxs = pidxs;
+	    sp->ts_prefixdepth = PREFIXTREE;
+	    sp->ts_state = STATE_NOPREFIX;	/* try without prefix first */
+	}
+	else
+	{
+	    byts = fbyts;
+	    idxs = fidxs;
+	    sp->ts_prefixdepth = NOPREFIX;
+	}
+
+	/*
 	 * Loop to find all suggestions.  At each round we either:
 	 * - For the current state try one operation, advance "ts_curi",
 	 *   increase "depth".
@@ -6210,6 +6444,7 @@ suggest_try_change(su)
 	    switch (sp->ts_state)
 	    {
 	    case STATE_START:
+	    case STATE_NOPREFIX:
 		/*
 		 * Start of node: Deal with NUL bytes, which means
 		 * tword[] may end here.
@@ -6218,6 +6453,40 @@ suggest_try_change(su)
 		len = byts[arridx];	    /* bytes in this node */
 		arridx += sp->ts_curi;	    /* index of current byte */
 
+		if (sp->ts_prefixdepth == PREFIXTREE)
+		{
+		    /* Skip over the NUL bytes, we use them later. */
+		    for (n = 0; n < len && byts[arridx + n] == 0; ++n)
+			;
+		    sp->ts_curi += n;
+
+		    /* At end of a prefix or at start of prefixtree: check for
+		     * following word. */
+		    if (byts[arridx] == 0 || sp->ts_state == STATE_NOPREFIX)
+		    {
+			sp->ts_state = STATE_START;
+			++depth;
+			stack[depth] = stack[depth - 1];
+			sp = &stack[depth];
+			sp->ts_prefixdepth = depth - 1;
+			byts = fbyts;
+			idxs = fidxs;
+			sp->ts_state = STATE_START;
+			sp->ts_curi = 1;   /* start just after length byte */
+			sp->ts_arridx = 0;
+
+			/* Move the prefix to preword[] so that
+			 * find_keepcap_word() works. */
+			prewordlen = splitoff = sp->ts_twordlen;
+			mch_memmove(preword, tword, splitoff);
+			break;
+		    }
+
+		    /* Always past NUL bytes now. */
+		    sp->ts_state = STATE_ENDNUL;
+		    break;
+		}
+
 		if (sp->ts_curi > len || byts[arridx] != 0)
 		{
 		    /* Past bytes in node and/or past NUL bytes. */
@@ -6232,6 +6501,31 @@ suggest_try_change(su)
 
 		flags = (int)idxs[arridx];
 
+		if (sp->ts_prefixdepth < MAXWLEN)
+		{
+		    /* There was a prefix before the word.  Check that the
+		     * prefix can be used with this word. */
+		    /* Count the length of the NULs in the prefix.  If there
+		     * are none this must be the first try without a prefix.
+		     */
+		    n = stack[sp->ts_prefixdepth].ts_arridx;
+		    len = pbyts[n++];
+		    for (c = 0; c < len && pbyts[n + c] == 0; ++c)
+			;
+		    if (c > 0)
+		    {
+			/* The prefix ID is stored two bytes above the flags. */
+			c = valid_word_prefix(c, n, (unsigned)flags >> 16,
+					      tword + splitoff, lp->lp_slang);
+			if (c == 0)
+			    break;
+
+			/* Use the WF_RARE flag for a rare prefix. */
+			if (c & WF_RAREPFX)
+			    flags |= WF_RARE;
+		    }
+		}
+
 		/*
 		 * Form the word with proper case in preword.
 		 * If there is a word from a previous split, append.
@@ -6945,6 +7239,14 @@ suggest_try_change(su)
 		/* Did all possible states at this level, go up one level. */
 		--depth;
 
+		if (depth >= 0 && stack[depth].ts_prefixdepth == PREFIXTREE)
+		{
+		    /* Continue in or go back to the prefix tree. */
+		    byts = pbyts;
+		    idxs = pidxs;
+		    splitoff = 0;
+		}
+
 		/* Don't check for CTRL-C too often, it takes time. */
 		line_breakcheck();
 	    }
@@ -7161,7 +7463,7 @@ score_comp_sal(su)
 	if (lp->lp_slang->sl_sal.ga_len > 0)
 	{
 	    /* soundfold the bad word */
-	    spell_soundfold(lp->lp_slang, su->su_fbadword, badsound);
+	    spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, badsound);
 
 	    for (i = 0; i < su->su_ga.ga_len; ++i)
 	    {
@@ -7213,7 +7515,7 @@ score_combine(su)
 	if (lp->lp_slang->sl_sal.ga_len > 0)
 	{
 	    /* soundfold the bad word */
-	    spell_soundfold(lp->lp_slang, su->su_fbadword, badsound);
+	    spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, badsound);
 
 	    for (i = 0; i < su->su_ga.ga_len; ++i)
 	    {
@@ -7320,14 +7622,12 @@ stp_sal_score(stp, su, slang, badsound)
 	    for (p = fword; *(p = skiptowhite(p)) != NUL; )
 		mch_memmove(p, p + 1, STRLEN(p));
 
-	spell_soundfold(slang, fword, badsound2);
+	spell_soundfold(slang, fword, TRUE, badsound2);
 	p = badsound2;
     }
 
-    /* Case-fold the word, sound-fold the word and compute the score for the
-     * difference. */
-    (void)spell_casefold(stp->st_word, STRLEN(stp->st_word), fword, MAXWLEN);
-    spell_soundfold(slang, fword, goodsound);
+    /* Sound-fold the word and compute the score for the difference. */
+    spell_soundfold(slang, stp->st_word, FALSE, goodsound);
 
     return soundalike_score(goodsound, p);
 }
@@ -7341,7 +7641,6 @@ suggest_try_soundalike(su)
 {
     char_u	salword[MAXWLEN];
     char_u	tword[MAXWLEN];
-    char_u	tfword[MAXWLEN];
     char_u	tsalword[MAXWLEN];
     idx_T	arridx[MAXWLEN];
     int		curi[MAXWLEN];
@@ -7362,7 +7661,7 @@ suggest_try_soundalike(su)
 	if (lp->lp_slang->sl_sal.ga_len > 0)
 	{
 	    /* soundfold the bad word */
-	    spell_soundfold(lp->lp_slang, su->su_fbadword, salword);
+	    spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, salword);
 
 	    /*
 	     * Go through the whole tree, soundfold each word and compare.
@@ -7406,18 +7705,10 @@ suggest_try_soundalike(su)
 			    if (round == 2 || (flags & WF_KEEPCAP) == 0)
 			    {
 				tword[depth] = NUL;
-				if (round == 1)
-				    spell_soundfold(lp->lp_slang,
-							     tword, tsalword);
-				else
-				{
-				    /* In keep-case tree need to case-fold the
-				     * word. */
-				    (void)spell_casefold(tword, depth,
-							     tfword, MAXWLEN);
-				    spell_soundfold(lp->lp_slang,
-							    tfword, tsalword);
-				}
+				/* Sound-fold.  Only in keep-case tree need to
+				 * case-fold the word. */
+				spell_soundfold(lp->lp_slang, tword,
+							round == 1, tsalword);
 
 				/* Compute the edit distance between the
 				 * sound-a-like words. */
@@ -7812,7 +8103,7 @@ rescore_suggestions(su)
 	if (lp->lp_slang->sl_sal.ga_len > 0)
 	{
 	    /* soundfold the bad word */
-	    spell_soundfold(lp->lp_slang, su->su_fbadword, sal_badword);
+	    spell_soundfold(lp->lp_slang, su->su_fbadword, TRUE, sal_badword);
 
 	    for (i = 0; i < su->su_ga.ga_len; ++i)
 	    {
@@ -7896,7 +8187,6 @@ eval_soundfold(word)
     char_u	*word;
 {
     langp_T	*lp;
-    char_u	fword[MAXWLEN];
     char_u	sound[MAXWLEN];
 
     if (curwin->w_p_spell && *curbuf->b_p_spl != NUL)
@@ -7905,11 +8195,8 @@ eval_soundfold(word)
 						   lp->lp_slang != NULL; ++lp)
 	    if (lp->lp_slang->sl_sal.ga_len > 0)
 	    {
-		/* word most be case-folded first. */
-		(void)spell_casefold(word, STRLEN(word), fword, MAXWLEN);
-
 		/* soundfold the word */
-		spell_soundfold(lp->lp_slang, fword, sound);
+		spell_soundfold(lp->lp_slang, word, FALSE, sound);
 		return vim_strsave(sound);
 	    }
 
@@ -7922,14 +8209,125 @@ eval_soundfold(word)
  * Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]".
  */
     static void
-spell_soundfold(slang, inword, res)
+spell_soundfold(slang, inword, folded, res)
+    slang_T	*slang;
+    char_u	*inword;
+    int		folded;	    /* "inword" is already case-folded */
+    char_u	*res;
+{
+    char_u	fword[MAXWLEN];
+    char_u	*word;
+
+    if (slang->sl_sofo)
+	/* SOFOFROM and SOFOTO used */
+	spell_soundfold_sofo(slang, inword, res);
+    else
+    {
+	/* SAL items used.  Requires the word to be case-folded. */
+	if (folded)
+	    word = inword;
+	else
+	{
+	    (void)spell_casefold(inword, STRLEN(inword), fword, MAXWLEN);
+	    word = fword;
+	}
+
+#ifdef FEAT_MBYTE
+	if (has_mbyte)
+	    spell_soundfold_wsal(slang, word, res);
+	else
+#endif
+	    spell_soundfold_sal(slang, word, res);
+    }
+}
+
+/*
+ * Perform sound folding of "inword" into "res" according to SOFOFROM and
+ * SOFOTO lines.
+ */
+    static void
+spell_soundfold_sofo(slang, inword, res)
+    slang_T	*slang;
+    char_u	*inword;
+    char_u	*res;
+{
+    char_u	*s;
+    int		ri = 0;
+    int		c;
+
+#ifdef FEAT_MBYTE
+    if (has_mbyte)
+    {
+	int	prevc = 0;
+	int	*ip;
+
+	/* The sl_sal_first[] table contains the translation for chars up to
+	 * 255, sl_sal the rest. */
+	for (s = inword; *s != NUL; )
+	{
+	    c = mb_ptr2char_adv(&s);
+	    if (enc_utf8 ? utf_class(c) == 0 : vim_iswhite(c))
+		c = ' ';
+	    else if (c < 256)
+		c = slang->sl_sal_first[c];
+	    else
+	    {
+		ip = ((int **)slang->sl_sal.ga_data)[c & 0xff];
+		if (ip == NULL)		/* empty list, can't match */
+		    c = NUL;
+		else
+		    for (;;)		/* find "c" in the list */
+		    {
+			if (*ip == 0)	/* not found */
+			{
+			    c = NUL;
+			    break;
+			}
+			if (*ip == c)	/* match! */
+			{
+			    c = ip[1];
+			    break;
+			}
+			ip += 2;
+		    }
+	    }
+
+	    if (c != NUL && c != prevc)
+	    {
+		ri += mb_char2bytes(c, res + ri);
+		if (ri + MB_MAXBYTES > MAXWLEN)
+		    break;
+		prevc = c;
+	    }
+	}
+    }
+    else
+#endif
+    {
+	/* The sl_sal_first[] table contains the translation. */
+	for (s = inword; (c = *s) != NUL; ++s)
+	{
+	    if (vim_iswhite(c))
+		c = ' ';
+	    else
+		c = slang->sl_sal_first[c];
+	    if (c != NUL && (ri == 0 || res[ri - 1] != c))
+		res[ri++] = c;
+	}
+    }
+
+    res[ri] = NUL;
+}
+
+    static void
+spell_soundfold_sal(slang, inword, res)
     slang_T	*slang;
     char_u	*inword;
     char_u	*res;
 {
     salitem_T	*smp;
     char_u	word[MAXWLEN];
-    char_u	*s;
+    char_u	*s = inword;
     char_u	*t;
     char_u	*pf;
     int		i, j, z;
@@ -7943,21 +8341,12 @@ spell_soundfold(slang, inword, res)
     int		p0 = -333;
     int		c0;
 
-#ifdef FEAT_MBYTE
-    if (has_mbyte)
-    {
-	/* Call the multi-byte version of this. */
-	spell_soundfold_w(slang, inword, res);
-	return;
-    }
-#endif
-
     /* Remove accents, if wanted.  We actually remove all non-word characters.
-     * But keep white space. */
+     * But keep white space.  We need a copy, the word may be changed here. */
     if (slang->sl_rem_accents)
     {
 	t = word;
-	for (s = inword; *s != NUL; )
+	while (*s != NUL)
 	{
 	    if (vim_iswhite(*s))
 	    {
@@ -7974,7 +8363,7 @@ spell_soundfold(slang, inword, res)
 	*t = NUL;
     }
     else
-	STRCPY(word, inword);
+	STRCPY(word, s);
 
     smp = (salitem_T *)slang->sl_sal.ga_data;
 
@@ -8011,9 +8400,9 @@ spell_soundfold(slang, inword, res)
 		    }
 		}
 
-		if ((pf = smp[n].sm_oneoff) != NULL)
+		if ((pf = smp[n].sm_oneof) != NULL)
 		{
-		    /* Check for match with one of the chars in "sm_oneoff". */
+		    /* Check for match with one of the chars in "sm_oneof". */
 		    while (*pf != NUL && *pf != word[i + k])
 			++pf;
 		    if (*pf == NUL)
@@ -8081,10 +8470,10 @@ spell_soundfold(slang, inword, res)
 			    }
 			    k0 += k - 1;
 
-			    if ((pf = smp[n0].sm_oneoff) != NULL)
+			    if ((pf = smp[n0].sm_oneof) != NULL)
 			    {
 				/* Check for match with one of the chars in
-				 * "sm_oneoff". */
+				 * "sm_oneof". */
 				while (*pf != NUL && *pf != word[i + k0])
 				    ++pf;
 				if (*pf == NUL)
@@ -8211,12 +8600,12 @@ spell_soundfold(slang, inword, res)
  * Multi-byte version of spell_soundfold().
  */
     static void
-spell_soundfold_w(slang, inword, res)
+spell_soundfold_wsal(slang, inword, res)
     slang_T	*slang;
     char_u	*inword;
     char_u	*res;
 {
-    salitem_T	*smp;
+    salitem_T	*smp = (salitem_T *)slang->sl_sal.ga_data;
     int		word[MAXWLEN];
     int		wres[MAXWLEN];
     int		l;
@@ -8266,8 +8655,6 @@ spell_soundfold_w(slang, inword, res)
     }
     word[n] = NUL;
 
-    smp = (salitem_T *)slang->sl_sal.ga_data;
-
     /*
      * This comes from Aspell phonet.cpp.
      * Converted from C++ to C.  Added support for multi-byte chars.
@@ -8282,11 +8669,13 @@ spell_soundfold_w(slang, inword, res)
 
 	if (n >= 0)
 	{
-	    /* check all rules for the same letter */
+	    /* check all rules for the same index byte */
 	    for (; ((ws = smp[n].sm_lead_w)[0] & 0xff) == (c & 0xff); ++n)
 	    {
 		/* Quickly skip entries that don't match the word.  Most
 		 * entries are less then three chars, optimize for that. */
+		if (c != ws[0])
+		    continue;
 		k = smp[n].sm_leadlen;
 		if (k > 1)
 		{
@@ -8302,9 +8691,9 @@ spell_soundfold_w(slang, inword, res)
 		    }
 		}
 
-		if ((pf = smp[n].sm_oneoff_w) != NULL)
+		if ((pf = smp[n].sm_oneof_w) != NULL)
 		{
-		    /* Check for match with one of the chars in "sm_oneoff". */
+		    /* Check for match with one of the chars in "sm_oneof". */
 		    while (*pf != NUL && *pf != word[i + k])
 			++pf;
 		    if (*pf == NUL)
@@ -8350,12 +8739,15 @@ spell_soundfold_w(slang, inword, res)
 		    if (slang->sl_followup && k > 1 && n0 >= 0
 					   && p0 != '-' && word[i + k] != NUL)
 		    {
-			/* test follow-up rule for "word[i + k]" */
+			/* Test follow-up rule for "word[i + k]"; loop over
+			 * all entries with the same index byte. */
 			for ( ; ((ws = smp[n0].sm_lead_w)[0] & 0xff)
 							 == (c0 & 0xff); ++n0)
 			{
 			    /* Quickly skip entries that don't match the word.
-			     * */
+			     */
+			    if (c0 != ws[0])
+				continue;
 			    k0 = smp[n0].sm_leadlen;
 			    if (k0 > 1)
 			    {
@@ -8373,10 +8765,10 @@ spell_soundfold_w(slang, inword, res)
 			    }
 			    k0 += k - 1;
 
-			    if ((pf = smp[n0].sm_oneoff_w) != NULL)
+			    if ((pf = smp[n0].sm_oneof_w) != NULL)
 			    {
 				/* Check for match with one of the chars in
-				 * "sm_oneoff". */
+				 * "sm_oneof". */
 				while (*pf != NUL && *pf != word[i + k0])
 				    ++pf;
 				if (*pf == NUL)
author	vimboss
date	Wed, 29 Jun 2005 22:40:58 +0000
parents	a698eb686ded
children	d2bc505a6d91