comparison src/spell.c @ 226:4e7dca477fee

updated for version 7.0063
author vimboss
date Tue, 22 Mar 2005 22:54:12 +0000
parents 5175af353b81
children 723a01584c3e
comparison
equal deleted inserted replaced
225:b78857578493 226:4e7dca477fee
7 * See README.txt for an overview of the Vim source code. 7 * See README.txt for an overview of the Vim source code.
8 */ 8 */
9 9
10 /* 10 /*
11 * spell.c: code for spell checking 11 * spell.c: code for spell checking
12 *
13 * Terminology:
14 * "dword" is a dictionary word, made out of letters and digits.
15 * "nword" is a word with a character that's not a letter or digit.
16 * "word" is either a "dword" or an "nword".
12 */ 17 */
13 18
14 #if defined(MSDOS) || defined(WIN16) || defined(WIN32) || defined(_WIN64) 19 #if defined(MSDOS) || defined(WIN16) || defined(WIN32) || defined(_WIN64)
15 # include <io.h> /* for lseek(), must be before vim.h */ 20 # include <io.h> /* for lseek(), must be before vim.h */
16 #endif 21 #endif
20 #if defined(FEAT_SYN_HL) || defined(PROTO) 25 #if defined(FEAT_SYN_HL) || defined(PROTO)
21 26
22 #ifdef HAVE_FCNTL_H 27 #ifdef HAVE_FCNTL_H
23 # include <fcntl.h> 28 # include <fcntl.h>
24 #endif 29 #endif
30
31 #define MAXWLEN 100 /* assume max. word len is this many bytes */
25 32
26 /* 33 /*
27 * Structure that is used to store the text from the language file. This 34 * Structure that is used to store the text from the language file. This
28 * avoids the need to allocate each individual word and copying it. It's 35 * avoids the need to allocate each individual word and copying it. It's
29 * allocated in big chunks for speed. 36 * allocated in big chunks for speed.
34 { 41 {
35 sblock_T *sb_next; /* next block in list */ 42 sblock_T *sb_next; /* next block in list */
36 char_u sb_data[1]; /* data, actually longer */ 43 char_u sb_data[1]; /* data, actually longer */
37 }; 44 };
38 45
46 /* Structure to store words and additions. Used twice: once for case-folded
47 * and once for keep-case words. */
48 typedef struct winfo_S
49 {
50 hashtab_T wi_ht; /* hashtable with all words, both dword_T and
51 nword_T (check flags for DW_NWORD) */
52 garray_T wi_add; /* table with pointers to additions in a
53 dword_T */
54 int wi_addlen; /* longest addition length */
55 } winfo_T;
56
39 /* 57 /*
40 * Structure used to store words and other info for one language. 58 * Structure used to store words and other info for one language.
41 */ 59 */
42 typedef struct slang_S slang_T; 60 typedef struct slang_S slang_T;
43
44 struct slang_S 61 struct slang_S
45 { 62 {
46 slang_T *sl_next; /* next language */ 63 slang_T *sl_next; /* next language */
47 char_u sl_name[2]; /* language name "en", "nl", etc. */ 64 char_u sl_name[2]; /* language name "en", "nl", etc. */
48 hashtab_T sl_ht; /* hashtable with all words */ 65 winfo_T sl_fwords; /* case-folded words and additions */
49 garray_T sl_match; /* table with pointers to matches */ 66 winfo_T sl_kwords; /* keep-case words and additions */
50 garray_T sl_add; /* table with pointers to additions */ 67 char_u sl_regions[17]; /* table with up to 8 region names plus NUL */
51 char_u sl_regions[13]; /* table with up to 6 region names */
52 sblock_T *sl_block; /* list with allocated memory blocks */ 68 sblock_T *sl_block; /* list with allocated memory blocks */
53 }; 69 };
54 70
55 static slang_T *first_lang = NULL; 71 static slang_T *first_lang = NULL;
72
73 /* Entry for a dword in "wi_ht". Also used for the part of an nword starting with
74 * the first non-word character, and for the additions in "wi_add". */
75 typedef struct dword_S
76 {
77 char_u dw_region; /* one bit per region where it's valid */
78 char_u dw_flags; /* WF_ flags */
79 char_u dw_word[1]; /* actually longer, NUL terminated */
80 } dword_T;
81
82 #define REGION_ALL 0xff
83
84 #define HI2DWORD(hi) (dword_T *)(hi->hi_key - 2)
85
86 /* Entry for an nword in "wi_ht". Note that the last three items must be
87 * identical to dword_T, so that they can be in the same hashtable. */
88 typedef struct nword_S
89 {
90 garray_T nw_ga; /* table with pointers to dword_T for part
91 starting with non-word character */
92 int nw_maxlen; /* longest nword length (after the dword) */
93 char_u nw_region; /* one bit per region where it's valid */
94 char_u nw_flags; /* WF_ flags */
95 char_u nw_word[1]; /* actually longer, NUL terminated */
96 } nword_T;
97
98 /* Get nword_T pointer from hashitem that uses nw_word */
99 static nword_T dumnw;
100 #define HI2NWORD(hi) ((nword_T *)((hi)->hi_key - (dumnw.nw_word - (char_u *)&dumnw)))
101
102 #define DW_CAP 0x01 /* word must start with capital */
103 #define DW_RARE 0x02 /* rare word */
104 #define DW_NWORD 0x04 /* this is an nword_T */
105 #define DW_DWORD 0x08 /* (also) use as dword without nword */
56 106
57 /* 107 /*
58 * Structure used in "b_langp", filled from 'spelllang'. 108 * Structure used in "b_langp", filled from 'spelllang'.
59 */ 109 */
60 typedef struct langp_S 110 typedef struct langp_S
62 slang_T *lp_slang; /* info for this language (NULL for last one) */ 112 slang_T *lp_slang; /* info for this language (NULL for last one) */
63 int lp_region; /* bitmask for region or REGION_ALL */ 113 int lp_region; /* bitmask for region or REGION_ALL */
64 } langp_T; 114 } langp_T;
65 115
66 #define LANGP_ENTRY(ga, i) (((langp_T *)(ga).ga_data) + (i)) 116 #define LANGP_ENTRY(ga, i) (((langp_T *)(ga).ga_data) + (i))
67 #define MATCH_ENTRY(gap, i) *(((char_u **)(gap)->ga_data) + i) 117 #define DWORD_ENTRY(gap, i) *(((dword_T **)(gap)->ga_data) + i)
68
69 /*
70 * The byte before a word in the hashtable indicates the type of word.
71 * Also used for the byte just before a match.
72 * The top two bits are used to indicate rare and case-sensitive words.
73 * The lower bits are used to indicate the region in which the word is valid.
74 * Words valid in all regions use REGION_ALL.
75 */
76 #define REGION_MASK 0x3f
77 #define REGION_ALL 0x3f
78 #define CASE_MASK 0x40
79 #define RARE_MASK 0x80
80 118
81 #define SP_OK 0 119 #define SP_OK 0
82 #define SP_BAD 1 120 #define SP_BAD 1
83 #define SP_RARE 2 121 #define SP_RARE 2
84 #define SP_LOCAL 3 122 #define SP_LOCAL 3
123
124 static char *e_invchar2 = N_("E753: Invalid character in \"%s\"");
85 125
86 static slang_T *spell_load_lang __ARGS((char_u *lang)); 126 static slang_T *spell_load_lang __ARGS((char_u *lang));
87 static void spell_load_file __ARGS((char_u *fname)); 127 static void spell_load_file __ARGS((char_u *fname));
88 static int find_region __ARGS((char_u *rp, char_u *region)); 128 static int find_region __ARGS((char_u *rp, char_u *region));
89 129
100 spell_check(wp, ptr, attrp) 140 spell_check(wp, ptr, attrp)
101 win_T *wp; /* current window */ 141 win_T *wp; /* current window */
102 char_u *ptr; 142 char_u *ptr;
103 int *attrp; 143 int *attrp;
104 { 144 {
105 char_u *e; 145 char_u *e; /* end of word */
146 char_u *ne; /* new end of word */
147 char_u *me; /* max. end of match */
106 langp_T *lp; 148 langp_T *lp;
107 int result; 149 int result;
108 int len = 0; 150 int len = 0;
109 hash_T hash;
110 hashitem_T *hi; 151 hashitem_T *hi;
111 int c; 152 int round;
112 #define MAXWLEN 80 /* assume max. word len is 80 */ 153 char_u kword[MAXWLEN + 1]; /* word copy */
113 char_u word[MAXWLEN + 1]; 154 char_u fword[MAXWLEN + 1]; /* word with case folded */
155 char_u match[MAXWLEN + 1]; /* fword with additional chars */
156 char_u kwordclen[MAXWLEN + 1]; /* len of orig chars after kword[] */
157 char_u fwordclen[MAXWLEN + 1]; /* len of chars after fword[] */
158 char_u *clen;
159 int cidx = 0; /* char index in xwordclen[] */
160 hash_T fhash; /* hash for fword */
161 hash_T khash; /* hash for kword */
162 int match_len = 0; /* length of match[] */
163 int fmatch_len = 0; /* length of nword match in chars */
114 garray_T *gap; 164 garray_T *gap;
115 int l, h, t; 165 int l, t;
116 char_u *p; 166 char_u *p, *tp;
117 int n; 167 int n;
168 dword_T *dw;
169 dword_T *tdw;
170 winfo_T *wi;
171 nword_T *nw;
172 int w_isupper;
118 173
119 /* Find the end of the word. We already know that *ptr is a word char. */ 174 /* Find the end of the word. We already know that *ptr is a word char. */
120 e = ptr; 175 e = ptr;
121 do 176 do
122 { 177 {
123 mb_ptr_adv(e); 178 mb_ptr_adv(e);
124 ++len; 179 ++len;
125 } while (*e != NUL && vim_iswordc_buf(e, wp->w_buffer)); 180 } while (*e != NUL && spell_iswordc(e));
181
182 /* A word starting with a number is always OK. */
183 if (*ptr >= '0' && *ptr <= '9')
184 return (int)(e - ptr);
185
186 #ifdef FEAT_MBYTE
187 w_isupper = MB_ISUPPER(mb_ptr2char(ptr));
188 #else
189 w_isupper = MB_ISUPPER(*ptr);
190 #endif
191
192 /* Make a copy of the word so that it can be NUL terminated.
193 * Compute hash value. */
194 mch_memmove(kword, ptr, e - ptr);
195 kword[e - ptr] = NUL;
196 khash = hash_hash(kword);
197
198 /* Make a case-folded copy of the word. Compute its hash value. */
199 (void)str_foldcase(ptr, e - ptr, fword, MAXWLEN + 1);
200 fhash = hash_hash(fword);
201
202 /* Further case-folded characters to check for an nword match go in
203 * match[]. */
204 me = e;
205
206 /* "ne" is the end for the longest match */
207 ne = e;
126 208
127 /* The word is bad unless we find it in the dictionary. */ 209 /* The word is bad unless we find it in the dictionary. */
128 result = SP_BAD; 210 result = SP_BAD;
129 211
130 /* Words are always stored with folded case. */
131 (void)str_foldcase(ptr, e - ptr, word, MAXWLEN + 1);
132 hash = hash_hash(word);
133
134 /* 212 /*
135 * Loop over the languages specified in 'spelllang'. 213 * Loop over the languages specified in 'spelllang'.
136 * We check them all, because a match may find a longer word. 214 * We check them all, because a matching nword may be longer than an
215 * already found dword or nword.
137 */ 216 */
138 for (lp = LANGP_ENTRY(wp->w_buffer->b_langp, 0); lp->lp_slang != NULL; 217 for (lp = LANGP_ENTRY(wp->w_buffer->b_langp, 0); lp->lp_slang != NULL; ++lp)
139 ++lp)
140 { 218 {
141 /* Check words when it wasn't recognized as a good word yet. */ 219 /*
142 if (result != SP_OK) 220 * Check for a matching word in the hashtable.
221 * Check both the keep-case word and the fold-case word.
222 */
223 for (round = 0; round <= 1; ++round)
143 { 224 {
144 /* Word lookup. Using a hash table is fast. */ 225 if (round == 0)
145 hi = hash_lookup(&lp->lp_slang->sl_ht, word, hash); 226 {
227 wi = &lp->lp_slang->sl_kwords;
228 hi = hash_lookup(&wi->wi_ht, kword, khash);
229 }
230 else
231 {
232 wi = &lp->lp_slang->sl_fwords;
233 hi = hash_lookup(&wi->wi_ht, fword, fhash);
234 }
146 if (!HASHITEM_EMPTY(hi)) 235 if (!HASHITEM_EMPTY(hi))
147 { 236 {
148 /* The character before the key indicates the type of word. */ 237 /*
149 c = hi->hi_key[-1]; 238 * If this is an nword entry, check for match with remainder.
150 if ((c & CASE_MASK) != 0) 239 */
151 { 240 dw = HI2DWORD(hi);
152 /* Need to check first letter is uppercase. If it is, 241 if (dw->dw_flags & DW_NWORD)
153 * check region. If it isn't it may be a rare word. */ 242 {
154 if ( 243 /* If the word is not defined as a dword we must find an
244 * nword. */
245 if ((dw->dw_flags & DW_DWORD) == 0)
246 dw = NULL;
247
248 /* Fold more characters when needed for the nword. Need
249 * to do one extra to check for a non-word character after
250 * the nword. Also keep the byte-size of each character,
251 * both before and after folding case. */
252 nw = HI2NWORD(hi);
253 while ((round == 0
254 ? me - e <= nw->nw_maxlen
255 : match_len <= nw->nw_maxlen)
256 && *me != NUL)
257 {
155 #ifdef FEAT_MBYTE 258 #ifdef FEAT_MBYTE
156 MB_ISUPPER(mb_ptr2char(ptr)) 259 l = mb_ptr2len_check(me);
157 #else 260 #else
158 MB_ISUPPER(*ptr) 261 l = 1;
159 #endif 262 #endif
160 ) 263 (void)str_foldcase(me, l, match + match_len,
161 { 264 MAXWLEN - match_len + 1);
162 if ((c & lp->lp_region) == 0) 265 me += l;
266 kwordclen[cidx] = l;
267 fwordclen[cidx] = STRLEN(match + match_len);
268 match_len += fwordclen[cidx];
269 ++cidx;
270 }
271
272 if (round == 0)
273 {
274 clen = kwordclen;
275 tp = e;
276 }
277 else
278 {
279 clen = fwordclen;
280 tp = match;
281 }
282
283 /* Match with each item. The longest match wins:
284 * "you've" is longer than "you". */
285 gap = &nw->nw_ga;
286 for (t = 0; t < gap->ga_len; ++t)
287 {
288 /* Skip entries with wrong case for first char.
289 * Continue if it's a rare word without a capital. */
290 tdw = DWORD_ENTRY(gap, t);
291 if ((tdw->dw_flags & (DW_CAP | DW_RARE)) == DW_CAP
292 && !w_isupper)
293 continue;
294
295 p = tdw->dw_word;
296 l = 0;
297 for (n = 0; p[n] != 0; n += clen[l++])
298 if (vim_memcmp(p + n, tp + n, clen[l]) != 0)
299 break;
300
301 /* Use a match if it's longer than previous matches
302 * and the next character is not a word character. */
303 if (p[n] == 0 && l > fmatch_len && (tp[n] == 0
304 || !spell_iswordc(tp + n)))
305 {
306 dw = tdw;
307 fmatch_len = l;
308 if (round == 0)
309 ne = tp + n;
310 else
311 {
312 /* Need to use the length of the original
313 * chars, not the fold-case ones. */
314 ne = e;
315 for (l = 0; l < fmatch_len; ++l)
316 ne += kwordclen[l];
317 }
318 if ((lp->lp_region & tdw->dw_region) == 0)
319 result = SP_LOCAL;
320 else if ((tdw->dw_flags & DW_CAP) && !w_isupper)
321 result = SP_RARE;
322 else
323 result = SP_OK;
324 }
325 }
326
327 }
328
329 if (dw != NULL)
330 {
331 if (dw->dw_flags & DW_CAP)
332 {
333 /* Need to check first letter is uppercase. If it is,
334 * check region. If it isn't it may be a rare word.
335 * */
336 if (w_isupper)
337 {
338 if ((dw->dw_region & lp->lp_region) == 0)
339 result = SP_LOCAL;
340 else
341 result = SP_OK;
342 }
343 else if (dw->dw_flags & DW_RARE)
344 result = SP_RARE;
345 }
346 else
347 {
348 if ((dw->dw_region & lp->lp_region) == 0)
163 result = SP_LOCAL; 349 result = SP_LOCAL;
350 else if (dw->dw_flags & DW_RARE)
351 result = SP_RARE;
164 else 352 else
165 result = SP_OK; 353 result = SP_OK;
166 } 354 }
167 else if (c & RARE_MASK)
168 result = SP_RARE;
169 }
170 else
171 {
172 if ((c & lp->lp_region) == 0)
173 result = SP_LOCAL;
174 else if (c & RARE_MASK)
175 result = SP_RARE;
176 else
177 result = SP_OK;
178 } 355 }
179 } 356 }
180 } 357 }
181 358
182 /* Match lookup. Uses a binary search. If there is a match adjust 359 /*
183 * "e" to the end. This is also done when a word matched, because 360 * Check for an addition.
184 * "you've" is longer than "you". */ 361 * Only after a dword, not after an nword.
185 gap = &lp->lp_slang->sl_match; 362 * Check both the keep-case word and the fold-case word.
186 l = 0; /* low index */ 363 */
187 h = gap->ga_len - 1; /* high index */ 364 if (fmatch_len == 0)
188 /* keep searching, the match must be between "l" and "h" (inclusive) */ 365 for (round = 0; round <= 1; ++round)
189 while (h >= l)
190 {
191 t = (h + l) / 2;
192 p = MATCH_ENTRY(gap, t) + 1;
193 for (n = 0; p[n] != 0 && p[n] == ptr[n]; ++n)
194 ;
195 if (p[n] == 0)
196 { 366 {
197 if ((ptr[n] == 0 || !vim_iswordc_buf(ptr + n, wp->w_buffer))) 367 if (round == 0)
198 { 368 wi = &lp->lp_slang->sl_kwords;
199 /* match! */ 369 else
200 e = ptr + n; 370 wi = &lp->lp_slang->sl_fwords;
201 if (result != SP_OK) 371 gap = &wi->wi_add;
202 { 372 if (gap->ga_len == 0) /* no additions, skip quickly */
203 if ((lp->lp_region & p[-1]) == 0) 373 continue;
204 result = SP_LOCAL; 374
375 /* Fold characters when needed for the addition. Need to do one
376 * extra to check for a word character after the addition. */
377 while ((round == 0
378 ? me - e <= wi->wi_addlen
379 : match_len <= wi->wi_addlen)
380 && *me != NUL)
381 {
382 #ifdef FEAT_MBYTE
383 l = mb_ptr2len_check(me);
384 #else
385 l = 1;
386 #endif
387 (void)str_foldcase(me, l, match + match_len,
388 MAXWLEN - match_len + 1);
389 me += l;
390 kwordclen[cidx] = l;
391 fwordclen[cidx] = STRLEN(match + match_len);
392 match_len += fwordclen[cidx];
393 ++cidx;
394 }
395
396 if (round == 0)
397 {
398 clen = kwordclen;
399 tp = e;
400 }
401 else
402 {
403 clen = fwordclen;
404 tp = match;
405 }
406
407 /* Addition lookup. Uses a linear search, there should be
408 * very few. If there is a match adjust "ne" to the end.
409 * This doesn't change whether a word was good or bad, only
410 * the length. */
411 for (t = 0; t < gap->ga_len; ++t)
412 {
413 tdw = DWORD_ENTRY(gap, t);
414 p = tdw->dw_word;
415 l = 0;
416 for (n = 0; p[n] != 0; n += clen[l++])
417 if (vim_memcmp(p + n, tp + n, clen[l]) != 0)
418 break;
419
420 /* Use a match if it's longer than previous matches
421 * and the next character is not a word character. */
422 if (p[n] == 0 && l > fmatch_len
423 && (tp[n] == 0 || !spell_iswordc(tp + n)))
424 {
425 fmatch_len = l;
426 if (round == 0)
427 ne = tp + n;
205 else 428 else
206 result = SP_OK; 429 {
207 } 430 /* Need to use the length of the original
208 break; 431 * chars, not the fold-case ones. */
209 } 432 ne = e;
210 /* match is too short, next item is new low index */ 433 for (l = 0; l < fmatch_len; ++l)
211 l = t + 1; 434 ne += kwordclen[l];
435 }
436 }
437 }
212 } 438 }
213 else if (p[n] < ptr[n])
214 /* match is before word, next item is new low index */
215 l = t + 1;
216 else
217 /* match is after word, previous item is new high index */
218 h = t - 1;
219 }
220
221 /* Addition lookup. Uses a linear search, there should be very few.
222 * If there is a match adjust "e" to the end. This doesn't change
223 * whether a word was good or bad, only the length. */
224 gap = &lp->lp_slang->sl_add;
225 for (t = 0; t < gap->ga_len; ++t)
226 {
227 p = MATCH_ENTRY(gap, t) + 1;
228 for (n = 0; p[n] != 0 && p[n] == e[n]; ++n)
229 ;
230 if (p[n] == 0
231 && (e[n] == 0 || !vim_iswordc_buf(e + n, wp->w_buffer)))
232 {
233 /* match */
234 e += n;
235 break;
236 }
237 }
238 } 439 }
239 440
240 if (result != SP_OK) 441 if (result != SP_OK)
241 { 442 {
242 if (result == SP_BAD) 443 if (result == SP_BAD)
245 *attrp = highlight_attr[HLF_SPR]; 446 *attrp = highlight_attr[HLF_SPR];
246 else 447 else
247 *attrp = highlight_attr[HLF_SPL]; 448 *attrp = highlight_attr[HLF_SPL];
248 } 449 }
249 450
250 return (int)(e - ptr); 451 return (int)(ne - ptr);
251 } 452 }
252 453
253 static slang_T *load_lp; /* passed from spell_load_lang() to 454 static slang_T *load_lp; /* passed from spell_load_lang() to
254 spell_load_file() */ 455 spell_load_file() */
255 456
262 { 463 {
263 slang_T *lp; 464 slang_T *lp;
264 char_u fname_enc[80]; 465 char_u fname_enc[80];
265 char_u fname_ascii[20]; 466 char_u fname_ascii[20];
266 char_u *p; 467 char_u *p;
468 int r;
267 469
268 lp = (slang_T *)alloc(sizeof(slang_T)); 470 lp = (slang_T *)alloc(sizeof(slang_T));
269 if (lp != NULL) 471 if (lp != NULL)
270 { 472 {
271 lp->sl_name[0] = lang[0]; 473 lp->sl_name[0] = lang[0];
272 lp->sl_name[1] = lang[1]; 474 lp->sl_name[1] = lang[1];
273 hash_init(&lp->sl_ht); 475 hash_init(&lp->sl_fwords.wi_ht);
274 ga_init2(&lp->sl_match, sizeof(char_u *), 20); 476 ga_init2(&lp->sl_fwords.wi_add, sizeof(dword_T *), 4);
275 ga_init2(&lp->sl_add, sizeof(char_u *), 4); 477 lp->sl_fwords.wi_addlen = 0;
478 hash_init(&lp->sl_kwords.wi_ht);
479 ga_init2(&lp->sl_kwords.wi_add, sizeof(dword_T *), 4);
480 lp->sl_kwords.wi_addlen = 0;
276 lp->sl_regions[0] = NUL; 481 lp->sl_regions[0] = NUL;
277 lp->sl_block = NULL; 482 lp->sl_block = NULL;
278 483
279 /* Find all spell files for "lang" in 'runtimepath' and load them. 484 /* Find all spell files for "lang" in 'runtimepath' and load them.
280 * Use 'encoding', except that we use "latin1" for "latin9". */ 485 * Use 'encoding', except that we use "latin1" for "latin9". */
284 else 489 else
285 #endif 490 #endif
286 p = (char_u *)"latin1"; 491 p = (char_u *)"latin1";
287 load_lp = lp; 492 load_lp = lp;
288 sprintf((char *)fname_enc, "spell/%c%c.%s.spl", lang[0], lang[1], p); 493 sprintf((char *)fname_enc, "spell/%c%c.%s.spl", lang[0], lang[1], p);
289 if (do_in_runtimepath(fname_enc, TRUE, spell_load_file) == FAIL) 494 r = do_in_runtimepath(fname_enc, TRUE, spell_load_file);
495 if (r == FAIL)
290 { 496 {
291 /* Try again to find an ASCII spell file. */ 497 /* Try again to find an ASCII spell file. */
292 sprintf((char *)fname_ascii, "spell/%c%c.spl", lang[0], lang[1]); 498 sprintf((char *)fname_ascii, "spell/%c%c.spl", lang[0], lang[1]);
293 if (do_in_runtimepath(fname_ascii, TRUE, spell_load_file) == FAIL) 499 r = do_in_runtimepath(fname_ascii, TRUE, spell_load_file);
294 { 500 }
295 vim_free(lp); 501
296 lp = NULL; 502 if (r == FAIL)
297 smsg((char_u *)_("Warning: Cannot find dictionary \"%s\""), 503 {
504 vim_free(lp);
505 lp = NULL;
506 smsg((char_u *)_("Warning: Cannot find dictionary \"%s\""),
298 fname_enc + 6); 507 fname_enc + 6);
299 }
300 } 508 }
301 else 509 else
302 { 510 {
303 lp->sl_next = first_lang; 511 lp->sl_next = first_lang;
304 first_lang = lp; 512 first_lang = lp;
317 char_u *fname; 525 char_u *fname;
318 { 526 {
319 int fd; 527 int fd;
320 size_t len; 528 size_t len;
321 size_t l; 529 size_t l;
530 char_u *p = NULL, *np;
531 sblock_T *bl = NULL;
532 int bl_used = 0;
322 size_t rest = 0; 533 size_t rest = 0;
323 char_u *p = NULL, *np; 534 char_u *rbuf; /* read buffer */
324 sblock_T *bl; 535 char_u *rbuf_end; /* past last valid char in "rbuf" */
325 hash_T hash; 536 hash_T hash;
326 hashitem_T *hi; 537 hashitem_T *hi;
327 int c; 538 int c;
539 int cc;
328 int region = REGION_ALL; 540 int region = REGION_ALL;
329 char_u word[MAXWLEN + 1]; 541 int wlen;
330 int n; 542 winfo_T *wi;
543 dword_T *dw, *edw;
544 nword_T *nw = NULL;
545 int flags;
546 char_u *save_sourcing_name = sourcing_name;
547 linenr_T save_sourcing_lnum = sourcing_lnum;
548
549 rbuf = alloc((unsigned)(SBLOCKSIZE + MAXWLEN + 1));
550 if (rbuf == NULL)
551 return;
331 552
332 fd = mch_open((char *)fname, O_RDONLY | O_EXTRA, 0); 553 fd = mch_open((char *)fname, O_RDONLY | O_EXTRA, 0);
333 if (fd < 0) 554 if (fd < 0)
334 { 555 {
335 EMSG2(_(e_notopen), fname); 556 EMSG2(_(e_notopen), fname);
336 return; 557 goto theend;
337 } 558 }
559
560 sourcing_name = fname;
561 sourcing_lnum = 0;
338 562
339 /* Get the length of the whole file. */ 563 /* Get the length of the whole file. */
340 len = lseek(fd, (off_t)0, SEEK_END); 564 len = lseek(fd, (off_t)0, SEEK_END);
341 lseek(fd, (off_t)0, SEEK_SET); 565 lseek(fd, (off_t)0, SEEK_SET);
342 566
343 /* Loop, reading the file one block at a time. 567 /*
568 * Read the file one block at a time.
344 * "rest" is the length of an incomplete line at the previous block. 569 * "rest" is the length of an incomplete line at the previous block.
345 * "p" points to the remainder. */ 570 * "p" points to the remainder.
571 */
346 while (len > 0) 572 while (len > 0)
347 { 573 {
348 /* Allocate a block of memory to store the info in. This is not freed 574 /* Read a block from the file. Prepend the remainder of the previous
349 * until spell_reload() is called. */ 575 * block, if any. */
576 if (rest > 0)
577 {
578 if (rest > MAXWLEN) /* truncate long line (should be comment) */
579 rest = MAXWLEN;
580 mch_memmove(rbuf, p, rest);
581 --sourcing_lnum;
582 }
350 if (len > SBLOCKSIZE) 583 if (len > SBLOCKSIZE)
351 l = SBLOCKSIZE; 584 l = SBLOCKSIZE;
352 else 585 else
353 l = len; 586 l = len;
354 len -= l; 587 len -= l;
355 bl = (sblock_T *)alloc((unsigned)(sizeof(sblock_T) - 1 + l + rest)); 588 if (read(fd, rbuf + rest, l) != l)
356 if (bl == NULL)
357 break;
358 bl->sb_next = load_lp->sl_block;
359 load_lp->sl_block = bl;
360
361 /* Read a block from the file. Prepend the remainder of the previous
362 * block. */
363 if (rest > 0)
364 mch_memmove(bl->sb_data, p, rest);
365 if (read(fd, bl->sb_data + rest, l) != l)
366 { 589 {
367 EMSG2(_(e_notread), fname); 590 EMSG2(_(e_notread), fname);
368 break; 591 break;
369 } 592 }
370 l += rest; 593 rbuf_end = rbuf + l + rest;
371 rest = 0; 594 rest = 0;
372 595
373 /* Deal with each line that was read until we finish the block. */ 596 /* Deal with each line that was read until we finish the block. */
374 for (p = bl->sb_data; l > 0; p = np) 597 for (p = rbuf; p < rbuf_end; p = np)
375 { 598 {
376 /* "np" points to the char after the line (CR or NL). */ 599 ++sourcing_lnum;
377 for (np = p; l > 0 && *np >= ' '; ++np) 600
378 --l; 601 /* "np" points to the first char after the line (CR, NL or white
379 if (l == 0) 602 * space). */
603 for (np = p; np < rbuf_end && *np >= ' '; mb_ptr_adv(np))
604 ;
605 if (np >= rbuf_end)
380 { 606 {
381 /* Incomplete line (or end of file). */ 607 /* Incomplete line or end of file. */
382 rest = np - p; 608 rest = np - p;
383 if (len == 0) 609 if (len == 0)
384 EMSG2(_("E751: Truncated spell file: %s"), fname); 610 EMSG(_("E751: Truncated spell file"));
385 break; 611 break;
386 } 612 }
387 *np = NUL; /* terminate the line with a NUL */ 613 *np = NUL; /* terminate the line with a NUL */
388 614
389 /* Skip comment and empty lines. */ 615 if (*p == '-')
390 c = *p;
391 if (c != '#' && np > p)
392 { 616 {
393 if (c == '=' || c == '+') 617 /*
394 { 618 * Region marker: ---, -xx, -xx-yy, etc.
395 garray_T *gap; 619 */
396 620 ++p;
397 /* Match or Add item. */ 621 if (*p == '-')
398 if (c == '=') 622 {
399 gap = &load_lp->sl_match; 623 if (p[1] != '-' || p[2] != NUL)
400 else 624 {
401 gap = &load_lp->sl_add; 625 EMSG2(_(e_invchar2), p - 1);
402 626 len = 0;
403 if (ga_grow(gap, 1) == OK) 627 break;
404 { 628 }
405 for (n = 0; n < gap->ga_len; ++n) 629 region = REGION_ALL;
406 if ((c = STRCMP(p + 1, 630 }
407 MATCH_ENTRY(gap, n) + 1)) < 0) 631 else
632 {
633 char_u *rp = load_lp->sl_regions;
634 int r;
635
636 /* Start of a region. The region may be repeated:
637 * "-ca-uk". Fill "region" with the bit mask for the
638 * ones we find. */
639 region = 0;
640 for (;;)
641 {
642 r = find_region(rp, p);
643 if (r == REGION_ALL)
644 {
645 /* new region, add it to sl_regions[] */
646 r = STRLEN(rp);
647 if (r >= 16)
648 {
649 EMSG2(_("E752: Too many regions: %s"), p);
650 len = 0;
408 break; 651 break;
409 if (c == 0) 652 }
410 { 653 else
411 if (p_verbose > 0) 654 {
412 smsg((char_u *)_("Warning: duplicate match \"%s\" in %s"), 655 rp[r] = p[0];
413 p + 1, fname); 656 rp[r + 1] = p[1];
657 rp[r + 2] = NUL;
658 r = 1 << (r / 2);
659 }
414 } 660 }
415 else 661 else
662 r = 1 << r;
663
664 region |= r;
665 if (p[2] != '-')
416 { 666 {
417 mch_memmove((char_u **)gap->ga_data + n + 1, 667 if (p[2] > ' ')
418 (char_u **)gap->ga_data + n, 668 {
419 (gap->ga_len - n) * sizeof(char_u *)); 669 EMSG2(_(e_invchar2), p - 1);
420 *(((char_u **)gap->ga_data) + n) = p; 670 len = 0;
421 *p = region; 671 }
422 ++gap->ga_len; 672 break;
423 } 673 }
424 } 674 p += 3;
425 } 675 }
426 else if (c == '-') 676 }
427 { 677 }
428 /* region item */ 678 else if (*p != '#' && *p != NUL)
679 {
680 /*
681 * Not an empty line or comment.
682 */
683 if (*p == '!')
684 {
685 wi = &load_lp->sl_kwords; /* keep case */
429 ++p; 686 ++p;
430 if (*p == '-') 687 }
431 /* end of a region */ 688 else
432 region = REGION_ALL; 689 wi = &load_lp->sl_fwords; /* fold case */
433 else 690
434 { 691 flags = 0;
435 char_u *rp = load_lp->sl_regions; 692 c = *p;
436 int r; 693 if (c == '>') /* rare word */
437 694 {
438 /* The region may be repeated: "-ca-uk". Fill 695 flags = DW_RARE;
439 * "region" with the bit mask for the ones we find. */ 696 ++p;
440 region = 0; 697 }
441 for (;;) 698 else if (*p == '+') /* addition */
699 ++p;
700
701 if (c != '+' && !spell_iswordc(p))
702 {
703 EMSG2(_(e_invchar2), p);
704 len = 0;
705 break;
706 }
707
708 /* Make sure there is room for the word. Folding case may
709 * double the size. */
710 wlen = np - p;
711 if (bl == NULL || bl_used + sizeof(dword_T) + wlen
712 #ifdef FEAT_MBYTE
713 * (has_mbyte ? 2 : 1)
714 #endif
715 >= SBLOCKSIZE)
716 {
717 /* Allocate a block of memory to store the dword_T in.
718 * This is not freed until spell_reload() is called. */
719 bl = (sblock_T *)alloc((unsigned)(sizeof(sblock_T)
720 + SBLOCKSIZE));
721 if (bl == NULL)
722 {
723 len = 0;
724 break;
725 }
726 bl->sb_next = load_lp->sl_block;
727 load_lp->sl_block = bl;
728 bl_used = 0;
729 }
730 dw = (dword_T *)(bl->sb_data + bl_used);
731
732 /* For fold-case words fold the case and check for start
733 * with uppercase letter. */
734 if (wi == &load_lp->sl_fwords)
735 {
736 #ifdef FEAT_MBYTE
737 if (MB_ISUPPER(mb_ptr2char(p)))
738 #else
739 if (MB_ISUPPER(*p))
740 #endif
741 flags |= DW_CAP;
742
743 /* Fold case. */
744 (void)str_foldcase(p, np - p, dw->dw_word, wlen
745 #ifdef FEAT_MBYTE
746 * (has_mbyte ? 2 : 1)
747 #endif
748 + 1);
749 #ifdef FEAT_MBYTE
750 /* case folding may change length of word */
751 wlen = STRLEN(dw->dw_word);
752 #endif
753 }
754 else
755 {
756 /* Keep case: copy the word as-is. */
757 mch_memmove(dw->dw_word, p, wlen + 1);
758 }
759
760 if (c == '+')
761 {
762 garray_T *gap = &wi->wi_add;
763
764 /* Addition. TODO: search for matching entry? */
765 if (wi->wi_addlen < wlen)
766 wi->wi_addlen = wlen;
767 if (ga_grow(gap, 1) == FAIL)
768 {
769 len = 0;
770 break;
771 }
772 *(((dword_T **)gap->ga_data) + gap->ga_len) = dw;
773 ++gap->ga_len;
774 dw->dw_region = region;
775 dw->dw_flags = flags;
776 bl_used += sizeof(dword_T) + wlen;
777 }
778 else
779 {
780 /*
781 * Check for a non-word character. If found it's
782 * going to be an nword.
783 * For an nword we split in two: the leading dword and
784 * the remainder. The dword goes in the hashtable
785 * with an nword_T, the remainder is put in the
786 * dword_T (starting with the first non-word
787 * character).
788 */
789 cc = NUL;
790 for (p = dw->dw_word; *p != NUL; mb_ptr_adv(p))
791 if (!spell_iswordc(p))
442 { 792 {
443 /* start of a region */ 793 cc = *p;
444 r = find_region(rp, p); 794 *p = NUL;
445 if (r == REGION_ALL) 795 break;
446 {
447 /* new region, add it */
448 r = STRLEN(rp);
449 if (r >= 12)
450 {
451 EMSG2(_("E752: Too many regions in %s"),
452 fname);
453 r = REGION_ALL;
454 }
455 else
456 {
457 rp[r] = p[0];
458 rp[r + 1] = p[1];
459 rp[r + 2] = NUL;
460 r = 1 << (r / 2);
461 }
462 }
463 else
464 r = 1 << r;
465
466 region |= r;
467 if (p[2] != '-')
468 {
469 if (p[2] != NUL)
470 EMSG2(_("E753: Invalid character in \"%s\""),
471 p - 1);
472 break;
473 }
474 p += 3;
475 } 796 }
476 } 797
477 } 798 /* check if we already have this dword */
478 else 799 hash = hash_hash(dw->dw_word);
479 { 800 hi = hash_lookup(&wi->wi_ht, dw->dw_word, hash);
480 /* add the word */
481 if (c == '>')
482 c = region | RARE_MASK;
483 else
484 {
485 if (c != ' ')
486 EMSG2(_("E753: Invalid character in \"%s\""), p);
487 c = region;
488 }
489 #ifdef FEAT_MBYTE
490 if (MB_ISUPPER(mb_ptr2char(p + 1)))
491 #else
492 if (MB_ISUPPER(p[1]))
493 #endif
494 c |= CASE_MASK;
495 *p++ = c;
496 (void)str_foldcase(p, np - p, word, MAXWLEN + 1);
497 n = STRLEN(word);
498 if (n > np - p)
499 {
500 sblock_T *s;
501
502 /* Folding case made word longer! We need to allocate
503 * memory for it. */
504 s = (sblock_T *)alloc((unsigned)sizeof(sblock_T)
505 + n + 1);
506 if (s != NULL)
507 {
508 s->sb_next = load_lp->sl_block;
509 load_lp->sl_block = s;
510 s->sb_data[0] = p[-1];
511 p = s->sb_data + 1;
512 }
513 }
514 mch_memmove(p, word, n + 1);
515
516 hash = hash_hash(p);
517 hi = hash_lookup(&load_lp->sl_ht, p, hash);
518 if (!HASHITEM_EMPTY(hi)) 801 if (!HASHITEM_EMPTY(hi))
519 { 802 {
520 c = hi->hi_key[-1]; 803 /* Existing entry. */
521 if ((c & (CASE_MASK | RARE_MASK)) 804 edw = HI2DWORD(hi);
522 == (p[-1] & (CASE_MASK | RARE_MASK))) 805 if ((edw->dw_flags & (DW_CAP | DW_RARE))
806 == (dw->dw_flags & (DW_CAP | DW_RARE)))
523 { 807 {
524 if (p_verbose > 0) 808 if (p_verbose > 0)
525 smsg((char_u *)_("Warning: duplicate word \"%s\" in %s"), 809 smsg((char_u *)_("Warning: duplicate word \"%s\" in %s"),
526 p, fname); 810 dw->dw_word, fname);
811 }
812 }
813
814 if (cc != NUL) /* nword */
815 {
816 if (HASHITEM_EMPTY(hi)
817 || (edw->dw_flags & DW_NWORD) == 0)
818 {
819 sblock_T *sb;
820
821 /* Need to allocate a new nword_T. Put it in an
822 * sblock_T, so that we can free it later. */
823 sb = (sblock_T *)alloc(
824 (unsigned)(sizeof(sblock_T)
825 + sizeof(nword_T) + wlen));
826 if (sb == NULL)
827 {
828 len = 0;
829 break;
830 }
831 sb->sb_next = load_lp->sl_block;
832 load_lp->sl_block = sb;
833 nw = (nword_T *)sb->sb_data;
834
835 ga_init2(&nw->nw_ga, sizeof(dword_T *), 4);
836 nw->nw_maxlen = 0;
837 STRCPY(nw->nw_word, dw->dw_word);
838 if (!HASHITEM_EMPTY(hi))
839 {
840 /* Note: the nw_region and nw_flags are for
841 * the dword that matches with the start
842 * of this nword, not for the nword
843 * itself! */
844 nw->nw_region = edw->dw_region;
845 nw->nw_flags = edw->dw_flags | DW_NWORD;
846
847 /* Remove the dword item so that we can
848 * add it as an nword. */
849 hash_remove(&wi->wi_ht, hi);
850 hi = hash_lookup(&wi->wi_ht,
851 nw->nw_word, hash);
852 }
853 else
854 {
855 nw->nw_region = 0;
856 nw->nw_flags = DW_NWORD;
857 }
527 } 858 }
528 else 859 else
529 hi->hi_key[-1] |= (p[-1] & (CASE_MASK | RARE_MASK)); 860 nw = HI2NWORD(hi);
530 } 861 }
531 else 862
532 hash_add_item(&load_lp->sl_ht, hi, p, hash); 863 if (HASHITEM_EMPTY(hi))
864 {
865 /* Add new dword or nword entry. */
866 hash_add_item(&wi->wi_ht, hi, cc == NUL
867 ? dw->dw_word : nw->nw_word, hash);
868 if (cc == NUL)
869 {
870 /* New dword: init the values and count the
871 * used space. */
872 dw->dw_flags = DW_DWORD | flags;
873 dw->dw_region = region;
874 bl_used += sizeof(dword_T) + wlen;
875 }
876 }
877 else if (cc == NUL)
878 {
879 /* existing dword: add the region and flags */
880 dw = edw;
881 dw->dw_region |= region;
882 dw->dw_flags |= DW_DWORD | flags;
883 }
884
885 if (cc != NUL)
886 {
887 /* Use the dword for the non-word character and
888 * following characters. */
889 dw->dw_region = region;
890 dw->dw_flags = flags;
891 STRCPY(dw->dw_word + 1, p + 1);
892 dw->dw_word[0] = cc;
893 l = wlen - (p - dw->dw_word);
894 bl_used += sizeof(dword_T) + l;
895 if (nw->nw_maxlen < l)
896 nw->nw_maxlen = l;
897
898 /* Add the dword to the growarray in the nword. */
899 if (ga_grow(&nw->nw_ga, 1) == FAIL)
900 {
901 len = 0;
902 break;
903 }
904 *((dword_T **)nw->nw_ga.ga_data + nw->nw_ga.ga_len)
905 = dw;
906 ++nw->nw_ga.ga_len;
907 }
533 } 908 }
534 } 909 }
535 910
536 while (l > 0 && *np < ' ') 911 /* Skip over CR and NL characters and trailing white space. */
537 { 912 while (np < rbuf_end && *np <= ' ')
538 ++np; 913 ++np;
539 --l;
540 }
541 } 914 }
542 } 915 }
543 916
544 close(fd); 917 close(fd);
918 theend:
919 sourcing_name = save_sourcing_name;
920 sourcing_lnum = save_sourcing_lnum;
921 vim_free(rbuf);
545 } 922 }
546 923
547 /* 924 /*
548 * Parse 'spelllang' and set buf->b_langp accordingly. 925 * Parse 'spelllang' and set buf->b_langp accordingly.
549 * Returns an error message or NULL. 926 * Returns an error message or NULL.
670 { 1047 {
671 buf_T *buf; 1048 buf_T *buf;
672 slang_T *lp; 1049 slang_T *lp;
673 sblock_T *sp; 1050 sblock_T *sp;
674 1051
1052 /* Initialize the table for spell_iswordc(). */
1053 init_spell_chartab();
1054
675 /* Unload all allocated memory. */ 1055 /* Unload all allocated memory. */
676 while (first_lang != NULL) 1056 while (first_lang != NULL)
677 { 1057 {
678 lp = first_lang; 1058 lp = first_lang;
679 first_lang = lp->sl_next; 1059 first_lang = lp->sl_next;
680 1060
681 hash_clear(&lp->sl_ht); 1061 hash_clear(&lp->sl_fwords.wi_ht);
682 ga_clear(&lp->sl_match); 1062 ga_clear(&lp->sl_fwords.wi_add);
683 ga_clear(&lp->sl_add); 1063 hash_clear(&lp->sl_kwords.wi_ht);
1064 ga_clear(&lp->sl_kwords.wi_add);
684 while (lp->sl_block != NULL) 1065 while (lp->sl_block != NULL)
685 { 1066 {
686 sp = lp->sl_block; 1067 sp = lp->sl_block;
687 lp->sl_block = sp->sb_next; 1068 lp->sl_block = sp->sb_next;
688 vim_free(sp); 1069 vim_free(sp);