223
|
1 /* vi:set ts=8 sts=4 sw=4:
|
|
2 *
|
|
3 * VIM - Vi IMproved by Bram Moolenaar
|
|
4 *
|
|
5 * Do ":help uganda" in Vim to read copying and usage conditions.
|
|
6 * Do ":help credits" in Vim to see a list of people who contributed.
|
|
7 * See README.txt for an overview of the Vim source code.
|
|
8 */
|
|
9
|
|
10 /*
|
|
11 * spell.c: code for spell checking
|
226
|
12 *
|
300
|
13 * The spell checking mechanism uses a tree (aka trie). Each node in the tree
|
|
14 * has a list of bytes that can appear (siblings). For each byte there is a
|
|
15 * pointer to the node with the byte that follows in the word (child).
|
|
16 * A NUL byte is used where the word may end.
|
|
17 *
|
|
18 * There are two trees: one with case-folded words and one with words in
|
|
19 * original case. The second one is only used for keep-case words and is
|
|
20 * usually small.
|
|
21 *
|
|
22 * Thanks to Olaf Seibert for providing an example implementation of this tree
|
|
23 * and the compression mechanism.
|
243
|
24 *
|
|
25 * Matching involves checking the caps type: Onecap ALLCAP KeepCap.
|
|
26 *
|
236
|
27 * Why doesn't Vim use aspell/ispell/myspell/etc.?
|
|
28 * See ":help develop-spell".
|
|
29 */
|
|
30
|
300
|
31 /*
|
|
32 * Vim spell file format: <HEADER> <SUGGEST> <LWORDTREE> <KWORDTREE>
|
|
33 *
|
|
34 * <HEADER>: <fileID> <regioncnt> <regionname> ...
|
|
35 * <charflagslen> <charflags> <fcharslen> <fchars>
|
|
36 *
|
|
37 * <fileID> 10 bytes "VIMspell05"
|
|
38 * <regioncnt> 1 byte number of regions following (8 supported)
|
|
39 * <regionname> 2 bytes Region name: ca, au, etc.
|
|
40 * First <regionname> is region 1.
|
|
41 *
|
|
42 * <charflagslen> 1 byte Number of bytes in <charflags> (should be 128).
|
|
43 * <charflags> N bytes List of flags (first one is for character 128):
|
|
44 * 0x01 word character
|
|
45 * 0x02 upper-case character
|
|
46 * <fcharslen> 2 bytes Number of bytes in <fchars>.
|
|
47 * <fchars> N bytes Folded characters, first one is for character 128.
|
|
48 *
|
|
49 *
|
|
50 * <SUGGEST> : <suggestlen> <more> ...
|
|
51 *
|
|
52 * <suggestlen> 4 bytes Length of <SUGGEST> in bytes, excluding
|
|
53 * <suggestlen>. MSB first.
|
|
54 * <more> To be defined.
|
|
55 *
|
|
56 *
|
|
57 * <LWORDTREE>: <wordtree>
|
|
58 *
|
|
59 * <wordtree>: <nodecount> <nodedata> ...
|
|
60 *
|
|
61 * <nodecount> 4 bytes Number of nodes following. MSB first.
|
|
62 *
|
|
63 * <nodedata>: <siblingcount> <sibling> ...
|
|
64 *
|
|
65 * <siblingcount> 1 byte Number of siblings in this node. The siblings
|
|
66 * follow in sorted order.
|
|
67 *
|
|
68 * <sibling>: <byte> [<nodeidx> <xbyte> | <flags> [<region>]]
|
|
69 *
|
|
70 * <byte> 1 byte Byte value of the sibling. Special cases:
|
|
71 * BY_NOFLAGS: End of word without flags and for all
|
|
72 * regions.
|
|
73 * BY_FLAGS: End of word, <flags> follow.
|
|
74 * BY_INDEX: Child of sibling is shared, <nodeidx>
|
|
75 * and <xbyte> follow.
|
|
76 *
|
|
77 * <nodeidx> 3 bytes Index of child for this sibling, MSB first.
|
|
78 *
|
|
79 * <xbyte> 1 byte byte value of the sibling.
|
|
80 *
|
|
81 * <flags> 1 byte bitmask of:
|
|
82 * WF_ALLCAP word must have only capitals
|
|
83 * WF_ONECAP first char of word must be capital
|
|
84 * WF_RARE rare word
|
|
85 * WF_REGION <region> follows
|
|
86 *
|
|
87 * <region> 1 byte Bitmask for regions in which word is valid. When
|
|
88 * omitted it's valid in all regions.
|
|
89 * Lowest bit is for region 1.
|
|
90 *
|
|
91 * <KWORDTREE>: <wordtree>
|
|
92 *
|
|
93 *
|
|
94 * All text characters are in 'encoding', but stored as single bytes.
|
|
95 * The region name is ASCII.
|
|
96 */
|
|
97
|
223
|
98 #if defined(MSDOS) || defined(WIN16) || defined(WIN32) || defined(_WIN64)
|
|
99 # include <io.h> /* for lseek(), must be before vim.h */
|
|
100 #endif
|
|
101
|
|
102 #include "vim.h"
|
|
103
|
|
104 #if defined(FEAT_SYN_HL) || defined(PROTO)
|
|
105
|
|
106 #ifdef HAVE_FCNTL_H
|
|
107 # include <fcntl.h>
|
|
108 #endif
|
|
109
|
300
|
#define MAXWLEN 250             /* assume max. word len is this many bytes */

/* Flags used for a word.  WF_REGION, WF_ONECAP, WF_ALLCAP and WF_RARE are the
 * bits of the <flags> byte described in the file format comment. */
#define WF_REGION   0x01        /* region byte follows */
#define WF_ONECAP   0x02        /* word with one capital (or all capitals) */
#define WF_ALLCAP   0x04        /* word must be all capitals */
#define WF_RARE     0x08        /* rare word */

#define WF_KEEPCAP  0x100       /* keep-case word */

/* Special values of <byte> in a <sibling>, see the file format comment. */
#define BY_NOFLAGS  0           /* end of word without flags or region */
#define BY_FLAGS    1           /* end of word, flag byte follows */
#define BY_INDEX    2           /* child is shared, index follows */
#define BY_SPECIAL  BY_INDEX    /* highest special byte value */
|
236
|
124
|
243
|
125 /* Info from "REP" entries in ".aff" file used in af_rep.
|
|
126 * TODO: This is not used yet. Either use it or remove it. */
|
236
|
127 typedef struct repentry_S
|
|
128 {
|
|
129 char_u *re_from;
|
|
130 char_u *re_to;
|
|
131 } repentry_T;
|
|
132
|
|
133 /*
|
243
|
134 * Structure used to store words and other info for one language, loaded from
|
|
135 * a .spl file.
|
300
|
136 * The main access is through the tree in "sl_fbyts/sl_fidxs", storing the
|
|
137 * case-folded words. "sl_kbyts/sl_kidxs" is for keep-case words.
|
|
138 *
|
|
139 * The "byts" array stores the possible bytes in each tree node, preceded by
|
|
140 * the number of possible bytes, sorted on byte value:
|
|
141 * <len> <byte1> <byte2> ...
|
|
142 * The "idxs" array stores the index of the child node corresponding to the
|
|
143 * byte in "byts".
|
|
144 * Exception: when the byte is zero, the word may end here and "idxs" holds
|
|
145 * the flags and region for the word. There may be several zeros in sequence
|
|
146 * for alternative flag/region combinations.
|
236
|
147 */
|
|
148 typedef struct slang_S slang_T;
|
|
149 struct slang_S
|
|
150 {
|
|
151 slang_T *sl_next; /* next language */
|
|
152 char_u *sl_name; /* language name "en", "en.rare", "nl", etc. */
|
300
|
153 char_u *sl_fbyts; /* case-folded word bytes */
|
|
154 int *sl_fidxs; /* case-folded word indexes */
|
|
155 char_u *sl_kbyts; /* keep-case word bytes */
|
|
156 int *sl_kidxs; /* keep-case word indexes */
|
243
|
157 char_u *sl_try; /* "TRY" from .aff file TODO: not used */
|
|
158 garray_T sl_rep; /* list of repentry_T entries from REP lines
|
|
159 * TODO not used */
|
236
|
160 char_u sl_regions[17]; /* table with up to 8 region names plus NUL */
|
|
161 int sl_error; /* error while loading */
|
|
162 };
|
|
163
|
243
|
/* First language that is loaded, start of the linked list of loaded
 * languages.  spell_load_lang() puts each newly loaded language at the head
 * of this list. */
static slang_T *first_lang = NULL;

/* Region bitmask with all bits set: the word is accepted in every region.
 * Also used as the "not found" result of find_region(). */
#define REGION_ALL 0xff
|
|
169
|
|
170
|
|
171 /*
|
|
172 * Structure used in "b_langp", filled from 'spelllang'.
|
|
173 */
|
|
174 typedef struct langp_S
|
|
175 {
|
|
176 slang_T *lp_slang; /* info for this language (NULL for last one) */
|
|
177 int lp_region; /* bitmask for region or REGION_ALL */
|
|
178 } langp_T;
|
|
179
|
|
180 #define LANGP_ENTRY(ga, i) (((langp_T *)(ga).ga_data) + (i))
|
|
181
|
|
/* Result of checking a word, stored in "mi_result". */
#define SP_OK           0       /* word is accepted */
#define SP_BAD          1       /* not recognized, highlighted with HLF_SPB */
#define SP_RARE         2       /* rare word, highlighted with HLF_SPR */
#define SP_LOCAL        3       /* word exists but not in the selected region,
                                   highlighted with HLF_SPL */

#define VIMSPELLMAGIC "VIMspell05"  /* string at start of Vim spell file */
#define VIMSPELLMAGICL 10           /* number of bytes in VIMSPELLMAGIC */
|
|
189
|
|
190 /*
|
|
191 * Structure to store info for word matching.
|
|
192 */
|
|
193 typedef struct matchinf_S
|
|
194 {
|
|
195 langp_T *mi_lp; /* info for language and region */
|
|
196 slang_T *mi_slang; /* info for the language */
|
243
|
197
|
|
198 /* pointers to original text to be checked */
|
236
|
199 char_u *mi_word; /* start of word being checked */
|
300
|
200 char_u *mi_end; /* end of matching word */
|
243
|
201 char_u *mi_fend; /* next char to be added to mi_fword */
|
300
|
202 char_u *mi_cend; /* char after what was used for
|
|
203 mi_capflags */
|
243
|
204
|
|
205 /* case-folded text */
|
|
206 char_u mi_fword[MAXWLEN + 1]; /* mi_word case-folded */
|
300
|
207 int mi_fwordlen; /* nr of valid bytes in mi_fword */
|
243
|
208
|
|
209 /* others */
|
236
|
210 int mi_result; /* result so far: SP_BAD, SP_OK, etc. */
|
300
|
211 int mi_capflags; /* WF_ONECAP WF_ALLCAP WF_KEEPCAP */
|
236
|
212 } matchinf_T;
|
|
213
|
|
/* Prototypes for the functions that are local to this file. */
static slang_T *slang_alloc __ARGS((char_u *lang));
static void slang_free __ARGS((slang_T *lp));
static void find_word __ARGS((matchinf_T *mip, int keepcap));
static slang_T *spell_load_lang __ARGS((char_u *lang));
static void spell_load_file __ARGS((char_u *fname, void *cookie));
static int read_tree __ARGS((FILE *fd, char_u *byts, int *idxs, int maxidx, int startidx));
static int find_region __ARGS((char_u *rp, char_u *region));
static int captype __ARGS((char_u *word, char_u *end));
|
|
222
|
|
223 /*
|
|
224 * Main spell-checking function.
|
300
|
225 * "ptr" points to a character that could be the start of a word.
|
236
|
226 * "*attrp" is set to the attributes for a badly spelled word. For a non-word
|
|
227 * or when it's OK it remains unchanged.
|
|
228 * This must only be called when 'spelllang' is not empty.
|
|
229 * Returns the length of the word in bytes, also when it's OK, so that the
|
|
230 * caller can skip over the word.
|
|
231 */
|
|
232 int
|
300
|
233 spell_check(wp, ptr, attrp)
|
236
|
234 win_T *wp; /* current window */
|
|
235 char_u *ptr;
|
|
236 int *attrp;
|
|
237 {
|
|
238 matchinf_T mi; /* Most things are put in "mi" so that it can
|
|
239 be passed to functions quickly. */
|
|
240
|
300
|
241 /* Find the end of the word. */
|
236
|
242 mi.mi_word = ptr;
|
|
243 mi.mi_end = ptr;
|
|
244
|
300
|
245 /* A word starting with a number is always OK. Also skip hexadecimal
|
|
246 * numbers 0xFF99 and 0X99FF. */
|
|
247 if (*ptr >= '0' && *ptr <= '9')
|
|
248 {
|
|
249 if (*ptr == '0' && (ptr[1] == 'x' || ptr[2] == 'X'))
|
|
250 mi.mi_end = skiphex(ptr);
|
|
251 else
|
|
252 mi.mi_end = skipdigits(ptr);
|
|
253 }
|
|
254 else
|
236
|
255 {
|
300
|
256 mi.mi_fend = ptr;
|
|
257 if (spell_iswordc(mi.mi_fend))
|
|
258 {
|
|
259 /* Make case-folded copy of the characters until the next non-word
|
|
260 * character. */
|
|
261 do
|
|
262 {
|
|
263 mb_ptr_adv(mi.mi_fend);
|
|
264 } while (*mi.mi_fend != NUL && spell_iswordc(mi.mi_fend));
|
236
|
265
|
300
|
266 (void)spell_casefold(ptr, (int)(mi.mi_fend - ptr), mi.mi_fword,
|
|
267 MAXWLEN + 1);
|
|
268 mi.mi_fwordlen = STRLEN(mi.mi_fword);
|
236
|
269
|
300
|
270 /* Check the caps type of the word. */
|
|
271 mi.mi_capflags = captype(ptr, mi.mi_fend);
|
236
|
272
|
300
|
273 /* We always use the characters up to the next non-word character,
|
|
274 * also for bad words. */
|
|
275 mi.mi_end = mi.mi_fend;
|
|
276 }
|
|
277 else
|
|
278 {
|
|
279 /* No word characters. Don't case-fold anything, we may quickly
|
|
280 * find out this is not a word (but it could be!). */
|
|
281 mi.mi_fwordlen = 0;
|
|
282 mi.mi_capflags = 0;
|
|
283 }
|
243
|
284
|
300
|
285 mi.mi_cend = mi.mi_fend;
|
|
286
|
|
287 /* The word is bad unless we recognize it. */
|
|
288 mi.mi_result = SP_BAD;
|
236
|
289
|
300
|
290 /*
|
|
291 * Loop over the languages specified in 'spelllang'.
|
|
292 * We check them all, because a matching word may be longer than an
|
|
293 * already found matching word.
|
|
294 */
|
|
295 for (mi.mi_lp = LANGP_ENTRY(wp->w_buffer->b_langp, 0);
|
|
296 mi.mi_lp->lp_slang != NULL; ++mi.mi_lp)
|
243
|
297 {
|
300
|
298 /* Check for a matching word in case-folded words. */
|
|
299 find_word(&mi, FALSE);
|
|
300
|
|
301 /* Try keep-case words. */
|
|
302 find_word(&mi, TRUE);
|
|
303 }
|
243
|
304
|
300
|
305 if (mi.mi_result != SP_OK)
|
|
306 {
|
|
307 /* When we are at a non-word character there is no error, just
|
|
308 * skip over the character (try looking for a word after it). */
|
|
309 if (!spell_iswordc(ptr))
|
243
|
310 {
|
300
|
311 #ifdef FEAT_MBYTE
|
|
312 if (has_mbyte)
|
|
313 return mb_ptr2len_check(ptr);
|
|
314 #endif
|
|
315 return 1;
|
243
|
316 }
|
|
317
|
300
|
318 if (mi.mi_result == SP_BAD)
|
|
319 *attrp = highlight_attr[HLF_SPB];
|
|
320 else if (mi.mi_result == SP_RARE)
|
|
321 *attrp = highlight_attr[HLF_SPR];
|
|
322 else
|
|
323 *attrp = highlight_attr[HLF_SPL];
|
243
|
324 }
|
|
325 }
|
|
326
|
300
|
327 return (int)(mi.mi_end - ptr);
|
236
|
328 }
|
|
329
|
|
330 /*
|
300
|
331 * Check if the word at "mip->mi_word" is in the tree.
|
|
332 * When "keepcap" is TRUE check in keep-case word tree.
|
|
333 *
|
|
334 * For a match mip->mi_result is updated.
|
243
|
335 */
|
|
336 static void
|
300
|
337 find_word(mip, keepcap)
|
243
|
338 matchinf_T *mip;
|
300
|
339 int keepcap;
|
243
|
340 {
|
300
|
341 int arridx = 0;
|
|
342 int endlen[MAXWLEN]; /* length at possible word endings */
|
|
343 int endidx[MAXWLEN]; /* possible word endings */
|
|
344 int endidxcnt = 0;
|
|
345 int len;
|
|
346 int wlen = 0;
|
|
347 int flen;
|
|
348 int c;
|
|
349 char_u *ptr;
|
|
350 unsigned lo, hi, m;
|
243
|
351 #ifdef FEAT_MBYTE
|
300
|
352 char_u *s;
|
|
353 char_u *p;
|
243
|
354 #endif
|
300
|
355 int res;
|
302
|
356 int valid = FALSE;
|
300
|
357 slang_T *slang = mip->mi_lp->lp_slang;
|
|
358 unsigned flags;
|
|
359 char_u *byts;
|
|
360 int *idxs;
|
243
|
361
|
300
|
362 if (keepcap)
|
236
|
363 {
|
300
|
364 /* Check for word with matching case in keep-case tree. */
|
|
365 ptr = mip->mi_word;
|
|
366 flen = 9999; /* no case folding, always enough bytes */
|
|
367 byts = slang->sl_kbyts;
|
|
368 idxs = slang->sl_kidxs;
|
236
|
369 }
|
|
370 else
|
|
371 {
|
300
|
372 /* Check for case-folded in case-folded tree. */
|
|
373 ptr = mip->mi_fword;
|
|
374 flen = mip->mi_fwordlen; /* available case-folded bytes */
|
|
375 byts = slang->sl_fbyts;
|
|
376 idxs = slang->sl_fidxs;
|
243
|
377 }
|
|
378
|
300
|
379 if (byts == NULL)
|
|
380 return; /* array is empty */
|
236
|
381
|
|
382 /*
|
300
|
383 * Repeat advancing in the tree until there is a byte that doesn't match,
|
|
384 * we reach the end of the tree or we reach the end of the line.
|
236
|
385 */
|
300
|
386 for (;;)
|
236
|
387 {
|
300
|
388 if (flen == 0 && *mip->mi_fend != NUL)
|
236
|
389 {
|
300
|
390 /* Need to fold at least one more character. Do until next
|
|
391 * non-word character for efficiency. */
|
|
392 do
|
236
|
393 {
|
|
394 #ifdef FEAT_MBYTE
|
|
395 if (has_mbyte)
|
300
|
396 flen += mb_ptr2len_check(mip->mi_fend + flen);
|
236
|
397 else
|
|
398 #endif
|
300
|
399 ++flen;
|
|
400 } while (spell_iswordc(mip->mi_fend + flen));
|
|
401
|
|
402 (void)spell_casefold(mip->mi_fend, flen,
|
|
403 mip->mi_fword + mip->mi_fwordlen,
|
|
404 MAXWLEN - mip->mi_fwordlen);
|
|
405 mip->mi_fend += flen;
|
|
406 flen = STRLEN(mip->mi_fword + mip->mi_fwordlen);
|
|
407 mip->mi_fwordlen += flen;
|
|
408 }
|
|
409
|
|
410 len = byts[arridx++];
|
|
411
|
|
412 /* If the first possible byte is a zero the word could end here.
|
|
413 * Remember this index, we first check for the longest word. */
|
|
414 if (byts[arridx] == 0)
|
|
415 {
|
|
416 endlen[endidxcnt] = wlen;
|
|
417 endidx[endidxcnt++] = arridx++;
|
|
418 --len;
|
|
419
|
|
420 /* Skip over the zeros, there can be several flag/region
|
|
421 * combinations. */
|
|
422 while (len > 0 && byts[arridx] == 0)
|
|
423 {
|
|
424 ++arridx;
|
|
425 --len;
|
|
426 }
|
|
427 if (len == 0)
|
|
428 break; /* no children, word must end here */
|
|
429 }
|
|
430
|
|
431 /* Stop looking at end of the line. */
|
|
432 if (ptr[wlen] == NUL)
|
|
433 break;
|
|
434
|
|
435 /* Perform a binary search in the list of accepted bytes. */
|
|
436 c = ptr[wlen];
|
|
437 lo = arridx;
|
|
438 hi = arridx + len - 1;
|
|
439 while (lo < hi)
|
|
440 {
|
|
441 m = (lo + hi) / 2;
|
|
442 if (byts[m] > c)
|
|
443 hi = m - 1;
|
|
444 else if (byts[m] < c)
|
|
445 lo = m + 1;
|
|
446 else
|
|
447 {
|
|
448 lo = hi = m;
|
|
449 break;
|
236
|
450 }
|
|
451 }
|
300
|
452
|
|
453 /* Stop if there is no matching byte. */
|
|
454 if (hi < lo || byts[lo] != c)
|
|
455 break;
|
|
456
|
|
457 /* Continue at the child (if there is one). */
|
|
458 arridx = idxs[lo];
|
|
459 ++wlen;
|
|
460 --flen;
|
236
|
461 }
|
|
462
|
300
|
463 /*
|
|
464 * Verify that one of the possible endings is valid. Try the longest
|
|
465 * first.
|
|
466 */
|
|
467 while (endidxcnt > 0)
|
|
468 {
|
|
469 --endidxcnt;
|
|
470 arridx = endidx[endidxcnt];
|
|
471 wlen = endlen[endidxcnt];
|
236
|
472
|
300
|
473 #ifdef FEAT_MBYTE
|
|
474 if ((*mb_head_off)(ptr, ptr + wlen) > 0)
|
|
475 continue; /* not at first byte of character */
|
|
476 #endif
|
|
477 if (spell_iswordc(ptr + wlen))
|
|
478 continue; /* next char is a word character */
|
|
479
|
|
480 #ifdef FEAT_MBYTE
|
|
481 if (!keepcap && has_mbyte)
|
|
482 {
|
|
483 /* Compute byte length in original word, length may change
|
|
484 * when folding case. */
|
|
485 p = mip->mi_word;
|
|
486 for (s = ptr; s < ptr + wlen; mb_ptr_adv(s))
|
|
487 mb_ptr_adv(p);
|
|
488 wlen = p - mip->mi_word;
|
|
489 }
|
|
490 #endif
|
236
|
491
|
300
|
492 /* Check flags and region. Repeat this if there are more
|
|
493 * flags/region alternatives until there is a match. */
|
|
494 res = SP_BAD;
|
|
495 for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0; --len)
|
|
496 {
|
|
497 flags = idxs[arridx];
|
|
498 if (keepcap)
|
|
499 {
|
|
500 /* For "keepcap" tree the case is always right. */
|
|
501 valid = TRUE;
|
|
502 }
|
|
503 else
|
|
504 {
|
|
505 /* Check that the word is in the required case. */
|
|
506 if (mip->mi_cend != mip->mi_word + wlen)
|
|
507 {
|
|
508 /* mi_capflags was set for a different word
|
|
509 * length, need to do it again. */
|
|
510 mip->mi_cend = mip->mi_word + wlen;
|
|
511 mip->mi_capflags = captype(mip->mi_word,
|
|
512 mip->mi_cend);
|
|
513 }
|
|
514
|
|
515 valid = (mip->mi_capflags == WF_ALLCAP
|
|
516 || ((flags & WF_ALLCAP) == 0
|
|
517 && ((flags & WF_ONECAP) == 0
|
|
518 || mip->mi_capflags == WF_ONECAP)));
|
|
519 }
|
236
|
520
|
300
|
521 if (valid && res != SP_OK)
|
|
522 {
|
|
523 if (flags & WF_REGION)
|
|
524 {
|
|
525 /* Check region. */
|
|
526 if ((mip->mi_lp->lp_region & (flags >> 8)) != 0)
|
|
527 res = SP_OK;
|
|
528 else
|
|
529 res = SP_LOCAL;
|
|
530 }
|
|
531 else if (flags & WF_RARE)
|
|
532 res = SP_RARE;
|
|
533 else
|
|
534 res = SP_OK;
|
|
535 }
|
236
|
536
|
300
|
537 if (res == SP_OK)
|
|
538 break;
|
|
539 ++arridx;
|
|
540 }
|
|
541
|
|
542 if (valid)
|
|
543 {
|
|
544 /* Valid word! Always use the longest match. */
|
|
545 if (mip->mi_end < mip->mi_word + wlen)
|
|
546 mip->mi_end = mip->mi_word + wlen;
|
|
547 if (mip->mi_result != SP_OK)
|
|
548 mip->mi_result = res;
|
|
549 break;
|
|
550 }
|
|
551 }
|
236
|
552 }
|
|
553
|
300
|
554
|
236
|
555 /*
|
|
556 * Move to next spell error.
|
|
557 * Return OK if found, FAIL otherwise.
|
|
558 */
|
|
559 int
|
|
560 spell_move_to(dir, allwords)
|
|
561 int dir; /* FORWARD or BACKWARD */
|
|
562 int allwords; /* TRUE for "[s" and "]s" */
|
|
563 {
|
249
|
564 linenr_T lnum;
|
|
565 pos_T found_pos;
|
236
|
566 char_u *line;
|
|
567 char_u *p;
|
|
568 int attr = 0;
|
|
569 int len;
|
249
|
570 int has_syntax = syntax_present(curbuf);
|
|
571 int col;
|
|
572 int can_spell;
|
236
|
573
|
|
574 if (!curwin->w_p_spell || *curwin->w_buffer->b_p_spl == NUL)
|
|
575 {
|
|
576 EMSG(_("E756: Spell checking not enabled"));
|
|
577 return FAIL;
|
|
578 }
|
|
579
|
249
|
580 /*
|
|
581 * Start looking for bad word at the start of the line, because we can't
|
|
582 * start halfway a word, we don't know where it starts or ends.
|
|
583 *
|
|
584 * When searching backwards, we continue in the line to find the last
|
|
585 * bad word (in the cursor line: before the cursor).
|
|
586 */
|
|
587 lnum = curwin->w_cursor.lnum;
|
|
588 found_pos.lnum = 0;
|
236
|
589
|
|
590 while (!got_int)
|
|
591 {
|
249
|
592 line = ml_get(lnum);
|
|
593 p = line;
|
|
594
|
236
|
595 while (*p != NUL)
|
|
596 {
|
300
|
597 /* When searching backward don't search after the cursor. */
|
|
598 if (dir == BACKWARD
|
|
599 && lnum == curwin->w_cursor.lnum
|
|
600 && (colnr_T)(p - line) >= curwin->w_cursor.col)
|
|
601 break;
|
249
|
602
|
300
|
603 /* start of word */
|
|
604 len = spell_check(curwin, p, &attr);
|
249
|
605
|
300
|
606 if (attr != 0)
|
|
607 {
|
|
608 /* We found a bad word. Check the attribute. */
|
|
609 /* TODO: check for syntax @Spell cluster. */
|
|
610 if (allwords || attr == highlight_attr[HLF_SPB])
|
236
|
611 {
|
300
|
612 /* When searching forward only accept a bad word after
|
|
613 * the cursor. */
|
|
614 if (dir == BACKWARD
|
|
615 || lnum > curwin->w_cursor.lnum
|
|
616 || (lnum == curwin->w_cursor.lnum
|
|
617 && (colnr_T)(p - line)
|
|
618 > curwin->w_cursor.col))
|
236
|
619 {
|
300
|
620 if (has_syntax)
|
249
|
621 {
|
300
|
622 col = p - line;
|
|
623 (void)syn_get_id(lnum, (colnr_T)col,
|
|
624 FALSE, &can_spell);
|
249
|
625
|
300
|
626 /* have to get the line again, a multi-line
|
|
627 * regexp may make it invalid */
|
|
628 line = ml_get(lnum);
|
|
629 p = line + col;
|
|
630 }
|
|
631 else
|
|
632 can_spell = TRUE;
|
249
|
633
|
300
|
634 if (can_spell)
|
|
635 {
|
|
636 found_pos.lnum = lnum;
|
|
637 found_pos.col = p - line;
|
249
|
638 #ifdef FEAT_VIRTUALEDIT
|
300
|
639 found_pos.coladd = 0;
|
249
|
640 #endif
|
300
|
641 if (dir == FORWARD)
|
|
642 {
|
|
643 /* No need to search further. */
|
|
644 curwin->w_cursor = found_pos;
|
|
645 return OK;
|
249
|
646 }
|
|
647 }
|
236
|
648 }
|
|
649 }
|
300
|
650 attr = 0;
|
236
|
651 }
|
|
652
|
300
|
653 /* advance to character after the word */
|
|
654 p += len;
|
|
655 if (*p == NUL)
|
|
656 break;
|
236
|
657 }
|
|
658
|
|
659 /* Advance to next line. */
|
249
|
660 if (dir == BACKWARD)
|
|
661 {
|
|
662 if (found_pos.lnum != 0)
|
|
663 {
|
|
664 /* Use the last match in the line. */
|
|
665 curwin->w_cursor = found_pos;
|
|
666 return OK;
|
|
667 }
|
|
668 if (lnum == 1)
|
|
669 return FAIL;
|
|
670 --lnum;
|
|
671 }
|
|
672 else
|
|
673 {
|
|
674 if (lnum == curbuf->b_ml.ml_line_count)
|
|
675 return FAIL;
|
|
676 ++lnum;
|
|
677 }
|
236
|
678
|
|
679 line_breakcheck();
|
|
680 }
|
|
681
|
|
682 return FAIL; /* interrupted */
|
|
683 }
|
|
684
|
|
685 /*
|
|
686 * Load word list for "lang" from a Vim spell file.
|
|
687 * "lang" must be the language without the region: "en" or "en-rare".
|
|
688 */
|
|
689 static slang_T *
|
|
690 spell_load_lang(lang)
|
|
691 char_u *lang;
|
|
692 {
|
|
693 slang_T *lp;
|
|
694 char_u fname_enc[80];
|
|
695 char_u *p;
|
|
696 int r;
|
|
697
|
|
698 lp = slang_alloc(lang);
|
|
699 if (lp != NULL)
|
|
700 {
|
|
701 /* Find all spell files for "lang" in 'runtimepath' and load them.
|
|
702 * Use 'encoding', except that we use "latin1" for "latin9". */
|
|
703 #ifdef FEAT_MBYTE
|
|
704 if (STRLEN(p_enc) < 60 && STRCMP(p_enc, "iso-8859-15") != 0)
|
|
705 p = p_enc;
|
|
706 else
|
|
707 #endif
|
|
708 p = (char_u *)"latin1";
|
272
|
709 vim_snprintf((char *)fname_enc, sizeof(fname_enc),
|
|
710 "spell/%s.%s.spl", lang, p);
|
236
|
711
|
|
712 r = do_in_runtimepath(fname_enc, TRUE, spell_load_file, lp);
|
240
|
713 if (r == FAIL && !lp->sl_error)
|
|
714 {
|
|
715 /* Try loading the ASCII version. */
|
272
|
716 vim_snprintf((char *)fname_enc, sizeof(fname_enc),
|
|
717 "spell/%s.ascii.spl", lang);
|
240
|
718
|
|
719 r = do_in_runtimepath(fname_enc, TRUE, spell_load_file, lp);
|
|
720 }
|
236
|
721 if (r == FAIL || lp->sl_error)
|
|
722 {
|
|
723 slang_free(lp);
|
|
724 lp = NULL;
|
|
725 if (r == FAIL)
|
|
726 smsg((char_u *)_("Warning: Cannot find word list \"%s\""),
|
|
727 fname_enc + 6);
|
|
728 }
|
|
729 else
|
|
730 {
|
|
731 lp->sl_next = first_lang;
|
|
732 first_lang = lp;
|
|
733 }
|
|
734 }
|
|
735
|
|
736 return lp;
|
|
737 }
|
|
738
|
|
739 /*
|
|
740 * Allocate a new slang_T.
|
|
741 * Caller must fill "sl_next".
|
|
742 */
|
|
743 static slang_T *
|
|
744 slang_alloc(lang)
|
|
745 char_u *lang;
|
|
746 {
|
|
747 slang_T *lp;
|
|
748
|
300
|
749 lp = (slang_T *)alloc_clear(sizeof(slang_T));
|
236
|
750 if (lp != NULL)
|
|
751 {
|
|
752 lp->sl_name = vim_strsave(lang);
|
|
753 ga_init2(&lp->sl_rep, sizeof(repentry_T), 4);
|
|
754 }
|
|
755 return lp;
|
|
756 }
|
|
757
|
|
758 /*
|
|
759 * Free the contents of an slang_T and the structure itself.
|
|
760 */
|
|
761 static void
|
|
762 slang_free(lp)
|
|
763 slang_T *lp;
|
|
764 {
|
|
765 vim_free(lp->sl_name);
|
300
|
766 vim_free(lp->sl_fbyts);
|
|
767 vim_free(lp->sl_kbyts);
|
|
768 vim_free(lp->sl_fidxs);
|
|
769 vim_free(lp->sl_kidxs);
|
236
|
770 ga_clear(&lp->sl_rep);
|
|
771 vim_free(lp->sl_try);
|
|
772 vim_free(lp);
|
|
773 }
|
|
774
|
|
775 /*
|
|
776 * Load one spell file into an slang_T.
|
|
777 * Invoked through do_in_runtimepath().
|
|
778 */
|
|
779 static void
|
|
780 spell_load_file(fname, cookie)
|
|
781 char_u *fname;
|
|
782 void *cookie; /* points to the slang_T to be filled */
|
|
783 {
|
|
784 slang_T *lp = cookie;
|
|
785 FILE *fd;
|
|
786 char_u buf[MAXWLEN + 1];
|
|
787 char_u *p;
|
|
788 int i;
|
300
|
789 int len;
|
236
|
790 int round;
|
|
791 char_u *save_sourcing_name = sourcing_name;
|
|
792 linenr_T save_sourcing_lnum = sourcing_lnum;
|
255
|
793 int cnt, ccnt;
|
|
794 char_u *fol;
|
236
|
795
|
|
796 fd = fopen((char *)fname, "r");
|
|
797 if (fd == NULL)
|
|
798 {
|
|
799 EMSG2(_(e_notopen), fname);
|
255
|
800 goto endFAIL;
|
236
|
801 }
|
|
802
|
|
803 /* Set sourcing_name, so that error messages mention the file name. */
|
|
804 sourcing_name = fname;
|
|
805 sourcing_lnum = 0;
|
|
806
|
255
|
807 /* <HEADER>: <fileID> <regioncnt> <regionname> ...
|
|
808 * <charflagslen> <charflags> <fcharslen> <fchars> */
|
236
|
809 for (i = 0; i < VIMSPELLMAGICL; ++i)
|
|
810 buf[i] = getc(fd); /* <fileID> */
|
|
811 if (STRNCMP(buf, VIMSPELLMAGIC, VIMSPELLMAGICL) != 0)
|
|
812 {
|
|
813 EMSG(_("E757: Wrong file ID in spell file"));
|
255
|
814 goto endFAIL;
|
236
|
815 }
|
|
816
|
|
817 cnt = getc(fd); /* <regioncnt> */
|
255
|
818 if (cnt < 0)
|
236
|
819 {
|
|
820 truncerr:
|
|
821 EMSG(_("E758: Truncated spell file"));
|
255
|
822 goto endFAIL;
|
236
|
823 }
|
|
824 if (cnt > 8)
|
|
825 {
|
|
826 formerr:
|
|
827 EMSG(_("E759: Format error in spell file"));
|
255
|
828 goto endFAIL;
|
236
|
829 }
|
|
830 for (i = 0; i < cnt; ++i)
|
|
831 {
|
|
832 lp->sl_regions[i * 2] = getc(fd); /* <regionname> */
|
|
833 lp->sl_regions[i * 2 + 1] = getc(fd);
|
|
834 }
|
|
835 lp->sl_regions[cnt * 2] = NUL;
|
|
836
|
255
|
837 cnt = getc(fd); /* <charflagslen> */
|
|
838 if (cnt > 0)
|
|
839 {
|
300
|
840 p = alloc((unsigned)cnt);
|
255
|
841 if (p == NULL)
|
|
842 goto endFAIL;
|
|
843 for (i = 0; i < cnt; ++i)
|
|
844 p[i] = getc(fd); /* <charflags> */
|
|
845
|
|
846 ccnt = (getc(fd) << 8) + getc(fd); /* <fcharslen> */
|
|
847 if (ccnt <= 0)
|
300
|
848 {
|
|
849 vim_free(p);
|
255
|
850 goto formerr;
|
300
|
851 }
|
|
852 fol = alloc((unsigned)ccnt + 1);
|
255
|
853 if (fol == NULL)
|
300
|
854 {
|
|
855 vim_free(p);
|
255
|
856 goto endFAIL;
|
300
|
857 }
|
255
|
858 for (i = 0; i < ccnt; ++i)
|
|
859 fol[i] = getc(fd); /* <fchars> */
|
|
860 fol[i] = NUL;
|
|
861
|
|
862 /* Set the word-char flags and fill spell_isupper() table. */
|
300
|
863 i = set_spell_charflags(p, cnt, fol);
|
|
864 vim_free(p);
|
|
865 vim_free(fol);
|
|
866 if (i == FAIL)
|
255
|
867 goto formerr;
|
|
868 }
|
|
869 else
|
|
870 {
|
|
871 /* When <charflagslen> is zero then <fcharlen> must also be zero. */
|
|
872 cnt = (getc(fd) << 8) + getc(fd);
|
|
873 if (cnt != 0)
|
|
874 goto formerr;
|
|
875 }
|
|
876
|
236
|
877 /* <SUGGEST> : <suggestlen> <more> ... */
|
|
878 /* TODO, just skip this for now */
|
|
879 i = (getc(fd) << 24) + (getc(fd) << 16) + (getc(fd) << 8) + getc(fd);
|
|
880 while (i-- > 0)
|
|
881 if (getc(fd) == EOF) /* <suggestlen> */
|
|
882 goto truncerr;
|
|
883
|
300
|
884 /* round 1: <LWORDTREE>
|
|
885 * round 2: <KWORDTREE> */
|
|
886 for (round = 1; round <= 2; ++round)
|
236
|
887 {
|
300
|
888 /* The tree size was computed when writing the file, so that we can
|
|
889 * allocate it as one long block. <nodecount> */
|
|
890 len = (getc(fd) << 24) + (getc(fd) << 16) + (getc(fd) << 8) + getc(fd);
|
|
891 if (len < 0)
|
|
892 goto truncerr;
|
|
893 if (len > 0)
|
236
|
894 {
|
300
|
895 /* Allocate the byte array. */
|
|
896 p = lalloc((long_u)len, TRUE);
|
|
897 if (p == NULL)
|
|
898 goto endFAIL;
|
|
899 if (round == 1)
|
|
900 lp->sl_fbyts = p;
|
|
901 else
|
|
902 lp->sl_kbyts = p;
|
236
|
903
|
300
|
904 /* Allocate the index array. */
|
|
905 p = lalloc_clear((long_u)(len * sizeof(int)), TRUE);
|
|
906 if (p == NULL)
|
|
907 goto endFAIL;
|
|
908 if (round == 1)
|
|
909 lp->sl_fidxs = (int *)p;
|
|
910 else
|
|
911 lp->sl_kidxs = (int *)p;
|
|
912
|
|
913
|
|
914 /* Read the tree and store it in the array. */
|
|
915 i = read_tree(fd,
|
|
916 round == 1 ? lp->sl_fbyts : lp->sl_kbyts,
|
|
917 round == 1 ? lp->sl_fidxs : lp->sl_kidxs,
|
|
918 len, 0);
|
|
919 if (i == -1)
|
|
920 goto truncerr;
|
|
921 if (i < 0)
|
236
|
922 goto formerr;
|
|
923 }
|
300
|
924 }
|
243
|
925
|
255
|
926 goto endOK;
|
|
927
|
|
928 endFAIL:
|
236
|
929 lp->sl_error = TRUE;
|
255
|
930
|
|
931 endOK:
|
236
|
932 if (fd != NULL)
|
|
933 fclose(fd);
|
|
934 sourcing_name = save_sourcing_name;
|
|
935 sourcing_lnum = save_sourcing_lnum;
|
|
936 }
|
|
937
|
|
938 /*
|
300
|
939 * Read one row of siblings from the spell file and store it in the byte array
|
|
940 * "byts" and index array "idxs". Recursively read the children.
|
|
941 *
|
|
942 * NOTE: The code here must match put_tree().
|
|
943 *
|
|
944 * Returns the index follosing the siblings.
|
|
945 * Returns -1 if the file is shorter than expected.
|
|
946 * Returns -2 if there is a format error.
|
236
|
947 */
|
300
|
948 static int
|
|
949 read_tree(fd, byts, idxs, maxidx, startidx)
|
|
950 FILE *fd;
|
|
951 char_u *byts;
|
|
952 int *idxs;
|
|
953 int maxidx; /* size of arrays */
|
|
954 int startidx; /* current index in "byts" and "idxs" */
|
236
|
955 {
|
300
|
956 int len;
|
|
957 int i;
|
|
958 int n;
|
|
959 int idx = startidx;
|
|
960 int c;
|
|
961 #define SHARED_MASK 0x8000000
|
236
|
962
|
300
|
963 len = getc(fd); /* <siblingcount> */
|
|
964 if (len <= 0)
|
|
965 return -1;
|
|
966
|
|
967 if (startidx + len >= maxidx)
|
|
968 return -2;
|
|
969 byts[idx++] = len;
|
|
970
|
|
971 /* Read the byte values, flag/region bytes and shared indexes. */
|
|
972 for (i = 1; i <= len; ++i)
|
236
|
973 {
|
300
|
974 c = getc(fd); /* <byte> */
|
|
975 if (c < 0)
|
|
976 return -1;
|
|
977 if (c <= BY_SPECIAL)
|
|
978 {
|
|
979 if (c == BY_NOFLAGS)
|
|
980 {
|
|
981 /* No flags, all regions. */
|
|
982 idxs[idx] = 0;
|
|
983 c = 0;
|
|
984 }
|
|
985 else if (c == BY_FLAGS)
|
|
986 {
|
|
987 /* Read flags and option region. */
|
|
988 c = getc(fd); /* <flags> */
|
|
989 if (c & WF_REGION)
|
|
990 c = (getc(fd) << 8) + c; /* <region> */
|
|
991 idxs[idx] = c;
|
|
992 c = 0;
|
|
993 }
|
|
994 else /* c == BY_INDEX */
|
|
995 {
|
|
996 /* <nodeidx> */
|
|
997 n = (getc(fd) << 16) + (getc(fd) << 8) + getc(fd);
|
|
998 if (n < 0 || n >= maxidx)
|
|
999 return -2;
|
|
1000 idxs[idx] = n + SHARED_MASK;
|
|
1001 c = getc(fd); /* <xbyte> */
|
|
1002 }
|
|
1003 }
|
|
1004 byts[idx++] = c;
|
236
|
1005 }
|
|
1006
|
300
|
1007 /* Recursively read the children for non-shared siblings.
|
|
1008 * Skip the end-of-word ones (zero byte value) and the shared ones (and
|
|
1009 * remove SHARED_MASK) */
|
|
1010 for (i = 1; i <= len; ++i)
|
|
1011 if (byts[startidx + i] != 0)
|
|
1012 {
|
|
1013 if (idxs[startidx + i] & SHARED_MASK)
|
|
1014 idxs[startidx + i] &= ~SHARED_MASK;
|
|
1015 else
|
|
1016 {
|
|
1017 idxs[startidx + i] = idx;
|
|
1018 idx = read_tree(fd, byts, idxs, maxidx, idx);
|
|
1019 if (idx < 0)
|
|
1020 break;
|
|
1021 }
|
|
1022 }
|
236
|
1023
|
300
|
1024 return idx;
|
236
|
1025 }
|
|
1026
|
|
1027 /*
|
|
1028 * Parse 'spelllang' and set buf->b_langp accordingly.
|
|
1029 * Returns an error message or NULL.
|
|
1030 */
|
|
1031 char_u *
|
|
1032 did_set_spelllang(buf)
|
|
1033 buf_T *buf;
|
|
1034 {
|
|
1035 garray_T ga;
|
|
1036 char_u *lang;
|
|
1037 char_u *e;
|
|
1038 char_u *region;
|
|
1039 int region_mask;
|
|
1040 slang_T *lp;
|
|
1041 int c;
|
|
1042 char_u lbuf[MAXWLEN + 1];
|
|
1043
|
|
1044 ga_init2(&ga, sizeof(langp_T), 2);
|
|
1045
|
|
1046 /* loop over comma separated languages. */
|
|
1047 for (lang = buf->b_p_spl; *lang != NUL; lang = e)
|
|
1048 {
|
|
1049 e = vim_strchr(lang, ',');
|
|
1050 if (e == NULL)
|
|
1051 e = lang + STRLEN(lang);
|
240
|
1052 region = NULL;
|
236
|
1053 if (e > lang + 2)
|
|
1054 {
|
|
1055 if (e - lang >= MAXWLEN)
|
|
1056 {
|
|
1057 ga_clear(&ga);
|
|
1058 return e_invarg;
|
|
1059 }
|
|
1060 if (lang[2] == '_')
|
|
1061 region = lang + 3;
|
|
1062 }
|
|
1063
|
|
1064 for (lp = first_lang; lp != NULL; lp = lp->sl_next)
|
|
1065 if (STRNICMP(lp->sl_name, lang, 2) == 0)
|
|
1066 break;
|
|
1067
|
|
1068 if (lp == NULL)
|
|
1069 {
|
|
1070 /* Not found, load the language. */
|
|
1071 STRNCPY(lbuf, lang, e - lang);
|
|
1072 lbuf[e - lang] = NUL;
|
|
1073 if (region != NULL)
|
|
1074 mch_memmove(lbuf + 2, lbuf + 5, e - lang - 4);
|
|
1075 lp = spell_load_lang(lbuf);
|
|
1076 }
|
|
1077
|
|
1078 if (lp != NULL)
|
|
1079 {
|
|
1080 if (region == NULL)
|
|
1081 region_mask = REGION_ALL;
|
|
1082 else
|
|
1083 {
|
|
1084 /* find region in sl_regions */
|
|
1085 c = find_region(lp->sl_regions, region);
|
|
1086 if (c == REGION_ALL)
|
|
1087 {
|
|
1088 c = *e;
|
|
1089 *e = NUL;
|
|
1090 smsg((char_u *)_("Warning: region %s not supported"), lang);
|
|
1091 *e = c;
|
|
1092 region_mask = REGION_ALL;
|
|
1093 }
|
|
1094 else
|
|
1095 region_mask = 1 << c;
|
|
1096 }
|
|
1097
|
|
1098 if (ga_grow(&ga, 1) == FAIL)
|
|
1099 {
|
|
1100 ga_clear(&ga);
|
|
1101 return e_outofmem;
|
|
1102 }
|
|
1103 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = lp;
|
|
1104 LANGP_ENTRY(ga, ga.ga_len)->lp_region = region_mask;
|
|
1105 ++ga.ga_len;
|
|
1106 }
|
|
1107
|
|
1108 if (*e == ',')
|
|
1109 ++e;
|
|
1110 }
|
|
1111
|
|
1112 /* Add a NULL entry to mark the end of the list. */
|
|
1113 if (ga_grow(&ga, 1) == FAIL)
|
|
1114 {
|
|
1115 ga_clear(&ga);
|
|
1116 return e_outofmem;
|
|
1117 }
|
|
1118 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = NULL;
|
|
1119 ++ga.ga_len;
|
|
1120
|
|
1121 /* Everything is fine, store the new b_langp value. */
|
|
1122 ga_clear(&buf->b_langp);
|
|
1123 buf->b_langp = ga;
|
|
1124
|
|
1125 return NULL;
|
|
1126 }
|
|
1127
|
|
1128 /*
|
|
1129 * Find the region "region[2]" in "rp" (points to "sl_regions").
|
|
1130 * Each region is simply stored as the two characters of it's name.
|
|
1131 * Returns the index if found, REGION_ALL if not found.
|
|
1132 */
|
|
1133 static int
|
|
1134 find_region(rp, region)
|
|
1135 char_u *rp;
|
|
1136 char_u *region;
|
|
1137 {
|
|
1138 int i;
|
|
1139
|
|
1140 for (i = 0; ; i += 2)
|
|
1141 {
|
|
1142 if (rp[i] == NUL)
|
|
1143 return REGION_ALL;
|
|
1144 if (rp[i] == region[0] && rp[i + 1] == region[1])
|
|
1145 break;
|
|
1146 }
|
|
1147 return i / 2;
|
|
1148 }
|
|
1149
|
|
1150 /*
|
|
1151 * Return type of word:
|
|
1152 * w word 0
|
300
|
1153 * Word WF_ONECAP
|
|
1154 * W WORD WF_ALLCAP
|
|
1155 * WoRd wOrd WF_KEEPCAP
|
236
|
1156 */
|
|
1157 static int
|
|
1158 captype(word, end)
|
|
1159 char_u *word;
|
|
1160 char_u *end;
|
|
1161 {
|
|
1162 char_u *p;
|
|
1163 int c;
|
|
1164 int firstcap;
|
|
1165 int allcap;
|
|
1166 int past_second = FALSE; /* past second word char */
|
|
1167
|
|
1168 /* find first letter */
|
|
1169 for (p = word; !spell_iswordc(p); mb_ptr_adv(p))
|
|
1170 if (p >= end)
|
|
1171 return 0; /* only non-word characters, illegal word */
|
|
1172 #ifdef FEAT_MBYTE
|
|
1173 c = mb_ptr2char_adv(&p);
|
|
1174 #else
|
|
1175 c = *p++;
|
|
1176 #endif
|
255
|
1177 firstcap = allcap = spell_isupper(c);
|
236
|
1178
|
|
1179 /*
|
|
1180 * Need to check all letters to find a word with mixed upper/lower.
|
|
1181 * But a word with an upper char only at start is a ONECAP.
|
|
1182 */
|
|
1183 for ( ; p < end; mb_ptr_adv(p))
|
|
1184 if (spell_iswordc(p))
|
|
1185 {
|
|
1186 #ifdef FEAT_MBYTE
|
|
1187 c = mb_ptr2char(p);
|
|
1188 #else
|
|
1189 c = *p;
|
|
1190 #endif
|
255
|
1191 if (!spell_isupper(c))
|
236
|
1192 {
|
|
1193 /* UUl -> KEEPCAP */
|
|
1194 if (past_second && allcap)
|
300
|
1195 return WF_KEEPCAP;
|
236
|
1196 allcap = FALSE;
|
|
1197 }
|
|
1198 else if (!allcap)
|
|
1199 /* UlU -> KEEPCAP */
|
300
|
1200 return WF_KEEPCAP;
|
236
|
1201 past_second = TRUE;
|
|
1202 }
|
|
1203
|
|
1204 if (allcap)
|
300
|
1205 return WF_ALLCAP;
|
236
|
1206 if (firstcap)
|
300
|
1207 return WF_ONECAP;
|
236
|
1208 return 0;
|
|
1209 }
|
|
1210
|
|
1211 # if defined(FEAT_MBYTE) || defined(PROTO)
|
|
1212 /*
|
|
1213 * Clear all spelling tables and reload them.
|
|
1214 * Used after 'encoding' is set.
|
|
1215 */
|
|
1216 void
|
|
1217 spell_reload()
|
|
1218 {
|
|
1219 buf_T *buf;
|
|
1220 slang_T *lp;
|
|
1221
|
|
1222 /* Initialize the table for spell_iswordc(). */
|
|
1223 init_spell_chartab();
|
|
1224
|
|
1225 /* Unload all allocated memory. */
|
|
1226 while (first_lang != NULL)
|
|
1227 {
|
|
1228 lp = first_lang;
|
|
1229 first_lang = lp->sl_next;
|
|
1230 slang_free(lp);
|
|
1231 }
|
|
1232
|
|
1233 /* Go through all buffers and handle 'spelllang'. */
|
|
1234 for (buf = firstbuf; buf != NULL; buf = buf->b_next)
|
|
1235 {
|
|
1236 ga_clear(&buf->b_langp);
|
|
1237 if (*buf->b_p_spl != NUL)
|
|
1238 did_set_spelllang(buf);
|
|
1239 }
|
|
1240 }
|
|
1241 # endif
|
|
1242
|
|
1243
|
|
1244 #if defined(FEAT_MBYTE) || defined(PROTO)
|
|
1245 /*
|
|
1246 * Functions for ":mkspell".
|
|
1247 * Only possible with the multi-byte feature.
|
|
1248 */
|
|
1249
|
300
|
#define MAXLINELEN  500		/* Maximum length in bytes of a line in a .aff
				   and .dic file. */
/*
 * Main structure to store the contents of a ".aff" file.
 * It is filled by spell_read_aff() and cleaned up by spell_free_aff().
 */
typedef struct afffile_S
{
    char_u	*af_enc;	/* "SET", normalized, alloc'ed string or NULL */
    char_u	*af_try;	/* "TRY" line in "af_enc" encoding */
				/* NOTE(review): spell_read_aff() stores the
				 * item after the line was converted to
				 * 'encoding' when conversion is active;
				 * confirm which encoding is meant. */
    hashtab_T	af_pref;	/* hashtable for prefixes, affheader_T */
    hashtab_T	af_suff;	/* hashtable for suffixes, affheader_T */
    garray_T	af_rep;		/* list of repentry_T entries from REP lines */
} afffile_T;
|
|
1263
|
|
typedef struct affentry_S affentry_T;
/* Affix entry from ".aff" file.  Used for prefixes and suffixes.
 * The entries for one affix name form a singly linked list that starts at
 * "ah_first" of the affheader_T. */
struct affentry_S
{
    affentry_T	*ae_next;	/* next affix with same name/number */
    char_u	*ae_chop;	/* text to chop off basic word (can be NULL) */
    char_u	*ae_add;	/* text to add to basic word (can be NULL) */
    char_u	*ae_cond;	/* condition (NULL for ".") */
    regprog_T	*ae_prog;	/* regexp program for ae_cond or NULL */
};
|
|
1274
|
|
/* Affix header from ".aff" file.  Used for af_pref and af_suff. */
typedef struct affheader_S
{
    char_u	ah_key[2];	/* key for hashtable == name of affix entry */
    int		ah_combine;	/* suffix may combine with prefix */
    affentry_T	*ah_first;	/* first affix entry */
} affheader_T;

/* Get the affheader_T for hashtable item "hi".  The key pointer is cast
 * directly, "ah_key" is the first member of affheader_T. */
#define HI2AH(hi)   ((affheader_T *)(hi)->hi_key)
|
|
1284
|
|
/*
 * Structure that is used to store the items in the word tree.  This avoids
 * the need to keep track of each allocated thing, it's freed all at once
 * after ":mkspell" is done.
 * Blocks are handed out piece by piece by getroom() and released by
 * free_blocks().
 */
#define  SBLOCKSIZE 16000	/* size of sb_data */
typedef struct sblock_S sblock_T;
struct sblock_S
{
    sblock_T	*sb_next;	/* next block in list */
    int		sb_used;	/* nr of bytes already in use */
    char_u	sb_data[1];	/* data, actually longer */
};
|
|
1298
|
|
1299 /*
|
300
|
1300 * A node in the tree.
|
236
|
1301 */
|
300
|
1302 typedef struct wordnode_S wordnode_T;
|
|
1303 struct wordnode_S
|
236
|
1304 {
|
300
|
1305 char_u wn_hashkey[6]; /* room for the hash key */
|
|
1306 wordnode_T *wn_next; /* next node with same hash key */
|
|
1307 wordnode_T *wn_child; /* child (next byte in word) */
|
|
1308 wordnode_T *wn_sibling; /* next sibling (alternate byte in word,
|
|
1309 always sorted) */
|
|
1310 wordnode_T *wn_wnode; /* parent node that will write this node */
|
|
1311 int wn_index; /* index in written nodes (valid after first
|
|
1312 round) */
|
|
1313 char_u wn_byte; /* Byte for this node. NUL for word end */
|
|
1314 char_u wn_flags; /* when wn_byte is NUL: WF_ flags */
|
|
1315 char_u wn_region; /* when wn_byte is NUL: region mask */
|
236
|
1316 };
|
|
1317
|
300
|
1318 #define HI2WN(hi) (wordnode_T *)((hi)->hi_key)
|
236
|
1319
|
300
|
/*
 * Info used while reading the spell files.
 * A pointer to this structure is passed to the functions that read the
 * ".aff", ".dic" and word list files and store the words.
 */
typedef struct spellinfo_S
{
    wordnode_T	*si_foldroot;	/* tree with case-folded words */
    wordnode_T	*si_keeproot;	/* tree with keep-case words */
    sblock_T	*si_blocks;	/* memory blocks used */
    int		si_ascii;	/* handling only ASCII words */
    int		si_region;	/* region mask */
    vimconv_T	si_conv;	/* for conversion to 'encoding' */
    int		si_memtot;	/* runtime memory used */
} spellinfo_T;
|
249
|
1333
|
300
|
/* Prototypes for the static functions used for ":mkspell". */
static afffile_T *spell_read_aff __ARGS((char_u *fname, spellinfo_T *spin));
static int has_non_ascii __ARGS((char_u *s));
static void spell_free_aff __ARGS((afffile_T *aff));
static int spell_read_dic __ARGS((char_u *fname, spellinfo_T *spin, afffile_T *affile));
static int store_aff_word __ARGS((char_u *word, spellinfo_T *spin, char_u *afflist, hashtab_T *ht, hashtab_T *xht, int comb));
static int spell_read_wordfile __ARGS((char_u *fname, spellinfo_T *spin));
static void *getroom __ARGS((sblock_T **blp, size_t len));
static char_u *getroom_save __ARGS((sblock_T **blp, char_u *s));
static void free_blocks __ARGS((sblock_T *bl));
static wordnode_T *wordtree_alloc __ARGS((sblock_T **blp));
static int store_word __ARGS((char_u *word, spellinfo_T *spin));
static int tree_add_word __ARGS((char_u *word, wordnode_T *tree, int flags, int region, sblock_T **blp));
static void wordtree_compress __ARGS((wordnode_T *root));
static int node_compress __ARGS((wordnode_T *node, hashtab_T *ht, int *tot));
static int node_equal __ARGS((wordnode_T *n1, wordnode_T *n2));
static void write_vim_spell __ARGS((char_u *fname, spellinfo_T *spin, int regcount, char_u *regchars));
static int put_tree __ARGS((FILE *fd, wordnode_T *node, int index, int regionmask));
|
236
|
1351
|
|
1352 /*
|
|
1353 * Read an affix ".aff" file.
|
|
1354 * Returns an afffile_T, NULL for failure.
|
|
1355 */
|
|
1356 static afffile_T *
|
300
|
1357 spell_read_aff(fname, spin)
|
236
|
1358 char_u *fname;
|
300
|
1359 spellinfo_T *spin;
|
236
|
1360 {
|
|
1361 FILE *fd;
|
|
1362 afffile_T *aff;
|
|
1363 char_u rline[MAXLINELEN];
|
|
1364 char_u *line;
|
|
1365 char_u *pc = NULL;
|
|
1366 char_u *(items[6]);
|
|
1367 int itemcnt;
|
|
1368 char_u *p;
|
|
1369 int lnum = 0;
|
|
1370 affheader_T *cur_aff = NULL;
|
|
1371 int aff_todo = 0;
|
|
1372 hashtab_T *tp;
|
255
|
1373 char_u *low = NULL;
|
|
1374 char_u *fol = NULL;
|
|
1375 char_u *upp = NULL;
|
236
|
1376
|
300
|
1377 /*
|
|
1378 * Open the file.
|
|
1379 */
|
236
|
1380 fd = fopen((char *)fname, "r");
|
|
1381 if (fd == NULL)
|
|
1382 {
|
|
1383 EMSG2(_(e_notopen), fname);
|
|
1384 return NULL;
|
|
1385 }
|
|
1386
|
|
1387 smsg((char_u *)_("Reading affix file %s..."), fname);
|
|
1388 out_flush();
|
|
1389
|
300
|
1390 /*
|
|
1391 * Allocate and init the afffile_T structure.
|
|
1392 */
|
|
1393 aff = (afffile_T *)getroom(&spin->si_blocks, sizeof(afffile_T));
|
236
|
1394 if (aff == NULL)
|
|
1395 return NULL;
|
|
1396 hash_init(&aff->af_pref);
|
|
1397 hash_init(&aff->af_suff);
|
|
1398 ga_init2(&aff->af_rep, (int)sizeof(repentry_T), 20);
|
|
1399
|
|
1400 /*
|
|
1401 * Read all the lines in the file one by one.
|
|
1402 */
|
255
|
1403 while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int)
|
236
|
1404 {
|
255
|
1405 line_breakcheck();
|
236
|
1406 ++lnum;
|
|
1407
|
|
1408 /* Skip comment lines. */
|
|
1409 if (*rline == '#')
|
|
1410 continue;
|
|
1411
|
|
1412 /* Convert from "SET" to 'encoding' when needed. */
|
|
1413 vim_free(pc);
|
300
|
1414 if (spin->si_conv.vc_type != CONV_NONE)
|
236
|
1415 {
|
300
|
1416 pc = string_convert(&spin->si_conv, rline, NULL);
|
255
|
1417 if (pc == NULL)
|
|
1418 {
|
|
1419 smsg((char_u *)_("Conversion failure for word in %s line %d: %s"),
|
|
1420 fname, lnum, rline);
|
|
1421 continue;
|
|
1422 }
|
236
|
1423 line = pc;
|
|
1424 }
|
|
1425 else
|
|
1426 {
|
|
1427 pc = NULL;
|
|
1428 line = rline;
|
|
1429 }
|
|
1430
|
|
1431 /* Split the line up in white separated items. Put a NUL after each
|
|
1432 * item. */
|
|
1433 itemcnt = 0;
|
|
1434 for (p = line; ; )
|
|
1435 {
|
|
1436 while (*p != NUL && *p <= ' ') /* skip white space and CR/NL */
|
|
1437 ++p;
|
|
1438 if (*p == NUL)
|
|
1439 break;
|
300
|
1440 if (itemcnt == 6) /* too many items */
|
|
1441 break;
|
236
|
1442 items[itemcnt++] = p;
|
300
|
1443 while (*p > ' ') /* skip until white space or CR/NL */
|
236
|
1444 ++p;
|
|
1445 if (*p == NUL)
|
|
1446 break;
|
|
1447 *p++ = NUL;
|
|
1448 }
|
|
1449
|
|
1450 /* Handle non-empty lines. */
|
|
1451 if (itemcnt > 0)
|
|
1452 {
|
|
1453 if (STRCMP(items[0], "SET") == 0 && itemcnt == 2
|
|
1454 && aff->af_enc == NULL)
|
|
1455 {
|
300
|
1456 /* Setup for conversion from "ENC" to 'encoding'. */
|
|
1457 aff->af_enc = enc_canonize(items[1]);
|
|
1458 if (aff->af_enc != NULL && !spin->si_ascii
|
|
1459 && convert_setup(&spin->si_conv, aff->af_enc,
|
|
1460 p_enc) == FAIL)
|
|
1461 smsg((char_u *)_("Conversion in %s not supported: from %s to %s"),
|
|
1462 fname, aff->af_enc, p_enc);
|
236
|
1463 }
|
302
|
1464 else if (STRCMP(items[0], "NOSPLITSUGS") == 0 && itemcnt == 1)
|
|
1465 {
|
|
1466 /* ignored */
|
|
1467 }
|
236
|
1468 else if (STRCMP(items[0], "TRY") == 0 && itemcnt == 2
|
|
1469 && aff->af_try == NULL)
|
300
|
1470 {
|
|
1471 aff->af_try = getroom_save(&spin->si_blocks, items[1]);
|
|
1472 }
|
236
|
1473 else if ((STRCMP(items[0], "PFX") == 0
|
|
1474 || STRCMP(items[0], "SFX") == 0)
|
|
1475 && aff_todo == 0
|
|
1476 && itemcnt == 4)
|
|
1477 {
|
|
1478 /* New affix letter. */
|
300
|
1479 cur_aff = (affheader_T *)getroom(&spin->si_blocks,
|
|
1480 sizeof(affheader_T));
|
236
|
1481 if (cur_aff == NULL)
|
|
1482 break;
|
|
1483 cur_aff->ah_key[0] = *items[1];
|
|
1484 cur_aff->ah_key[1] = NUL;
|
|
1485 if (items[1][1] != NUL)
|
|
1486 smsg((char_u *)_("Affix name too long in %s line %d: %s"),
|
|
1487 fname, lnum, items[1]);
|
|
1488 if (*items[2] == 'Y')
|
|
1489 cur_aff->ah_combine = TRUE;
|
300
|
1490 else if (*items[2] != 'N')
|
236
|
1491 smsg((char_u *)_("Expected Y or N in %s line %d: %s"),
|
|
1492 fname, lnum, items[2]);
|
|
1493 if (*items[0] == 'P')
|
|
1494 tp = &aff->af_pref;
|
|
1495 else
|
|
1496 tp = &aff->af_suff;
|
300
|
1497 aff_todo = atoi((char *)items[3]);
|
236
|
1498 if (!HASHITEM_EMPTY(hash_find(tp, cur_aff->ah_key)))
|
300
|
1499 {
|
236
|
1500 smsg((char_u *)_("Duplicate affix in %s line %d: %s"),
|
|
1501 fname, lnum, items[1]);
|
300
|
1502 aff_todo = 0;
|
|
1503 }
|
236
|
1504 else
|
|
1505 hash_add(tp, cur_aff->ah_key);
|
|
1506 }
|
|
1507 else if ((STRCMP(items[0], "PFX") == 0
|
|
1508 || STRCMP(items[0], "SFX") == 0)
|
|
1509 && aff_todo > 0
|
|
1510 && STRCMP(cur_aff->ah_key, items[1]) == 0
|
|
1511 && itemcnt == 5)
|
|
1512 {
|
|
1513 affentry_T *aff_entry;
|
|
1514
|
|
1515 /* New item for an affix letter. */
|
|
1516 --aff_todo;
|
300
|
1517 aff_entry = (affentry_T *)getroom(&spin->si_blocks,
|
|
1518 sizeof(affentry_T));
|
236
|
1519 if (aff_entry == NULL)
|
|
1520 break;
|
240
|
1521
|
236
|
1522 if (STRCMP(items[2], "0") != 0)
|
300
|
1523 aff_entry->ae_chop = getroom_save(&spin->si_blocks,
|
|
1524 items[2]);
|
236
|
1525 if (STRCMP(items[3], "0") != 0)
|
300
|
1526 aff_entry->ae_add = getroom_save(&spin->si_blocks,
|
|
1527 items[3]);
|
236
|
1528
|
300
|
1529 /* Don't use an affix entry with non-ASCII characters when
|
|
1530 * "spin->si_ascii" is TRUE. */
|
|
1531 if (!spin->si_ascii || !(has_non_ascii(aff_entry->ae_chop)
|
240
|
1532 || has_non_ascii(aff_entry->ae_add)))
|
|
1533 {
|
|
1534 aff_entry->ae_next = cur_aff->ah_first;
|
|
1535 cur_aff->ah_first = aff_entry;
|
300
|
1536
|
|
1537 if (STRCMP(items[4], ".") != 0)
|
|
1538 {
|
|
1539 char_u buf[MAXLINELEN];
|
|
1540
|
|
1541 aff_entry->ae_cond = getroom_save(&spin->si_blocks,
|
|
1542 items[4]);
|
|
1543 if (*items[0] == 'P')
|
|
1544 sprintf((char *)buf, "^%s", items[4]);
|
|
1545 else
|
|
1546 sprintf((char *)buf, "%s$", items[4]);
|
|
1547 aff_entry->ae_prog = vim_regcomp(buf,
|
|
1548 RE_MAGIC + RE_STRING);
|
|
1549 }
|
240
|
1550 }
|
236
|
1551 }
|
255
|
1552 else if (STRCMP(items[0], "FOL") == 0 && itemcnt == 2)
|
|
1553 {
|
|
1554 if (fol != NULL)
|
|
1555 smsg((char_u *)_("Duplicate FOL in %s line %d"),
|
|
1556 fname, lnum);
|
|
1557 else
|
|
1558 fol = vim_strsave(items[1]);
|
|
1559 }
|
|
1560 else if (STRCMP(items[0], "LOW") == 0 && itemcnt == 2)
|
|
1561 {
|
|
1562 if (low != NULL)
|
|
1563 smsg((char_u *)_("Duplicate LOW in %s line %d"),
|
|
1564 fname, lnum);
|
|
1565 else
|
|
1566 low = vim_strsave(items[1]);
|
|
1567 }
|
|
1568 else if (STRCMP(items[0], "UPP") == 0 && itemcnt == 2)
|
|
1569 {
|
|
1570 if (upp != NULL)
|
|
1571 smsg((char_u *)_("Duplicate UPP in %s line %d"),
|
|
1572 fname, lnum);
|
|
1573 else
|
|
1574 upp = vim_strsave(items[1]);
|
|
1575 }
|
236
|
1576 else if (STRCMP(items[0], "REP") == 0 && itemcnt == 2)
|
|
1577 /* Ignore REP count */;
|
|
1578 else if (STRCMP(items[0], "REP") == 0 && itemcnt == 3)
|
|
1579 {
|
|
1580 repentry_T *rp;
|
|
1581
|
|
1582 /* REP item */
|
|
1583 if (ga_grow(&aff->af_rep, 1) == FAIL)
|
|
1584 break;
|
|
1585 rp = ((repentry_T *)aff->af_rep.ga_data) + aff->af_rep.ga_len;
|
300
|
1586 rp->re_from = getroom_save(&spin->si_blocks, items[1]);
|
|
1587 rp->re_to = getroom_save(&spin->si_blocks, items[2]);
|
236
|
1588 ++aff->af_rep.ga_len;
|
|
1589 }
|
300
|
1590 else
|
236
|
1591 smsg((char_u *)_("Unrecognized item in %s line %d: %s"),
|
|
1592 fname, lnum, items[0]);
|
|
1593 }
|
|
1594 }
|
|
1595
|
255
|
1596 if (fol != NULL || low != NULL || upp != NULL)
|
|
1597 {
|
260
|
1598 /* Don't write a word table for an ASCII file, so that we don't check
|
|
1599 * for conflicts with a word table that matches 'encoding'. */
|
300
|
1600 if (!spin->si_ascii)
|
260
|
1601 {
|
|
1602 if (fol == NULL || low == NULL || upp == NULL)
|
|
1603 smsg((char_u *)_("Missing FOL/LOW/UPP line in %s"), fname);
|
|
1604 else
|
|
1605 set_spell_chartab(fol, low, upp);
|
|
1606 }
|
255
|
1607
|
|
1608 vim_free(fol);
|
|
1609 vim_free(low);
|
|
1610 vim_free(upp);
|
|
1611 }
|
|
1612
|
236
|
1613 vim_free(pc);
|
|
1614 fclose(fd);
|
|
1615 return aff;
|
|
1616 }
|
|
1617
|
|
1618 /*
|
240
|
1619 * Return TRUE if string "s" contains a non-ASCII character (128 or higher).
|
|
1620 * When "s" is NULL FALSE is returned.
|
|
1621 */
|
|
1622 static int
|
|
1623 has_non_ascii(s)
|
|
1624 char_u *s;
|
|
1625 {
|
|
1626 char_u *p;
|
|
1627
|
|
1628 if (s != NULL)
|
|
1629 for (p = s; *p != NUL; ++p)
|
|
1630 if (*p >= 128)
|
|
1631 return TRUE;
|
|
1632 return FALSE;
|
|
1633 }
|
|
1634
|
|
1635 /*
|
236
|
1636 * Free the structure filled by spell_read_aff().
|
|
1637 */
|
|
1638 static void
|
|
1639 spell_free_aff(aff)
|
|
1640 afffile_T *aff;
|
|
1641 {
|
|
1642 hashtab_T *ht;
|
|
1643 hashitem_T *hi;
|
|
1644 int todo;
|
|
1645 affheader_T *ah;
|
300
|
1646 affentry_T *ae;
|
236
|
1647
|
|
1648 vim_free(aff->af_enc);
|
|
1649
|
300
|
1650 /* All this trouble to foree the "ae_prog" items... */
|
236
|
1651 for (ht = &aff->af_pref; ; ht = &aff->af_suff)
|
|
1652 {
|
|
1653 todo = ht->ht_used;
|
|
1654 for (hi = ht->ht_array; todo > 0; ++hi)
|
|
1655 {
|
|
1656 if (!HASHITEM_EMPTY(hi))
|
|
1657 {
|
|
1658 --todo;
|
|
1659 ah = HI2AH(hi);
|
300
|
1660 for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next)
|
|
1661 vim_free(ae->ae_prog);
|
236
|
1662 }
|
|
1663 }
|
|
1664 if (ht == &aff->af_suff)
|
|
1665 break;
|
|
1666 }
|
300
|
1667
|
236
|
1668 hash_clear(&aff->af_pref);
|
|
1669 hash_clear(&aff->af_suff);
|
|
1670 ga_clear(&aff->af_rep);
|
|
1671 }
|
|
1672
|
|
1673 /*
|
300
|
1674 * Read dictionary file "fname".
|
236
|
1675 * Returns OK or FAIL;
|
|
1676 */
|
|
1677 static int
|
300
|
1678 spell_read_dic(fname, spin, affile)
|
236
|
1679 char_u *fname;
|
300
|
1680 spellinfo_T *spin;
|
|
1681 afffile_T *affile;
|
236
|
1682 {
|
300
|
1683 hashtab_T ht;
|
236
|
1684 char_u line[MAXLINELEN];
|
300
|
1685 char_u *afflist;
|
|
1686 char_u *dw;
|
236
|
1687 char_u *pc;
|
|
1688 char_u *w;
|
|
1689 int l;
|
|
1690 hash_T hash;
|
|
1691 hashitem_T *hi;
|
|
1692 FILE *fd;
|
|
1693 int lnum = 1;
|
300
|
1694 int non_ascii = 0;
|
|
1695 int retval = OK;
|
|
1696 char_u message[MAXLINELEN + MAXWLEN];
|
236
|
1697
|
300
|
1698 /*
|
|
1699 * Open the file.
|
|
1700 */
|
236
|
1701 fd = fopen((char *)fname, "r");
|
|
1702 if (fd == NULL)
|
|
1703 {
|
|
1704 EMSG2(_(e_notopen), fname);
|
|
1705 return FAIL;
|
|
1706 }
|
|
1707
|
300
|
1708 /* The hashtable is only used to detect duplicated words. */
|
|
1709 hash_init(&ht);
|
|
1710
|
236
|
1711 smsg((char_u *)_("Reading dictionary file %s..."), fname);
|
|
1712 out_flush();
|
|
1713
|
|
1714 /* Read and ignore the first line: word count. */
|
|
1715 (void)vim_fgets(line, MAXLINELEN, fd);
|
|
1716 if (!isdigit(*skipwhite(line)))
|
|
1717 EMSG2(_("E760: No word count in %s"), fname);
|
|
1718
|
|
1719 /*
|
|
1720 * Read all the lines in the file one by one.
|
|
1721 * The words are converted to 'encoding' here, before being added to
|
|
1722 * the hashtable.
|
|
1723 */
|
255
|
1724 while (!vim_fgets(line, MAXLINELEN, fd) && !got_int)
|
236
|
1725 {
|
255
|
1726 line_breakcheck();
|
236
|
1727 ++lnum;
|
|
1728
|
300
|
1729 /* Remove CR, LF and white space from the end. White space halfway
|
|
1730 * the word is kept to allow e.g., "et al.". */
|
236
|
1731 l = STRLEN(line);
|
|
1732 while (l > 0 && line[l - 1] <= ' ')
|
|
1733 --l;
|
|
1734 if (l == 0)
|
|
1735 continue; /* empty line */
|
|
1736 line[l] = NUL;
|
|
1737
|
300
|
1738 /* This takes time, print a message now and then. */
|
|
1739 if ((lnum & 0x3ff) == 0)
|
|
1740 {
|
|
1741 vim_snprintf((char *)message, sizeof(message),
|
|
1742 _("line %6d - %s"), lnum, line);
|
|
1743 msg_start();
|
|
1744 msg_outtrans_attr(message, 0);
|
|
1745 msg_clr_eos();
|
|
1746 msg_didout = FALSE;
|
|
1747 msg_col = 0;
|
|
1748 out_flush();
|
|
1749 }
|
|
1750
|
236
|
1751 /* Find the optional affix names. */
|
300
|
1752 afflist = vim_strchr(line, '/');
|
|
1753 if (afflist != NULL)
|
|
1754 *afflist++ = NUL;
|
236
|
1755
|
300
|
1756 /* Skip non-ASCII words when "spin->si_ascii" is TRUE. */
|
|
1757 if (spin->si_ascii && has_non_ascii(line))
|
|
1758 {
|
|
1759 ++non_ascii;
|
240
|
1760 continue;
|
300
|
1761 }
|
240
|
1762
|
236
|
1763 /* Convert from "SET" to 'encoding' when needed. */
|
300
|
1764 if (spin->si_conv.vc_type != CONV_NONE)
|
236
|
1765 {
|
300
|
1766 pc = string_convert(&spin->si_conv, line, NULL);
|
255
|
1767 if (pc == NULL)
|
|
1768 {
|
|
1769 smsg((char_u *)_("Conversion failure for word in %s line %d: %s"),
|
|
1770 fname, lnum, line);
|
|
1771 continue;
|
|
1772 }
|
236
|
1773 w = pc;
|
|
1774 }
|
|
1775 else
|
|
1776 {
|
|
1777 pc = NULL;
|
|
1778 w = line;
|
|
1779 }
|
|
1780
|
300
|
1781 /* Store the word in the hashtable to be able to find duplicates. */
|
|
1782 dw = (char_u *)getroom_save(&spin->si_blocks, w);
|
236
|
1783 if (dw == NULL)
|
300
|
1784 retval = FAIL;
|
|
1785 vim_free(pc);
|
|
1786 if (retval == FAIL)
|
236
|
1787 break;
|
|
1788
|
300
|
1789 hash = hash_hash(dw);
|
|
1790 hi = hash_lookup(&ht, dw, hash);
|
236
|
1791 if (!HASHITEM_EMPTY(hi))
|
|
1792 smsg((char_u *)_("Duplicate word in %s line %d: %s"),
|
300
|
1793 fname, lnum, line);
|
236
|
1794 else
|
300
|
1795 hash_add_item(&ht, hi, dw, hash);
|
|
1796
|
|
1797 /* Add the word to the word tree(s). */
|
|
1798 if (store_word(dw, spin) == FAIL)
|
|
1799 retval = FAIL;
|
236
|
1800
|
300
|
1801 if (afflist != NULL)
|
|
1802 {
|
|
1803 /* Find all matching suffixes and add the resulting words.
|
|
1804 * Additionally do matching prefixes that combine. */
|
|
1805 if (store_aff_word(dw, spin, afflist,
|
|
1806 &affile->af_suff, &affile->af_pref, FALSE) == FAIL)
|
|
1807 retval = FAIL;
|
|
1808
|
|
1809 /* Find all matching prefixes and add the resulting words. */
|
|
1810 if (store_aff_word(dw, spin, afflist,
|
|
1811 &affile->af_pref, NULL, FALSE) == FAIL)
|
|
1812 retval = FAIL;
|
|
1813 }
|
236
|
1814 }
|
|
1815
|
300
|
1816 if (spin->si_ascii && non_ascii > 0)
|
|
1817 smsg((char_u *)_("Ignored %d words with non-ASCII characters"),
|
|
1818 non_ascii);
|
|
1819 hash_clear(&ht);
|
|
1820
|
236
|
1821 fclose(fd);
|
300
|
1822 return retval;
|
236
|
1823 }
|
|
1824
|
|
1825 /*
|
300
|
1826 * Apply affixes to a word and store the resulting words.
|
|
1827 * "ht" is the hashtable with affentry_T that need to be applied, either
|
|
1828 * prefixes or suffixes.
|
|
1829 * "xht", when not NULL, is the prefix hashtable, to be used additionally on
|
|
1830 * the resulting words for combining affixes.
|
|
1831 *
|
|
1832 * Returns FAIL when out of memory.
|
236
|
1833 */
|
300
|
1834 static int
|
|
1835 store_aff_word(word, spin, afflist, ht, xht, comb)
|
|
1836 char_u *word; /* basic word start */
|
|
1837 spellinfo_T *spin; /* spell info */
|
|
1838 char_u *afflist; /* list of names of supported affixes */
|
|
1839 hashtab_T *ht;
|
|
1840 hashtab_T *xht;
|
|
1841 int comb; /* only use affixes that combine */
|
236
|
1842 {
|
|
1843 int todo;
|
|
1844 hashitem_T *hi;
|
300
|
1845 affheader_T *ah;
|
|
1846 affentry_T *ae;
|
|
1847 regmatch_T regmatch;
|
|
1848 char_u newword[MAXWLEN];
|
|
1849 int retval = OK;
|
|
1850 int i;
|
|
1851 char_u *p;
|
236
|
1852
|
300
|
1853 todo = ht->ht_used;
|
|
1854 for (hi = ht->ht_array; todo > 0 && retval == OK; ++hi)
|
236
|
1855 {
|
|
1856 if (!HASHITEM_EMPTY(hi))
|
|
1857 {
|
|
1858 --todo;
|
300
|
1859 ah = HI2AH(hi);
|
236
|
1860
|
300
|
1861 /* Check that the affix combines, if required, and that the word
|
|
1862 * supports this affix. */
|
|
1863 if ((!comb || ah->ah_combine)
|
|
1864 && vim_strchr(afflist, *ah->ah_key) != NULL)
|
236
|
1865 {
|
300
|
1866 /* Loop over all affix entries with this name. */
|
|
1867 for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next)
|
236
|
1868 {
|
300
|
1869 /* Check the condition. It's not logical to match case
|
|
1870 * here, but it is required for compatibility with
|
|
1871 * Myspell. */
|
|
1872 regmatch.regprog = ae->ae_prog;
|
|
1873 regmatch.rm_ic = FALSE;
|
|
1874 if (ae->ae_prog == NULL
|
|
1875 || vim_regexec(®match, word, (colnr_T)0))
|
|
1876 {
|
|
1877 /* Match. Remove the chop and add the affix. */
|
|
1878 if (xht == NULL)
|
240
|
1879 {
|
300
|
1880 /* prefix: chop/add at the start of the word */
|
|
1881 if (ae->ae_add == NULL)
|
|
1882 *newword = NUL;
|
|
1883 else
|
|
1884 STRCPY(newword, ae->ae_add);
|
|
1885 p = word;
|
|
1886 if (ae->ae_chop != NULL)
|
|
1887 /* Skip chop string. */
|
|
1888 for (i = mb_charlen(ae->ae_chop); i > 0; --i)
|
|
1889 mb_ptr_adv(p);
|
|
1890 STRCAT(newword, p);
|
|
1891 }
|
|
1892 else
|
|
1893 {
|
|
1894 /* suffix: chop/add at the end of the word */
|
|
1895 STRCPY(newword, word);
|
|
1896 if (ae->ae_chop != NULL)
|
|
1897 {
|
|
1898 /* Remove chop string. */
|
|
1899 p = newword + STRLEN(newword);
|
|
1900 for (i = mb_charlen(ae->ae_chop); i > 0; --i)
|
|
1901 mb_ptr_back(newword, p);
|
|
1902 *p = NUL;
|
|
1903 }
|
|
1904 if (ae->ae_add != NULL)
|
|
1905 STRCAT(newword, ae->ae_add);
|
240
|
1906 }
|
|
1907
|
300
|
1908 /* Store the modified word. */
|
|
1909 if (store_word(newword, spin) == FAIL)
|
|
1910 retval = FAIL;
|
236
|
1911
|
300
|
1912 /* When added a suffix and combining is allowed also
|
|
1913 * try adding prefixes additionally. */
|
|
1914 if (xht != NULL && ah->ah_combine)
|
|
1915 if (store_aff_word(newword, spin, afflist,
|
|
1916 xht, NULL, TRUE) == FAIL)
|
|
1917 retval = FAIL;
|
236
|
1918 }
|
|
1919 }
|
|
1920 }
|
|
1921 }
|
|
1922 }
|
|
1923
|
|
1924 return retval;
|
|
1925 }
|
|
1926
|
|
1927 /*
|
300
|
1928 * Read a file with a list of words.
|
236
|
1929 */
|
|
1930 static int
|
300
|
1931 spell_read_wordfile(fname, spin)
|
|
1932 char_u *fname;
|
|
1933 spellinfo_T *spin;
|
236
|
1934 {
|
300
|
1935 FILE *fd;
|
|
1936 long lnum = 0;
|
|
1937 char_u rline[MAXLINELEN];
|
|
1938 char_u *line;
|
|
1939 char_u *pc = NULL;
|
|
1940 int l;
|
|
1941 int retval = OK;
|
|
1942 int did_word = FALSE;
|
|
1943 int non_ascii = 0;
|
|
1944 char_u *enc;
|
236
|
1945
|
300
|
1946 /*
|
|
1947 * Open the file.
|
|
1948 */
|
|
1949 fd = fopen((char *)fname, "r");
|
|
1950 if (fd == NULL)
|
236
|
1951 {
|
300
|
1952 EMSG2(_(e_notopen), fname);
|
|
1953 return FAIL;
|
236
|
1954 }
|
|
1955
|
300
|
1956 smsg((char_u *)_("Reading word file %s..."), fname);
|
|
1957 out_flush();
|
|
1958
|
|
1959 /*
|
|
1960 * Read all the lines in the file one by one.
|
|
1961 */
|
|
1962 while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int)
|
|
1963 {
|
|
1964 line_breakcheck();
|
|
1965 ++lnum;
|
|
1966
|
|
1967 /* Skip comment lines. */
|
|
1968 if (*rline == '#')
|
|
1969 continue;
|
|
1970
|
|
1971 /* Remove CR, LF and white space from the end. */
|
|
1972 l = STRLEN(rline);
|
|
1973 while (l > 0 && rline[l - 1] <= ' ')
|
|
1974 --l;
|
|
1975 if (l == 0)
|
|
1976 continue; /* empty or blank line */
|
|
1977 rline[l] = NUL;
|
|
1978
|
|
1979 /* Convert from "=encoding={encoding}" to 'encoding' when needed. */
|
|
1980 vim_free(pc);
|
|
1981 if (spin->si_conv.vc_type != CONV_NONE)
|
|
1982 {
|
|
1983 pc = string_convert(&spin->si_conv, rline, NULL);
|
|
1984 if (pc == NULL)
|
|
1985 {
|
|
1986 smsg((char_u *)_("Conversion failure for word in %s line %d: %s"),
|
|
1987 fname, lnum, rline);
|
|
1988 continue;
|
|
1989 }
|
|
1990 line = pc;
|
|
1991 }
|
|
1992 else
|
|
1993 {
|
|
1994 pc = NULL;
|
|
1995 line = rline;
|
|
1996 }
|
|
1997
|
|
1998 if (*line == '=')
|
|
1999 {
|
|
2000 if (STRNCMP(line + 1, "encoding=", 9) == 0)
|
|
2001 {
|
|
2002 if (spin->si_conv.vc_type != CONV_NONE)
|
|
2003 smsg((char_u *)_("Duplicate =encoding= line ignored in %s line %d: %s"),
|
|
2004 fname, lnum, line);
|
|
2005 else if (did_word)
|
|
2006 smsg((char_u *)_("=encoding= line after word ignored in %s line %d: %s"),
|
|
2007 fname, lnum, line);
|
|
2008 else
|
|
2009 {
|
|
2010 /* Setup for conversion to 'encoding'. */
|
|
2011 enc = enc_canonize(line + 10);
|
|
2012 if (enc != NULL && !spin->si_ascii
|
|
2013 && convert_setup(&spin->si_conv, enc,
|
|
2014 p_enc) == FAIL)
|
|
2015 smsg((char_u *)_("Conversion in %s not supported: from %s to %s"),
|
|
2016 fname, line + 10, p_enc);
|
|
2017 vim_free(enc);
|
|
2018 }
|
|
2019 }
|
|
2020 else
|
|
2021 smsg((char_u *)_("= line ignored in %s line %d: %s"),
|
|
2022 fname, lnum, line);
|
|
2023 continue;
|
|
2024 }
|
|
2025
|
|
2026 /* Skip non-ASCII words when "spin->si_ascii" is TRUE. */
|
|
2027 if (spin->si_ascii && has_non_ascii(line))
|
|
2028 {
|
|
2029 ++non_ascii;
|
|
2030 continue;
|
|
2031 }
|
|
2032
|
|
2033 /* Normal word: store it. */
|
|
2034 if (store_word(line, spin) == FAIL)
|
|
2035 {
|
|
2036 retval = FAIL;
|
|
2037 break;
|
|
2038 }
|
|
2039 did_word = TRUE;
|
|
2040 }
|
|
2041
|
|
2042 vim_free(pc);
|
|
2043 fclose(fd);
|
|
2044
|
|
2045 if (spin->si_ascii && non_ascii > 0)
|
|
2046 smsg((char_u *)_("Ignored %d words with non-ASCII characters"),
|
|
2047 non_ascii);
|
|
2048 return retval;
|
236
|
2049 }
|
|
2050
|
|
2051 /*
|
300
|
2052 * Get part of an sblock_T, "len" bytes long.
|
|
2053 * This avoids calling free() for every little struct we use.
|
|
2054 * The memory is cleared to all zeros.
|
|
2055 * Returns NULL when out of memory.
|
|
2056 */
|
|
2057 static void *
|
|
2058 getroom(blp, len)
|
|
2059 sblock_T **blp;
|
|
2060 size_t len; /* length needed */
|
|
2061 {
|
|
2062 char_u *p;
|
|
2063 sblock_T *bl = *blp;
|
|
2064
|
|
2065 if (bl == NULL || bl->sb_used + len > SBLOCKSIZE)
|
|
2066 {
|
|
2067 /* Allocate a block of memory. This is not freed until much later. */
|
|
2068 bl = (sblock_T *)alloc_clear((unsigned)(sizeof(sblock_T) + SBLOCKSIZE));
|
|
2069 if (bl == NULL)
|
|
2070 return NULL;
|
|
2071 bl->sb_next = *blp;
|
|
2072 *blp = bl;
|
|
2073 bl->sb_used = 0;
|
|
2074 }
|
|
2075
|
|
2076 p = bl->sb_data + bl->sb_used;
|
|
2077 bl->sb_used += len;
|
|
2078
|
|
2079 return p;
|
|
2080 }
|
|
2081
|
|
2082 /*
|
|
2083 * Make a copy of a string into memory allocated with getroom().
|
|
2084 */
|
|
2085 static char_u *
|
|
2086 getroom_save(blp, s)
|
|
2087 sblock_T **blp;
|
|
2088 char_u *s;
|
|
2089 {
|
|
2090 char_u *sc;
|
|
2091
|
|
2092 sc = (char_u *)getroom(blp, STRLEN(s) + 1);
|
|
2093 if (sc != NULL)
|
|
2094 STRCPY(sc, s);
|
|
2095 return sc;
|
|
2096 }
|
|
2097
|
|
2098
|
|
2099 /*
|
|
2100 * Free the list of allocated sblock_T.
|
236
|
2101 */
|
|
2102 static void
|
300
|
2103 free_blocks(bl)
|
|
2104 sblock_T *bl;
|
236
|
2105 {
|
300
|
2106 sblock_T *next;
|
236
|
2107
|
300
|
2108 while (bl != NULL)
|
236
|
2109 {
|
300
|
2110 next = bl->sb_next;
|
|
2111 vim_free(bl);
|
|
2112 bl = next;
|
236
|
2113 }
|
|
2114 }
|
|
2115
|
|
2116 /*
|
300
|
2117 * Allocate the root of a word tree.
|
236
|
2118 */
|
300
|
2119 static wordnode_T *
|
|
2120 wordtree_alloc(blp)
|
|
2121 sblock_T **blp;
|
236
|
2122 {
|
300
|
2123 return (wordnode_T *)getroom(blp, sizeof(wordnode_T));
|
236
|
2124 }
|
|
2125
|
|
2126 /*
|
300
|
2127 * Store a word in the tree(s).
|
|
2128 * Always store it in the case-folded tree.
|
|
2129 * For a keep-case word also store it in the keep-case tree.
|
236
|
2130 */
|
|
2131 static int
|
300
|
2132 store_word(word, spin)
|
|
2133 char_u *word;
|
|
2134 spellinfo_T *spin;
|
236
|
2135 {
|
300
|
2136 int len = STRLEN(word);
|
|
2137 int ct = captype(word, word + len);
|
|
2138 char_u foldword[MAXWLEN];
|
|
2139 int res;
|
236
|
2140
|
300
|
2141 (void)spell_casefold(word, len, foldword, MAXWLEN);
|
|
2142 res = tree_add_word(foldword, spin->si_foldroot, ct, spin->si_region,
|
|
2143 &spin->si_blocks);
|
|
2144 if (res == OK && ct == WF_KEEPCAP)
|
|
2145 res = tree_add_word(word, spin->si_keeproot, ct, spin->si_region,
|
|
2146 &spin->si_blocks);
|
|
2147 return res;
|
236
|
2148 }
|
|
2149
|
|
2150 /*
|
300
|
2151 * Add word "word" to a word tree at "root".
|
255
|
2152 * Returns FAIL when out of memory.
|
236
|
2153 */
|
255
|
2154 static int
|
300
|
2155 tree_add_word(word, root, flags, region, blp)
|
|
2156 char_u *word;
|
|
2157 wordnode_T *root;
|
|
2158 int flags;
|
|
2159 int region;
|
|
2160 sblock_T **blp;
|
236
|
2161 {
|
300
|
2162 wordnode_T *node = root;
|
|
2163 wordnode_T *np;
|
|
2164 wordnode_T **prev = NULL;
|
|
2165 int i;
|
255
|
2166
|
300
|
2167 /* Add each byte of the word to the tree, including the NUL at the end. */
|
|
2168 for (i = 0; ; ++i)
|
255
|
2169 {
|
300
|
2170 /* Look for the sibling that has the same character. They are sorted
|
|
2171 * on byte value, thus stop searching when a sibling is found with a
|
|
2172 * higher byte value. For zero bytes (end of word) check that the
|
|
2173 * flags are equal, there is a separate zero byte for each flag value.
|
|
2174 */
|
|
2175 while (node != NULL && (node->wn_byte < word[i]
|
|
2176 || (node->wn_byte == 0 && node->wn_flags != flags)))
|
236
|
2177 {
|
300
|
2178 prev = &node->wn_sibling;
|
|
2179 node = *prev;
|
236
|
2180 }
|
300
|
2181 if (node == NULL || node->wn_byte != word[i])
|
255
|
2182 {
|
300
|
2183 /* Allocate a new node. */
|
|
2184 np = (wordnode_T *)getroom(blp, sizeof(wordnode_T));
|
|
2185 if (np == NULL)
|
|
2186 return FAIL;
|
|
2187 np->wn_byte = word[i];
|
|
2188 *prev = np;
|
|
2189 np->wn_sibling = node;
|
|
2190 node = np;
|
255
|
2191 }
|
300
|
2192
|
|
2193 if (word[i] == NUL)
|
|
2194 {
|
|
2195 node->wn_flags = flags;
|
|
2196 node->wn_region |= region;
|
|
2197 break;
|
|
2198 }
|
|
2199 prev = &node->wn_child;
|
|
2200 node = *prev;
|
255
|
2201 }
|
|
2202
|
|
2203 return OK;
|
236
|
2204 }
|
|
2205
|
|
2206 /*
|
300
|
2207 * Compress a tree: find tails that are identical and can be shared.
|
|
2208 */
|
|
2209 static void
|
|
2210 wordtree_compress(root)
|
|
2211 wordnode_T *root;
|
|
2212 {
|
|
2213 hashtab_T ht;
|
|
2214 int n;
|
|
2215 int tot = 0;
|
|
2216
|
|
2217 if (root != NULL)
|
|
2218 {
|
|
2219 hash_init(&ht);
|
|
2220 n = node_compress(root, &ht, &tot);
|
|
2221 smsg((char_u *)_("Compressed %d of %d nodes; %d%% remaining"),
|
|
2222 n, tot, (tot - n) * 100 / tot);
|
|
2223 hash_clear(&ht);
|
|
2224 }
|
|
2225 }
|
|
2226
|
|
2227 /*
|
|
2228 * Compress a node, its siblings and its children, depth first.
|
|
2229 * Returns the number of compressed nodes.
|
236
|
2230 */
|
255
|
2231 static int
|
300
|
2232 node_compress(node, ht, tot)
|
|
2233 wordnode_T *node;
|
|
2234 hashtab_T *ht;
|
|
2235 int *tot; /* total count of nodes before compressing,
|
|
2236 incremented while going through the tree */
|
236
|
2237 {
|
300
|
2238 wordnode_T *np;
|
|
2239 wordnode_T *tp;
|
|
2240 wordnode_T *child;
|
|
2241 hash_T hash;
|
236
|
2242 hashitem_T *hi;
|
300
|
2243 int len = 0;
|
|
2244 unsigned nr, n;
|
|
2245 int compressed = 0;
|
236
|
2246
|
300
|
2247 /*
|
|
2248 * Go through the list of siblings. Compress each child and then try
|
|
2249 * finding an identical child to replace it.
|
|
2250 * Note that with "child" we mean not just the node that is pointed to,
|
|
2251 * but the whole list of siblings, of which the node is the first.
|
|
2252 */
|
|
2253 for (np = node; np != NULL; np = np->wn_sibling)
|
236
|
2254 {
|
300
|
2255 ++len;
|
|
2256 if ((child = np->wn_child) != NULL)
|
|
2257 {
|
|
2258 /* Compress the child. This fills wn_hashkey. */
|
|
2259 compressed += node_compress(child, ht, tot);
|
|
2260
|
|
2261 /* Try to find an identical child. */
|
|
2262 hash = hash_hash(child->wn_hashkey);
|
|
2263 hi = hash_lookup(ht, child->wn_hashkey, hash);
|
|
2264 tp = NULL;
|
|
2265 if (!HASHITEM_EMPTY(hi))
|
|
2266 {
|
|
2267 /* There are children with an identical hash value. Now check
|
|
2268 * if there is one that is really identical. */
|
|
2269 for (tp = HI2WN(hi); tp != NULL; tp = tp->wn_next)
|
|
2270 if (node_equal(child, tp))
|
|
2271 {
|
|
2272 /* Found one! Now use that child in place of the
|
|
2273 * current one. This means the current child is
|
|
2274 * dropped from the tree. */
|
|
2275 np->wn_child = tp;
|
|
2276 ++compressed;
|
|
2277 break;
|
|
2278 }
|
|
2279 if (tp == NULL)
|
|
2280 {
|
|
2281 /* No other child with this hash value equals the child of
|
|
2282 * the node, add it to the linked list after the first
|
|
2283 * item. */
|
|
2284 tp = HI2WN(hi);
|
|
2285 child->wn_next = tp->wn_next;
|
|
2286 tp->wn_next = child;
|
|
2287 }
|
|
2288 }
|
|
2289 else
|
|
2290 /* No other child has this hash value, add it to the
|
|
2291 * hashtable. */
|
|
2292 hash_add_item(ht, hi, child->wn_hashkey, hash);
|
|
2293 }
|
236
|
2294 }
|
300
|
2295 *tot += len;
|
|
2296
|
|
2297 /*
|
|
2298 * Make a hash key for the node and its siblings, so that we can quickly
|
|
2299 * find a lookalike node. This must be done after compressing the sibling
|
|
2300 * list, otherwise the hash key would become invalid by the compression.
|
|
2301 */
|
|
2302 node->wn_hashkey[0] = len;
|
|
2303 nr = 0;
|
|
2304 for (np = node; np != NULL; np = np->wn_sibling)
|
236
|
2305 {
|
300
|
2306 if (np->wn_byte == NUL)
|
|
2307 /* end node: only use wn_flags and wn_region */
|
|
2308 n = np->wn_flags + (np->wn_region << 8);
|
|
2309 else
|
|
2310 /* byte node: use the byte value and the child pointer */
|
|
2311 n = np->wn_byte + ((long_u)np->wn_child << 8);
|
|
2312 nr = nr * 101 + n;
|
236
|
2313 }
|
300
|
2314
|
|
2315 /* Avoid NUL bytes, it terminates the hash key. */
|
|
2316 n = nr & 0xff;
|
|
2317 node->wn_hashkey[1] = n == 0 ? 1 : n;
|
|
2318 n = (nr >> 8) & 0xff;
|
|
2319 node->wn_hashkey[2] = n == 0 ? 1 : n;
|
|
2320 n = (nr >> 16) & 0xff;
|
|
2321 node->wn_hashkey[3] = n == 0 ? 1 : n;
|
|
2322 n = (nr >> 24) & 0xff;
|
|
2323 node->wn_hashkey[4] = n == 0 ? 1 : n;
|
|
2324 node->wn_hashkey[5] = NUL;
|
|
2325
|
|
2326 return compressed;
|
|
2327 }
|
|
2328
|
|
2329 /*
|
|
2330 * Return TRUE when two nodes have identical siblings and children.
|
|
2331 */
|
|
2332 static int
|
|
2333 node_equal(n1, n2)
|
|
2334 wordnode_T *n1;
|
|
2335 wordnode_T *n2;
|
|
2336 {
|
|
2337 wordnode_T *p1;
|
|
2338 wordnode_T *p2;
|
|
2339
|
|
2340 for (p1 = n1, p2 = n2; p1 != NULL && p2 != NULL;
|
|
2341 p1 = p1->wn_sibling, p2 = p2->wn_sibling)
|
|
2342 if (p1->wn_byte != p2->wn_byte
|
|
2343 || (p1->wn_byte == NUL
|
|
2344 ? (p1->wn_flags != p2->wn_flags
|
|
2345 || p1->wn_region != p2->wn_region)
|
|
2346 : (p1->wn_child != p2->wn_child)))
|
|
2347 break;
|
|
2348
|
|
2349 return p1 == NULL && p2 == NULL;
|
236
|
2350 }
|
|
2351
|
|
2352 /*
|
|
2353 * Write a number to file "fd", MSB first, in "len" bytes.
|
|
2354 */
|
255
|
2355 void
|
236
|
2356 put_bytes(fd, nr, len)
|
|
2357 FILE *fd;
|
|
2358 long_u nr;
|
|
2359 int len;
|
|
2360 {
|
|
2361 int i;
|
|
2362
|
|
2363 for (i = len - 1; i >= 0; --i)
|
|
2364 putc((int)(nr >> (i * 8)), fd);
|
|
2365 }
|
|
2366
|
|
2367 /*
|
|
2368 * Write the Vim spell file "fname".
|
|
2369 */
|
|
2370 static void
|
300
|
2371 write_vim_spell(fname, spin, regcount, regchars)
|
236
|
2372 char_u *fname;
|
300
|
2373 spellinfo_T *spin;
|
236
|
2374 int regcount; /* number of regions */
|
|
2375 char_u *regchars; /* region names */
|
|
2376 {
|
300
|
2377 FILE *fd;
|
|
2378 int regionmask;
|
236
|
2379 int round;
|
300
|
2380 wordnode_T *tree;
|
|
2381 int nodecount;
|
236
|
2382
|
300
|
2383 fd = fopen((char *)fname, "w");
|
|
2384 if (fd == NULL)
|
236
|
2385 {
|
|
2386 EMSG2(_(e_notopen), fname);
|
|
2387 return;
|
|
2388 }
|
|
2389
|
255
|
2390 /* <HEADER>: <fileID> <regioncnt> <regionname> ...
|
|
2391 * <charflagslen> <charflags> <fcharslen> <fchars> */
|
300
|
2392
|
|
2393 /* <fileID> */
|
|
2394 if (fwrite(VIMSPELLMAGIC, VIMSPELLMAGICL, (size_t)1, fd) != 1)
|
|
2395 EMSG(_(e_write));
|
236
|
2396
|
|
2397 /* write the region names if there is more than one */
|
|
2398 if (regcount > 1)
|
|
2399 {
|
300
|
2400 putc(regcount, fd); /* <regioncnt> <regionname> ... */
|
|
2401 fwrite(regchars, (size_t)(regcount * 2), (size_t)1, fd);
|
|
2402 regionmask = (1 << regcount) - 1;
|
236
|
2403 }
|
|
2404 else
|
|
2405 {
|
300
|
2406 putc(0, fd);
|
|
2407 regionmask = 0;
|
236
|
2408 }
|
|
2409
|
255
|
2410 /* Write the table with character flags and table for case folding.
|
260
|
2411 * <charflagslen> <charflags> <fcharlen> <fchars>
|
|
2412 * Skip this for ASCII, the table may conflict with the one used for
|
|
2413 * 'encoding'. */
|
300
|
2414 if (spin->si_ascii)
|
260
|
2415 {
|
300
|
2416 putc(0, fd);
|
|
2417 putc(0, fd);
|
|
2418 putc(0, fd);
|
260
|
2419 }
|
|
2420 else
|
300
|
2421 write_spell_chartab(fd);
|
255
|
2422
|
236
|
2423
|
255
|
2424 /* <SUGGEST> : <suggestlen> <more> ...
|
|
2425 * TODO. Only write a zero length for now. */
|
300
|
2426 put_bytes(fd, 0L, 4); /* <suggestlen> */
|
236
|
2427
|
302
|
2428 spin->si_memtot = 0;
|
|
2429
|
236
|
2430 /*
|
300
|
2431 * <LWORDTREE> <KWORDTREE>
|
236
|
2432 */
|
300
|
2433 for (round = 1; round <= 2; ++round)
|
236
|
2434 {
|
300
|
2435 tree = (round == 1) ? spin->si_foldroot : spin->si_keeproot;
|
236
|
2436
|
300
|
2437 /* Count the number of nodes. Needed to be able to allocate the
|
|
2438 * memory when reading the nodes. Also fills in the index for shared
|
|
2439 * nodes. */
|
|
2440 nodecount = put_tree(NULL, tree, 0, regionmask);
|
236
|
2441
|
300
|
2442 /* number of nodes in 4 bytes */
|
|
2443 put_bytes(fd, (long_u)nodecount, 4); /* <nodecount> */
|
302
|
2444 spin->si_memtot += nodecount + nodecount * sizeof(int);
|
236
|
2445
|
300
|
2446 /* Write the nodes. */
|
|
2447 (void)put_tree(fd, tree, 0, regionmask);
|
236
|
2448 }
|
|
2449
|
300
|
2450 fclose(fd);
|
236
|
2451 }
|
|
2452
|
|
2453 /*
|
300
|
2454 * Dump a word tree at node "node".
|
|
2455 *
|
|
2456 * This first writes the list of possible bytes (siblings). Then for each
|
|
2457 * byte recursively write the children.
|
|
2458 *
|
|
2459 * NOTE: The code here must match the code in read_tree(), since assumptions
|
|
2460 * are made about the indexes (so that we don't have to write them in the
|
|
2461 * file).
|
236
|
2462 *
|
300
|
2463 * Returns the number of nodes used.
|
236
|
2464 */
|
300
|
2465 static int
|
|
2466 put_tree(fd, node, index, regionmask)
|
|
2467 FILE *fd; /* NULL when only counting */
|
|
2468 wordnode_T *node;
|
|
2469 int index;
|
|
2470 int regionmask;
|
236
|
2471 {
|
300
|
2472 int newindex = index;
|
|
2473 int siblingcount = 0;
|
|
2474 wordnode_T *np;
|
236
|
2475 int flags;
|
300
|
2476
|
|
2477 /* If "node" is zero the tree is empty. */
|
|
2478 if (node == NULL)
|
|
2479 return 0;
|
|
2480
|
|
2481 /* Store the index where this node is written. */
|
|
2482 node->wn_index = index;
|
236
|
2483
|
300
|
2484 /* Count the number of siblings. */
|
|
2485 for (np = node; np != NULL; np = np->wn_sibling)
|
|
2486 ++siblingcount;
|
236
|
2487
|
300
|
2488 /* Write the sibling count. */
|
|
2489 if (fd != NULL)
|
|
2490 putc(siblingcount, fd); /* <siblingcount> */
|
236
|
2491
|
300
|
2492 /* Write each sibling byte and optionally extra info. */
|
|
2493 for (np = node; np != NULL; np = np->wn_sibling)
|
236
|
2494 {
|
300
|
2495 if (np->wn_byte == 0)
|
|
2496 {
|
|
2497 if (fd != NULL)
|
|
2498 {
|
|
2499 /* For a NUL byte (end of word) instead of the byte itself
|
|
2500 * we write the flag/region items. */
|
|
2501 flags = np->wn_flags;
|
|
2502 if (regionmask != 0 && np->wn_region != regionmask)
|
|
2503 flags |= WF_REGION;
|
|
2504 if (flags == 0)
|
|
2505 {
|
|
2506 /* word without flags or region */
|
|
2507 putc(BY_NOFLAGS, fd); /* <byte> */
|
|
2508 }
|
|
2509 else
|
|
2510 {
|
|
2511 putc(BY_FLAGS, fd); /* <byte> */
|
|
2512 putc(flags, fd); /* <flags> */
|
|
2513 if (flags & WF_REGION)
|
|
2514 putc(np->wn_region, fd); /* <regionmask> */
|
|
2515 }
|
|
2516 }
|
|
2517 }
|
|
2518 else
|
|
2519 {
|
|
2520 if (np->wn_child->wn_index != 0 && np->wn_child->wn_wnode != node)
|
|
2521 {
|
|
2522 /* The child is written elsewhere, write the reference. */
|
|
2523 if (fd != NULL)
|
|
2524 {
|
|
2525 putc(BY_INDEX, fd); /* <byte> */
|
|
2526 /* <nodeidx> */
|
|
2527 put_bytes(fd, (long_u)np->wn_child->wn_index, 3);
|
|
2528 }
|
|
2529 }
|
|
2530 else if (np->wn_child->wn_wnode == NULL)
|
|
2531 /* We will write the child below and give it an index. */
|
|
2532 np->wn_child->wn_wnode = node;
|
236
|
2533
|
300
|
2534 if (fd != NULL)
|
|
2535 if (putc(np->wn_byte, fd) == EOF) /* <byte> or <xbyte> */
|
|
2536 {
|
|
2537 EMSG(_(e_write));
|
|
2538 return 0;
|
|
2539 }
|
|
2540 }
|
236
|
2541 }
|
|
2542
|
300
|
2543 /* Space used in the array when reading: one for each sibling and one for
|
|
2544 * the count. */
|
|
2545 newindex += siblingcount + 1;
|
249
|
2546
|
300
|
2547 /* Recursively dump the children of each sibling. */
|
|
2548 for (np = node; np != NULL; np = np->wn_sibling)
|
|
2549 if (np->wn_byte != 0 && np->wn_child->wn_wnode == node)
|
|
2550 newindex = put_tree(fd, np->wn_child, newindex, regionmask);
|
249
|
2551
|
300
|
2552 return newindex;
|
236
|
2553 }
|
|
2554
|
|
2555
|
|
2556 /*
|
|
2557 * ":mkspell outfile infile ..."
|
|
2558 */
|
|
2559 void
|
|
2560 ex_mkspell(eap)
|
|
2561 exarg_T *eap;
|
|
2562 {
|
|
2563 int fcount;
|
|
2564 char_u **fnames;
|
|
2565 char_u fname[MAXPATHL];
|
|
2566 char_u wfname[MAXPATHL];
|
|
2567 afffile_T *(afile[8]);
|
|
2568 int i;
|
|
2569 int len;
|
|
2570 char_u region_name[16];
|
|
2571 struct stat st;
|
240
|
2572 char_u *arg = eap->arg;
|
255
|
2573 int error = FALSE;
|
300
|
2574 spellinfo_T spin;
|
|
2575
|
|
2576 vim_memset(&spin, 0, sizeof(spin));
|
240
|
2577
|
|
2578 if (STRNCMP(arg, "-ascii", 6) == 0)
|
|
2579 {
|
300
|
2580 spin.si_ascii = TRUE;
|
240
|
2581 arg = skipwhite(arg + 6);
|
|
2582 }
|
|
2583
|
|
2584 /* Expand all the remaining arguments (e.g., $VIMRUNTIME). */
|
|
2585 if (get_arglist_exp(arg, &fcount, &fnames) == FAIL)
|
236
|
2586 return;
|
|
2587 if (fcount < 2)
|
|
2588 EMSG(_(e_invarg)); /* need at least output and input names */
|
|
2589 else if (fcount > 9)
|
|
2590 EMSG(_("E754: Only up to 8 regions supported"));
|
|
2591 else
|
|
2592 {
|
|
2593 /* Check for overwriting before doing things that may take a lot of
|
|
2594 * time. */
|
272
|
2595 vim_snprintf((char *)wfname, sizeof(wfname), "%s.%s.spl", fnames[0],
|
300
|
2596 spin.si_ascii ? (char_u *)"ascii" : p_enc);
|
236
|
2597 if (!eap->forceit && mch_stat((char *)wfname, &st) >= 0)
|
|
2598 {
|
|
2599 EMSG(_(e_exists));
|
|
2600 goto theend;
|
|
2601 }
|
|
2602 if (mch_isdir(fnames[0]))
|
|
2603 {
|
|
2604 EMSG2(_(e_isadir2), fnames[0]);
|
|
2605 goto theend;
|
|
2606 }
|
|
2607
|
|
2608 /*
|
|
2609 * Init the aff and dic pointers.
|
|
2610 * Get the region names if there are more than 2 arguments.
|
|
2611 */
|
|
2612 for (i = 1; i < fcount; ++i)
|
|
2613 {
|
|
2614 afile[i - 1] = NULL;
|
300
|
2615
|
236
|
2616 if (fcount > 2)
|
|
2617 {
|
|
2618 len = STRLEN(fnames[i]);
|
|
2619 if (STRLEN(gettail(fnames[i])) < 5 || fnames[i][len - 3] != '_')
|
|
2620 {
|
|
2621 EMSG2(_("E755: Invalid region in %s"), fnames[i]);
|
|
2622 goto theend;
|
|
2623 }
|
|
2624 else
|
|
2625 {
|
|
2626 region_name[(i - 1) * 2] = TOLOWER_ASC(fnames[i][len - 2]);
|
|
2627 region_name[(i - 1) * 2 + 1] =
|
|
2628 TOLOWER_ASC(fnames[i][len - 1]);
|
|
2629 }
|
|
2630 }
|
|
2631 }
|
|
2632
|
255
|
2633 /* Clear the char type tables, don't want to use any of the currently
|
|
2634 * used spell properties. */
|
|
2635 init_spell_chartab();
|
|
2636
|
300
|
2637 spin.si_foldroot = wordtree_alloc(&spin.si_blocks);
|
|
2638 spin.si_keeproot = wordtree_alloc(&spin.si_blocks);
|
|
2639 if (spin.si_foldroot == NULL || spin.si_keeproot == NULL)
|
|
2640 {
|
|
2641 error = TRUE;
|
|
2642 goto theend;
|
|
2643 }
|
|
2644
|
236
|
2645 /*
|
|
2646 * Read all the .aff and .dic files.
|
|
2647 * Text is converted to 'encoding'.
|
300
|
2648 * Words are stored in the case-folded and keep-case trees.
|
236
|
2649 */
|
300
|
2650 for (i = 1; i < fcount && !error; ++i)
|
236
|
2651 {
|
300
|
2652 spin.si_conv.vc_type = CONV_NONE;
|
|
2653 spin.si_region = 1 << (i - 1);
|
|
2654
|
272
|
2655 vim_snprintf((char *)fname, sizeof(fname), "%s.aff", fnames[i]);
|
300
|
2656 if (mch_stat((char *)fname, &st) >= 0)
|
|
2657 {
|
|
2658 /* Read the .aff file. Will init "spin->si_conv" based on the
|
|
2659 * "SET" line. */
|
|
2660 afile[i - 1] = spell_read_aff(fname, &spin);
|
|
2661 if (afile[i - 1] == NULL)
|
|
2662 error = TRUE;
|
|
2663 else
|
|
2664 {
|
|
2665 /* Read the .dic file and store the words in the trees. */
|
|
2666 vim_snprintf((char *)fname, sizeof(fname), "%s.dic",
|
|
2667 fnames[i]);
|
|
2668 if (spell_read_dic(fname, &spin, afile[i - 1]) == FAIL)
|
|
2669 error = TRUE;
|
|
2670 }
|
|
2671 }
|
|
2672 else
|
|
2673 {
|
|
2674 /* No .aff file, try reading the file as a word list. Store
|
|
2675 * the words in the trees. */
|
|
2676 if (spell_read_wordfile(fnames[i], &spin) == FAIL)
|
|
2677 error = TRUE;
|
|
2678 }
|
236
|
2679
|
|
2680 /* Free any conversion stuff. */
|
300
|
2681 convert_setup(&spin.si_conv, NULL, NULL);
|
236
|
2682 }
|
|
2683
|
300
|
2684 if (!error)
|
236
|
2685 {
|
|
2686 /*
|
300
|
2687 * Remove the dummy NUL from the start of the tree root.
|
236
|
2688 */
|
300
|
2689 spin.si_foldroot = spin.si_foldroot->wn_sibling;
|
|
2690 spin.si_keeproot = spin.si_keeproot->wn_sibling;
|
236
|
2691
|
|
2692 /*
|
300
|
2693 * Combine tails in the tree.
|
236
|
2694 */
|
300
|
2695 MSG(_("Compressing word tree..."));
|
236
|
2696 out_flush();
|
300
|
2697 wordtree_compress(spin.si_foldroot);
|
|
2698 wordtree_compress(spin.si_keeproot);
|
236
|
2699 }
|
|
2700
|
300
|
2701 if (!error)
|
|
2702 {
|
|
2703 /*
|
|
2704 * Write the info in the spell file.
|
|
2705 */
|
|
2706 smsg((char_u *)_("Writing spell file %s..."), wfname);
|
|
2707 out_flush();
|
|
2708 write_vim_spell(wfname, &spin, fcount - 1, region_name);
|
|
2709 MSG(_("Done!"));
|
302
|
2710
|
|
2711 smsg((char_u *)_("Estimated runtime memory use: %d bytes"),
|
|
2712 spin.si_memtot);
|
300
|
2713 out_flush();
|
|
2714 }
|
|
2715
|
|
2716 /* Free the allocated memory. */
|
|
2717 free_blocks(spin.si_blocks);
|
|
2718
|
|
2719 /* Free the .aff file structures. */
|
236
|
2720 for (i = 1; i < fcount; ++i)
|
|
2721 if (afile[i - 1] != NULL)
|
|
2722 spell_free_aff(afile[i - 1]);
|
|
2723 }
|
|
2724
|
|
2725 theend:
|
|
2726 FreeWild(fcount, fnames);
|
|
2727 }
|
|
2728
|
|
2729 #endif /* FEAT_MBYTE */
|
|
2730
|
300
|
2731
|
236
|
2732 #endif /* FEAT_SYN_HL */
|