comparison src/spell.c @ 223:5175af353b81

updated for version 7.0062
author vimboss
date Mon, 21 Mar 2005 08:23:33 +0000
parents
children 4e7dca477fee
comparison
equal deleted inserted replaced
222:14ded4ba39cc 223:5175af353b81
1 /* vi:set ts=8 sts=4 sw=4:
2 *
3 * VIM - Vi IMproved by Bram Moolenaar
4 *
5 * Do ":help uganda" in Vim to read copying and usage conditions.
6 * Do ":help credits" in Vim to see a list of people who contributed.
7 * See README.txt for an overview of the Vim source code.
8 */
9
10 /*
11 * spell.c: code for spell checking
12 */
13
14 #if defined(MSDOS) || defined(WIN16) || defined(WIN32) || defined(_WIN64)
15 # include <io.h> /* for lseek(), must be before vim.h */
16 #endif
17
18 #include "vim.h"
19
20 #if defined(FEAT_SYN_HL) || defined(PROTO)
21
22 #ifdef HAVE_FCNTL_H
23 # include <fcntl.h>
24 #endif
25
26 /*
27 * Structure that is used to store the text from the language file. This
28 * avoids the need to allocate each individual word and copying it. It's
29 * allocated in big chunks for speed.
30 */
31 #define SBLOCKSIZE 4096 /* default size of sb_data */
32 typedef struct sblock_S sblock_T;
33 struct sblock_S
34 {
35 sblock_T *sb_next; /* next block in list */
36 char_u sb_data[1]; /* data, actually longer */
37 };
38
39 /*
40 * Structure used to store words and other info for one language.
41 */
42 typedef struct slang_S slang_T;
43
44 struct slang_S
45 {
46 slang_T *sl_next; /* next language */
47 char_u sl_name[2]; /* language name "en", "nl", etc. */
48 hashtab_T sl_ht; /* hashtable with all words */
49 garray_T sl_match; /* table with pointers to matches */
50 garray_T sl_add; /* table with pointers to additions */
51 char_u sl_regions[13]; /* table with up to 6 region names */
52 sblock_T *sl_block; /* list with allocated memory blocks */
53 };
54
55 static slang_T *first_lang = NULL;
56
57 /*
58 * Structure used in "b_langp", filled from 'spelllang'.
59 */
60 typedef struct langp_S
61 {
62 slang_T *lp_slang; /* info for this language (NULL for last one) */
63 int lp_region; /* bitmask for region or REGION_ALL */
64 } langp_T;
65
/*
 * Accessors for the items of a growarray.
 * LANGP_ENTRY(ga, i): pointer to langp_T number "i" in growarray "ga" ("ga"
 *                     is the garray_T itself, not a pointer to it).
 * MATCH_ENTRY(gap, i): the "char_u *" item number "i" in the growarray that
 *                     "gap" points to; can be used as an lvalue.
 * Both the arguments and the whole expansion are parenthesized, so that any
 * expression can be passed for the index.
 */
#define LANGP_ENTRY(ga, i)      (((langp_T *)(ga).ga_data) + (i))
#define MATCH_ENTRY(gap, i)     (*(((char_u **)(gap)->ga_data) + (i)))
68
/*
 * Flag byte stored just before a word in the hashtable and just before a
 * match.  The two highest bits tell whether the word is rare and whether it
 * is case-sensitive.  The six lowest bits form a mask of the regions in which
 * the word is valid; a word that is valid everywhere has all of them set
 * (REGION_ALL).
 */
#define REGION_MASK     0x3f            /* bits 0-5: the region bits */
#define REGION_ALL      REGION_MASK     /* valid in all regions */
#define CASE_MASK       (1 << 6)        /* 0x40: case-sensitive word */
#define RARE_MASK       (1 << 7)        /* 0x80: rare word */

/*
 * Classification of a checked word.
 */
#define SP_OK           0       /* good word */
#define SP_BAD          1       /* not found */
#define SP_RARE         2       /* found, but marked as rare */
#define SP_LOCAL        3       /* found, but not valid in this region */
85
86 static slang_T *spell_load_lang __ARGS((char_u *lang));
87 static void spell_load_file __ARGS((char_u *fname));
88 static int find_region __ARGS((char_u *rp, char_u *region));
89
90 /*
91 * Main spell-checking function.
92 * "ptr" points to the start of a word.
93 * "*attrp" is set to the attributes for a badly spelled word. For a non-word
94 * or when it's OK it remains unchanged.
95 * This must only be called when 'spelllang' is not empty.
96 * Returns the length of the word in bytes, also when it's OK, so that the
97 * caller can skip over the word.
98 */
99 int
100 spell_check(wp, ptr, attrp)
101 win_T *wp; /* current window */
102 char_u *ptr;
103 int *attrp;
104 {
105 char_u *e;
106 langp_T *lp;
107 int result;
108 int len = 0;
109 hash_T hash;
110 hashitem_T *hi;
111 int c;
112 #define MAXWLEN 80 /* assume max. word len is 80 */
113 char_u word[MAXWLEN + 1];
114 garray_T *gap;
115 int l, h, t;
116 char_u *p;
117 int n;
118
119 /* Find the end of the word. We already know that *ptr is a word char. */
120 e = ptr;
121 do
122 {
123 mb_ptr_adv(e);
124 ++len;
125 } while (*e != NUL && vim_iswordc_buf(e, wp->w_buffer));
126
127 /* The word is bad unless we find it in the dictionary. */
128 result = SP_BAD;
129
130 /* Words are always stored with folded case. */
131 (void)str_foldcase(ptr, e - ptr, word, MAXWLEN + 1);
132 hash = hash_hash(word);
133
134 /*
135 * Loop over the languages specified in 'spelllang'.
136 * We check them all, because a match may find a longer word.
137 */
138 for (lp = LANGP_ENTRY(wp->w_buffer->b_langp, 0); lp->lp_slang != NULL;
139 ++lp)
140 {
141 /* Check words when it wasn't recognized as a good word yet. */
142 if (result != SP_OK)
143 {
144 /* Word lookup. Using a hash table is fast. */
145 hi = hash_lookup(&lp->lp_slang->sl_ht, word, hash);
146 if (!HASHITEM_EMPTY(hi))
147 {
148 /* The character before the key indicates the type of word. */
149 c = hi->hi_key[-1];
150 if ((c & CASE_MASK) != 0)
151 {
152 /* Need to check first letter is uppercase. If it is,
153 * check region. If it isn't it may be a rare word. */
154 if (
155 #ifdef FEAT_MBYTE
156 MB_ISUPPER(mb_ptr2char(ptr))
157 #else
158 MB_ISUPPER(*ptr)
159 #endif
160 )
161 {
162 if ((c & lp->lp_region) == 0)
163 result = SP_LOCAL;
164 else
165 result = SP_OK;
166 }
167 else if (c & RARE_MASK)
168 result = SP_RARE;
169 }
170 else
171 {
172 if ((c & lp->lp_region) == 0)
173 result = SP_LOCAL;
174 else if (c & RARE_MASK)
175 result = SP_RARE;
176 else
177 result = SP_OK;
178 }
179 }
180 }
181
182 /* Match lookup. Uses a binary search. If there is a match adjust
183 * "e" to the end. This is also done when a word matched, because
184 * "you've" is longer than "you". */
185 gap = &lp->lp_slang->sl_match;
186 l = 0; /* low index */
187 h = gap->ga_len - 1; /* high index */
188 /* keep searching, the match must be between "l" and "h" (inclusive) */
189 while (h >= l)
190 {
191 t = (h + l) / 2;
192 p = MATCH_ENTRY(gap, t) + 1;
193 for (n = 0; p[n] != 0 && p[n] == ptr[n]; ++n)
194 ;
195 if (p[n] == 0)
196 {
197 if ((ptr[n] == 0 || !vim_iswordc_buf(ptr + n, wp->w_buffer)))
198 {
199 /* match! */
200 e = ptr + n;
201 if (result != SP_OK)
202 {
203 if ((lp->lp_region & p[-1]) == 0)
204 result = SP_LOCAL;
205 else
206 result = SP_OK;
207 }
208 break;
209 }
210 /* match is too short, next item is new low index */
211 l = t + 1;
212 }
213 else if (p[n] < ptr[n])
214 /* match is before word, next item is new low index */
215 l = t + 1;
216 else
217 /* match is after word, previous item is new high index */
218 h = t - 1;
219 }
220
221 /* Addition lookup. Uses a linear search, there should be very few.
222 * If there is a match adjust "e" to the end. This doesn't change
223 * whether a word was good or bad, only the length. */
224 gap = &lp->lp_slang->sl_add;
225 for (t = 0; t < gap->ga_len; ++t)
226 {
227 p = MATCH_ENTRY(gap, t) + 1;
228 for (n = 0; p[n] != 0 && p[n] == e[n]; ++n)
229 ;
230 if (p[n] == 0
231 && (e[n] == 0 || !vim_iswordc_buf(e + n, wp->w_buffer)))
232 {
233 /* match */
234 e += n;
235 break;
236 }
237 }
238 }
239
240 if (result != SP_OK)
241 {
242 if (result == SP_BAD)
243 *attrp = highlight_attr[HLF_SPB];
244 else if (result == SP_RARE)
245 *attrp = highlight_attr[HLF_SPR];
246 else
247 *attrp = highlight_attr[HLF_SPL];
248 }
249
250 return (int)(e - ptr);
251 }
252
253 static slang_T *load_lp; /* passed from spell_load_lang() to
254 spell_load_file() */
255
256 /*
257 * Load language "lang[2]".
258 */
259 static slang_T *
260 spell_load_lang(lang)
261 char_u *lang;
262 {
263 slang_T *lp;
264 char_u fname_enc[80];
265 char_u fname_ascii[20];
266 char_u *p;
267
268 lp = (slang_T *)alloc(sizeof(slang_T));
269 if (lp != NULL)
270 {
271 lp->sl_name[0] = lang[0];
272 lp->sl_name[1] = lang[1];
273 hash_init(&lp->sl_ht);
274 ga_init2(&lp->sl_match, sizeof(char_u *), 20);
275 ga_init2(&lp->sl_add, sizeof(char_u *), 4);
276 lp->sl_regions[0] = NUL;
277 lp->sl_block = NULL;
278
279 /* Find all spell files for "lang" in 'runtimepath' and load them.
280 * Use 'encoding', except that we use "latin1" for "latin9". */
281 #ifdef FEAT_MBYTE
282 if (STRLEN(p_enc) < 60 && STRCMP(p_enc, "iso-8859-15") != 0)
283 p = p_enc;
284 else
285 #endif
286 p = (char_u *)"latin1";
287 load_lp = lp;
288 sprintf((char *)fname_enc, "spell/%c%c.%s.spl", lang[0], lang[1], p);
289 if (do_in_runtimepath(fname_enc, TRUE, spell_load_file) == FAIL)
290 {
291 /* Try again to find an ASCII spell file. */
292 sprintf((char *)fname_ascii, "spell/%c%c.spl", lang[0], lang[1]);
293 if (do_in_runtimepath(fname_ascii, TRUE, spell_load_file) == FAIL)
294 {
295 vim_free(lp);
296 lp = NULL;
297 smsg((char_u *)_("Warning: Cannot find dictionary \"%s\""),
298 fname_enc + 6);
299 }
300 }
301 else
302 {
303 lp->sl_next = first_lang;
304 first_lang = lp;
305 }
306 }
307
308 return lp;
309 }
310
311 /*
312 * Load one spell file into "load_lp".
313 * Invoked through do_in_runtimepath().
314 */
315 static void
316 spell_load_file(fname)
317 char_u *fname;
318 {
319 int fd;
320 size_t len;
321 size_t l;
322 size_t rest = 0;
323 char_u *p = NULL, *np;
324 sblock_T *bl;
325 hash_T hash;
326 hashitem_T *hi;
327 int c;
328 int region = REGION_ALL;
329 char_u word[MAXWLEN + 1];
330 int n;
331
332 fd = mch_open((char *)fname, O_RDONLY | O_EXTRA, 0);
333 if (fd < 0)
334 {
335 EMSG2(_(e_notopen), fname);
336 return;
337 }
338
339 /* Get the length of the whole file. */
340 len = lseek(fd, (off_t)0, SEEK_END);
341 lseek(fd, (off_t)0, SEEK_SET);
342
343 /* Loop, reading the file one block at a time.
344 * "rest" is the length of an incomplete line at the previous block.
345 * "p" points to the remainder. */
346 while (len > 0)
347 {
348 /* Allocate a block of memory to store the info in. This is not freed
349 * until spell_reload() is called. */
350 if (len > SBLOCKSIZE)
351 l = SBLOCKSIZE;
352 else
353 l = len;
354 len -= l;
355 bl = (sblock_T *)alloc((unsigned)(sizeof(sblock_T) - 1 + l + rest));
356 if (bl == NULL)
357 break;
358 bl->sb_next = load_lp->sl_block;
359 load_lp->sl_block = bl;
360
361 /* Read a block from the file. Prepend the remainder of the previous
362 * block. */
363 if (rest > 0)
364 mch_memmove(bl->sb_data, p, rest);
365 if (read(fd, bl->sb_data + rest, l) != l)
366 {
367 EMSG2(_(e_notread), fname);
368 break;
369 }
370 l += rest;
371 rest = 0;
372
373 /* Deal with each line that was read until we finish the block. */
374 for (p = bl->sb_data; l > 0; p = np)
375 {
376 /* "np" points to the char after the line (CR or NL). */
377 for (np = p; l > 0 && *np >= ' '; ++np)
378 --l;
379 if (l == 0)
380 {
381 /* Incomplete line (or end of file). */
382 rest = np - p;
383 if (len == 0)
384 EMSG2(_("E751: Truncated spell file: %s"), fname);
385 break;
386 }
387 *np = NUL; /* terminate the line with a NUL */
388
389 /* Skip comment and empty lines. */
390 c = *p;
391 if (c != '#' && np > p)
392 {
393 if (c == '=' || c == '+')
394 {
395 garray_T *gap;
396
397 /* Match or Add item. */
398 if (c == '=')
399 gap = &load_lp->sl_match;
400 else
401 gap = &load_lp->sl_add;
402
403 if (ga_grow(gap, 1) == OK)
404 {
405 for (n = 0; n < gap->ga_len; ++n)
406 if ((c = STRCMP(p + 1,
407 MATCH_ENTRY(gap, n) + 1)) < 0)
408 break;
409 if (c == 0)
410 {
411 if (p_verbose > 0)
412 smsg((char_u *)_("Warning: duplicate match \"%s\" in %s"),
413 p + 1, fname);
414 }
415 else
416 {
417 mch_memmove((char_u **)gap->ga_data + n + 1,
418 (char_u **)gap->ga_data + n,
419 (gap->ga_len - n) * sizeof(char_u *));
420 *(((char_u **)gap->ga_data) + n) = p;
421 *p = region;
422 ++gap->ga_len;
423 }
424 }
425 }
426 else if (c == '-')
427 {
428 /* region item */
429 ++p;
430 if (*p == '-')
431 /* end of a region */
432 region = REGION_ALL;
433 else
434 {
435 char_u *rp = load_lp->sl_regions;
436 int r;
437
438 /* The region may be repeated: "-ca-uk". Fill
439 * "region" with the bit mask for the ones we find. */
440 region = 0;
441 for (;;)
442 {
443 /* start of a region */
444 r = find_region(rp, p);
445 if (r == REGION_ALL)
446 {
447 /* new region, add it */
448 r = STRLEN(rp);
449 if (r >= 12)
450 {
451 EMSG2(_("E752: Too many regions in %s"),
452 fname);
453 r = REGION_ALL;
454 }
455 else
456 {
457 rp[r] = p[0];
458 rp[r + 1] = p[1];
459 rp[r + 2] = NUL;
460 r = 1 << (r / 2);
461 }
462 }
463 else
464 r = 1 << r;
465
466 region |= r;
467 if (p[2] != '-')
468 {
469 if (p[2] != NUL)
470 EMSG2(_("E753: Invalid character in \"%s\""),
471 p - 1);
472 break;
473 }
474 p += 3;
475 }
476 }
477 }
478 else
479 {
480 /* add the word */
481 if (c == '>')
482 c = region | RARE_MASK;
483 else
484 {
485 if (c != ' ')
486 EMSG2(_("E753: Invalid character in \"%s\""), p);
487 c = region;
488 }
489 #ifdef FEAT_MBYTE
490 if (MB_ISUPPER(mb_ptr2char(p + 1)))
491 #else
492 if (MB_ISUPPER(p[1]))
493 #endif
494 c |= CASE_MASK;
495 *p++ = c;
496 (void)str_foldcase(p, np - p, word, MAXWLEN + 1);
497 n = STRLEN(word);
498 if (n > np - p)
499 {
500 sblock_T *s;
501
502 /* Folding case made word longer! We need to allocate
503 * memory for it. */
504 s = (sblock_T *)alloc((unsigned)sizeof(sblock_T)
505 + n + 1);
506 if (s != NULL)
507 {
508 s->sb_next = load_lp->sl_block;
509 load_lp->sl_block = s;
510 s->sb_data[0] = p[-1];
511 p = s->sb_data + 1;
512 }
513 }
514 mch_memmove(p, word, n + 1);
515
516 hash = hash_hash(p);
517 hi = hash_lookup(&load_lp->sl_ht, p, hash);
518 if (!HASHITEM_EMPTY(hi))
519 {
520 c = hi->hi_key[-1];
521 if ((c & (CASE_MASK | RARE_MASK))
522 == (p[-1] & (CASE_MASK | RARE_MASK)))
523 {
524 if (p_verbose > 0)
525 smsg((char_u *)_("Warning: duplicate word \"%s\" in %s"),
526 p, fname);
527 }
528 else
529 hi->hi_key[-1] |= (p[-1] & (CASE_MASK | RARE_MASK));
530 }
531 else
532 hash_add_item(&load_lp->sl_ht, hi, p, hash);
533 }
534 }
535
536 while (l > 0 && *np < ' ')
537 {
538 ++np;
539 --l;
540 }
541 }
542 }
543
544 close(fd);
545 }
546
547 /*
548 * Parse 'spelllang' and set buf->b_langp accordingly.
549 * Returns an error message or NULL.
550 */
551 char_u *
552 did_set_spelllang(buf)
553 buf_T *buf;
554 {
555 garray_T ga;
556 char_u *lang;
557 char_u *e;
558 char_u *region;
559 int region_mask;
560 slang_T *lp;
561 int c;
562
563 ga_init2(&ga, sizeof(langp_T), 2);
564
565 /* loop over comma separated languages. */
566 for (lang = buf->b_p_spl; *lang != NUL; lang = e)
567 {
568 e = vim_strchr(lang, ',');
569 if (e == NULL)
570 e = lang + STRLEN(lang);
571 if (e > lang + 2)
572 {
573 if (lang[2] != '_' || e - lang != 5)
574 {
575 ga_clear(&ga);
576 return e_invarg;
577 }
578 region = lang + 3;
579 }
580 else
581 region = NULL;
582
583 for (lp = first_lang; lp != NULL; lp = lp->sl_next)
584 if (STRNICMP(lp->sl_name, lang, 2) == 0)
585 break;
586
587 if (lp == NULL)
588 /* Not found, load the language. */
589 lp = spell_load_lang(lang);
590
591 if (lp != NULL)
592 {
593 if (region == NULL)
594 region_mask = REGION_ALL;
595 else
596 {
597 /* find region in sl_regions */
598 c = find_region(lp->sl_regions, region);
599 if (c == REGION_ALL)
600 {
601 c = lang[5];
602 lang[5] = NUL;
603 smsg((char_u *)_("Warning: region %s not supported"), lang);
604 lang[5] = c;
605 region_mask = REGION_ALL;
606 }
607 else
608 region_mask = 1 << c;
609 }
610
611 if (ga_grow(&ga, 1) == FAIL)
612 {
613 ga_clear(&ga);
614 return e_outofmem;
615 }
616 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = lp;
617 LANGP_ENTRY(ga, ga.ga_len)->lp_region = region_mask;
618 ++ga.ga_len;
619 }
620
621 if (*e == ',')
622 ++e;
623 }
624
625 /* Add a NULL entry to mark the end of the list. */
626 if (ga_grow(&ga, 1) == FAIL)
627 {
628 ga_clear(&ga);
629 return e_outofmem;
630 }
631 LANGP_ENTRY(ga, ga.ga_len)->lp_slang = NULL;
632 ++ga.ga_len;
633
634 /* Everything is fine, store the new b_langp value. */
635 ga_clear(&buf->b_langp);
636 buf->b_langp = ga;
637
638 return NULL;
639 }
640
641 /*
642 * Find the region "region[2]" in "rp" (points to "sl_regions").
643 * Each region is simply stored as the two characters of it's name.
644 * Returns the index if found, REGION_ALL if not found.
645 */
646 static int
647 find_region(rp, region)
648 char_u *rp;
649 char_u *region;
650 {
651 int i;
652
653 for (i = 0; ; i += 2)
654 {
655 if (rp[i] == NUL)
656 return REGION_ALL;
657 if (rp[i] == region[0] && rp[i + 1] == region[1])
658 break;
659 }
660 return i / 2;
661 }
662
663 # if defined(FEAT_MBYTE) || defined(PROTO)
664 /*
665 * Clear all spelling tables and reload them.
666 * Used after 'encoding' is set.
667 */
668 void
669 spell_reload()
670 {
671 buf_T *buf;
672 slang_T *lp;
673 sblock_T *sp;
674
675 /* Unload all allocated memory. */
676 while (first_lang != NULL)
677 {
678 lp = first_lang;
679 first_lang = lp->sl_next;
680
681 hash_clear(&lp->sl_ht);
682 ga_clear(&lp->sl_match);
683 ga_clear(&lp->sl_add);
684 while (lp->sl_block != NULL)
685 {
686 sp = lp->sl_block;
687 lp->sl_block = sp->sb_next;
688 vim_free(sp);
689 }
690 }
691
692 /* Go through all buffers and handle 'spelllang'. */
693 for (buf = firstbuf; buf != NULL; buf = buf->b_next)
694 {
695 ga_clear(&buf->b_langp);
696 if (*buf->b_p_spl != NUL)
697 did_set_spelllang(buf);
698 }
699 }
700 # endif
701
702 #endif /* FEAT_SYN_HL */