# HG changeset patch
# User vimboss
# Date 1111393413 0
# Node ID 5175af353b81f0c7c34392f7c28da49c84fc12a2
# Parent 14ded4ba39cce02e3bac086a08a63e54d37560d2
updated for version 7.0062

diff --git a/src/spell.c b/src/spell.c
new file mode 100644
--- /dev/null
+++ b/src/spell.c
@@ -0,0 +1,702 @@
+/* vi:set ts=8 sts=4 sw=4:
+ *
+ * VIM - Vi IMproved by Bram Moolenaar
+ *
+ * Do ":help uganda" in Vim to read copying and usage conditions.
+ * Do ":help credits" in Vim to see a list of people who contributed.
+ * See README.txt for an overview of the Vim source code.
+ */
+
+/*
+ * spell.c: code for spell checking
+ */
+
+#if defined(MSDOS) || defined(WIN16) || defined(WIN32) || defined(_WIN64)
+# include <io.h>        /* for lseek(), must be before vim.h */
+#endif
+
+#include "vim.h"
+
+#if defined(FEAT_SYN_HL) || defined(PROTO)
+
+#ifdef HAVE_FCNTL_H
+# include <fcntl.h>
+#endif
+
+/*
+ * Structure that is used to store the text from the language file.  This
+ * avoids the need to allocate each individual word and copying it.  It's
+ * allocated in big chunks for speed.
+ */
+#define SBLOCKSIZE 4096         /* default size of sb_data */
+typedef struct sblock_S sblock_T;
+struct sblock_S
+{
+    sblock_T    *sb_next;       /* next block in list */
+    char_u      sb_data[1];     /* data, actually longer: the block is
+                                 * allocated with extra room after it */
+};
+
+/*
+ * Structure used to store words and other info for one language.
+ */
+typedef struct slang_S slang_T;
+
+struct slang_S
+{
+    slang_T     *sl_next;       /* next language */
+    char_u      sl_name[2];     /* language name "en", "nl", etc.; exactly
+                                 * two bytes, not NUL terminated */
+    hashtab_T   sl_ht;          /* hashtable with all words */
+    garray_T    sl_match;       /* table with pointers to matches */
+    garray_T    sl_add;         /* table with pointers to additions */
+    char_u      sl_regions[13]; /* table with up to 6 region names, two
+                                 * bytes each, followed by a NUL */
+    sblock_T    *sl_block;      /* list with allocated memory blocks */
+};
+
+/* Head of the list of loaded languages, linked through "sl_next". */
+static slang_T *first_lang = NULL;
+
+/*
+ * Structure used in "b_langp", filled from 'spelllang'.
+ */
+typedef struct langp_S
+{
+    slang_T     *lp_slang;      /* info for this language (NULL for last one) */
+    int         lp_region;      /* bitmask for region or REGION_ALL */
+} langp_T;
+
+/* Entry "i" of a growarray of langp_T ("ga" is the garray_T itself). */
+#define LANGP_ENTRY(ga, i)      (((langp_T *)(ga).ga_data) + (i))
+/* Entry "i" of a growarray of char_u pointers ("gap" points to a garray_T). */
+#define MATCH_ENTRY(gap, i)     (*(((char_u **)(gap)->ga_data) + (i)))
+
+/*
+ * The byte before a word in the hashtable indicates the type of word.
+ * Also used for the byte just before a match.
+ * The top two bits are used to indicate rare and case-sensitive words.
+ * The lower bits are used to indicate the region in which the word is valid.
+ * Words valid in all regions use REGION_ALL.
+ */
+#define REGION_MASK     0x3f
+#define REGION_ALL      0x3f
+#define CASE_MASK       0x40
+#define RARE_MASK       0x80
+
+/* Result of checking one word, see spell_check(). */
+#define SP_OK           0
+#define SP_BAD          1
+#define SP_RARE         2
+#define SP_LOCAL        3
+
+static slang_T *spell_load_lang __ARGS((char_u *lang));
+static void spell_load_file __ARGS((char_u *fname));
+static int find_region __ARGS((char_u *rp, char_u *region));
+
+/*
+ * Main spell-checking function.
+ * "ptr" points to the start of a word.
+ * "*attrp" is set to the attributes for a badly spelled word: HLF_SPB for a
+ * bad word, HLF_SPR for a rare word and HLF_SPL for a word that is only valid
+ * in another region.  For a non-word or when it's OK it remains unchanged.
+ * This must only be called when 'spelllang' is not empty.
+ * Returns the length of the word in bytes, also when it's OK, so that the
+ * caller can skip over the word.
+ */
+    int
+spell_check(wp, ptr, attrp)
+    win_T       *wp;            /* current window */
+    char_u      *ptr;
+    int         *attrp;
+{
+    char_u      *e;
+    langp_T     *lp;
+    int         result;
+    hash_T      hash;
+    hashitem_T  *hi;
+    int         c;
+#define MAXWLEN 80              /* assume max. word len is 80 */
+    char_u      word[MAXWLEN + 1];
+    garray_T    *gap;
+    int         l, h, t;
+    char_u      *p;
+    int         n;
+
+    /* Find the end of the word.  We already know that *ptr is a word char. */
+    e = ptr;
+    do
+    {
+        mb_ptr_adv(e);
+    } while (*e != NUL && vim_iswordc_buf(e, wp->w_buffer));
+
+    /* The word is bad unless we find it in the dictionary. */
+    result = SP_BAD;
+
+    /* Words are always stored with folded case. */
+    (void)str_foldcase(ptr, e - ptr, word, MAXWLEN + 1);
+    hash = hash_hash(word);
+
+    /*
+     * Loop over the languages specified in 'spelllang'.
+     * We check them all, because a match may find a longer word.
+     */
+    for (lp = LANGP_ENTRY(wp->w_buffer->b_langp, 0); lp->lp_slang != NULL;
+                                                                        ++lp)
+    {
+        /* Check words when it wasn't recognized as a good word yet. */
+        if (result != SP_OK)
+        {
+            /* Word lookup.  Using a hash table is fast. */
+            hi = hash_lookup(&lp->lp_slang->sl_ht, word, hash);
+            if (!HASHITEM_EMPTY(hi))
+            {
+                /* The character before the key indicates the type of word. */
+                c = hi->hi_key[-1];
+                if ((c & CASE_MASK) != 0)
+                {
+                    /* Need to check first letter is uppercase.  If it is,
+                     * check region.  If it isn't it may be a rare word. */
+                    if (
+#ifdef FEAT_MBYTE
+                            MB_ISUPPER(mb_ptr2char(ptr))
+#else
+                            MB_ISUPPER(*ptr)
+#endif
+                       )
+                    {
+                        if ((c & lp->lp_region) == 0)
+                            result = SP_LOCAL;
+                        else
+                            result = SP_OK;
+                    }
+                    else if (c & RARE_MASK)
+                        result = SP_RARE;
+                }
+                else
+                {
+                    if ((c & lp->lp_region) == 0)
+                        result = SP_LOCAL;
+                    else if (c & RARE_MASK)
+                        result = SP_RARE;
+                    else
+                        result = SP_OK;
+                }
+            }
+        }
+
+        /* Match lookup.  Uses a binary search.  If there is a match adjust
+         * "e" to the end.  This is also done when a word matched, because
+         * "you've" is longer than "you". */
+        gap = &lp->lp_slang->sl_match;
+        l = 0;                  /* low index */
+        h = gap->ga_len - 1;    /* high index */
+        /* keep searching, the match must be between "l" and "h" (inclusive) */
+        while (h >= l)
+        {
+            t = (h + l) / 2;
+            /* Skip the type byte that is stored in front of the match. */
+            p = MATCH_ENTRY(gap, t) + 1;
+            for (n = 0; p[n] != 0 && p[n] == ptr[n]; ++n)
+                ;
+            if (p[n] == 0)
+            {
+                if ((ptr[n] == 0 || !vim_iswordc_buf(ptr + n, wp->w_buffer)))
+                {
+                    /* match! */
+                    e = ptr + n;
+                    if (result != SP_OK)
+                    {
+                        if ((lp->lp_region & p[-1]) == 0)
+                            result = SP_LOCAL;
+                        else
+                            result = SP_OK;
+                    }
+                    break;
+                }
+                /* match is too short, next item is new low index */
+                l = t + 1;
+            }
+            else if (p[n] < ptr[n])
+                /* match is before word, next item is new low index */
+                l = t + 1;
+            else
+                /* match is after word, previous item is new high index */
+                h = t - 1;
+        }
+
+        /* Addition lookup.  Uses a linear search, there should be very few.
+         * If there is a match adjust "e" to the end.  This doesn't change
+         * whether a word was good or bad, only the length. */
+        gap = &lp->lp_slang->sl_add;
+        for (t = 0; t < gap->ga_len; ++t)
+        {
+            p = MATCH_ENTRY(gap, t) + 1;
+            for (n = 0; p[n] != 0 && p[n] == e[n]; ++n)
+                ;
+            if (p[n] == 0
+                    && (e[n] == 0 || !vim_iswordc_buf(e + n, wp->w_buffer)))
+            {
+                /* match */
+                e += n;
+                break;
+            }
+        }
+    }
+
+    if (result != SP_OK)
+    {
+        if (result == SP_BAD)
+            *attrp = highlight_attr[HLF_SPB];
+        else if (result == SP_RARE)
+            *attrp = highlight_attr[HLF_SPR];
+        else
+            *attrp = highlight_attr[HLF_SPL];
+    }
+
+    return (int)(e - ptr);
+}
+
+static slang_T *load_lp;        /* passed from spell_load_lang() to
+                                   spell_load_file() */
+
+/*
+ * Load language "lang[2]": only the first two bytes of "lang" are used.
+ * The spell file for 'encoding' is searched for in 'runtimepath' first, when
+ * it can't be found the plain "spell/xx.spl" file is tried.
+ * When a file was found the language is added to the list of loaded
+ * languages.
+ * Returns a pointer to the language, NULL when out of memory or when no
+ * spell file was found (a warning is given in the latter case).
+ */
+    static slang_T *
+spell_load_lang(lang)
+    char_u      *lang;
+{
+    slang_T     *lp;
+    char_u      fname_enc[80];
+    char_u      fname_ascii[20];
+    char_u      *p;
+
+    lp = (slang_T *)alloc(sizeof(slang_T));
+    if (lp != NULL)
+    {
+        lp->sl_name[0] = lang[0];
+        lp->sl_name[1] = lang[1];
+        hash_init(&lp->sl_ht);
+        ga_init2(&lp->sl_match, sizeof(char_u *), 20);
+        ga_init2(&lp->sl_add, sizeof(char_u *), 4);
+        lp->sl_regions[0] = NUL;
+        lp->sl_block = NULL;
+
+        /* Find all spell files for "lang" in 'runtimepath' and load them.
+         * Use 'encoding', except that we use "latin1" for "latin9".
+         * The length check makes sure the name fits in "fname_enc". */
+#ifdef FEAT_MBYTE
+        if (STRLEN(p_enc) < 60 && STRCMP(p_enc, "iso-8859-15") != 0)
+            p = p_enc;
+        else
+#endif
+            p = (char_u *)"latin1";
+        load_lp = lp;
+        sprintf((char *)fname_enc, "spell/%c%c.%s.spl", lang[0], lang[1], p);
+        if (do_in_runtimepath(fname_enc, TRUE, spell_load_file) == FAIL)
+        {
+            /* Try again to find an ASCII spell file. */
+            sprintf((char *)fname_ascii, "spell/%c%c.spl", lang[0], lang[1]);
+            if (do_in_runtimepath(fname_ascii, TRUE, spell_load_file) == FAIL)
+            {
+                vim_free(lp);
+                lp = NULL;
+                /* Skip the "spell/" part in the message. */
+                smsg((char_u *)_("Warning: Cannot find dictionary \"%s\""),
+                                                               fname_enc + 6);
+            }
+        }
+
+        /* Remember the language when either file was loaded.  This must
+         * also be done for the ASCII file, otherwise the language would be
+         * loaded again every time it is used and never be freed. */
+        if (lp != NULL)
+        {
+            lp->sl_next = first_lang;
+            first_lang = lp;
+        }
+    }
+
+    return lp;
+}
+
+/*
+ * Load one spell file into "load_lp".
+ * Invoked through do_in_runtimepath().
+ * The text of the file is kept in memory blocks that are linked from
+ * "load_lp->sl_block"; the hashtable and the match/addition tables point
+ * into these blocks.
+ * The first character of each line tells what the line is:
+ *   '#'        comment
+ *   '='        match item
+ *   '+'        addition item
+ *   '-'        region item: "--" ends a region, "-xx" or "-xx-yy" starts one
+ *   '>'        rare word
+ *   ' '        normal word
+ * Errors are reported with an error message, loading stops at a read error.
+ */
+    static void
+spell_load_file(fname)
+    char_u      *fname;
+{
+    int         fd;
+    off_t       flen;
+    size_t      len;
+    size_t      l;
+    size_t      rest = 0;
+    char_u      *p = NULL, *np;
+    sblock_T    *bl;
+    hash_T      hash;
+    hashitem_T  *hi;
+    int         c;
+    int         region = REGION_ALL;
+    char_u      word[MAXWLEN + 1];
+    int         n;
+
+    fd = mch_open((char *)fname, O_RDONLY | O_EXTRA, 0);
+    if (fd < 0)
+    {
+        EMSG2(_(e_notopen), fname);
+        return;
+    }
+
+    /* Get the length of the whole file.  Check for failure before storing
+     * the result in an unsigned variable. */
+    flen = lseek(fd, (off_t)0, SEEK_END);
+    if (flen < 0 || lseek(fd, (off_t)0, SEEK_SET) != 0)
+    {
+        EMSG2(_(e_notread), fname);
+        close(fd);
+        return;
+    }
+    len = (size_t)flen;
+
+    /* Loop, reading the file one block at a time.
+     * "rest" is the length of an incomplete line at the previous block.
+     * "p" points to the remainder. */
+    while (len > 0)
+    {
+        /* Allocate a block of memory to store the info in.  This is not freed
+         * until spell_reload() is called. */
+        if (len > SBLOCKSIZE)
+            l = SBLOCKSIZE;
+        else
+            l = len;
+        len -= l;
+        bl = (sblock_T *)alloc((unsigned)(sizeof(sblock_T) - 1 + l + rest));
+        if (bl == NULL)
+            break;
+        bl->sb_next = load_lp->sl_block;
+        load_lp->sl_block = bl;
+
+        /* Read a block from the file.  Prepend the remainder of the previous
+         * block. */
+        if (rest > 0)
+            mch_memmove(bl->sb_data, p, rest);
+        if (read(fd, bl->sb_data + rest, l) != (int)l)
+        {
+            EMSG2(_(e_notread), fname);
+            break;
+        }
+        l += rest;
+        rest = 0;
+
+        /* Deal with each line that was read until we finish the block. */
+        for (p = bl->sb_data; l > 0; p = np)
+        {
+            /* "np" points to the char after the line (CR or NL). */
+            for (np = p; l > 0 && *np >= ' '; ++np)
+                --l;
+            if (l == 0)
+            {
+                /* Incomplete line (or end of file). */
+                rest = np - p;
+                if (len == 0)
+                    EMSG2(_("E751: Truncated spell file: %s"), fname);
+                break;
+            }
+            *np = NUL;          /* terminate the line with a NUL */
+
+            /* Skip comment and empty lines. */
+            c = *p;
+            if (c != '#' && np > p)
+            {
+                if (c == '=' || c == '+')
+                {
+                    garray_T    *gap;
+
+                    /* Match or Add item. */
+                    if (c == '=')
+                        gap = &load_lp->sl_match;
+                    else
+                        gap = &load_lp->sl_add;
+
+                    if (ga_grow(gap, 1) == OK)
+                    {
+                        /* The table is sorted.  Find the first entry that
+                         * is not before the new item; stop at an equal entry
+                         * as well, so that a duplicate is noticed wherever
+                         * it is in the table.  When the table is empty "c"
+                         * still is '=' or '+', thus not zero. */
+                        for (n = 0; n < gap->ga_len; ++n)
+                            if ((c = STRCMP(p + 1,
+                                              MATCH_ENTRY(gap, n) + 1)) <= 0)
+                                break;
+                        if (c == 0)
+                        {
+                            if (p_verbose > 0)
+                                smsg((char_u *)_("Warning: duplicate match \"%s\" in %s"),
+                                                                p + 1, fname);
+                        }
+                        else
+                        {
+                            /* Make room at index "n" and store the item.
+                             * The first byte is replaced by the region. */
+                            mch_memmove((char_u **)gap->ga_data + n + 1,
+                                        (char_u **)gap->ga_data + n,
+                                    (gap->ga_len - n) * sizeof(char_u *));
+                            *(((char_u **)gap->ga_data) + n) = p;
+                            *p = region;
+                            ++gap->ga_len;
+                        }
+                    }
+                }
+                else if (c == '-')
+                {
+                    /* region item */
+                    ++p;
+                    if (*p == '-')
+                        /* end of a region */
+                        region = REGION_ALL;
+                    else
+                    {
+                        char_u  *rp = load_lp->sl_regions;
+                        int     r;
+
+                        /* The region may be repeated: "-ca-uk".  Fill
+                         * "region" with the bit mask for the ones we find. */
+                        region = 0;
+                        for (;;)
+                        {
+                            /* A region name is two characters.  Don't read
+                             * past the NUL at the end of the line when the
+                             * name is missing or too short. */
+                            if (p[0] == NUL || p[1] == NUL)
+                            {
+                                EMSG2(_("E753: Invalid character in \"%s\""),
+                                                                       p - 1);
+                                break;
+                            }
+
+                            /* start of a region */
+                            r = find_region(rp, p);
+                            if (r == REGION_ALL)
+                            {
+                                /* new region, add it */
+                                r = STRLEN(rp);
+                                if (r >= 12)
+                                {
+                                    EMSG2(_("E752: Too many regions in %s"),
+                                                                       fname);
+                                    r = REGION_ALL;
+                                }
+                                else
+                                {
+                                    rp[r] = p[0];
+                                    rp[r + 1] = p[1];
+                                    rp[r + 2] = NUL;
+                                    r = 1 << (r / 2);
+                                }
+                            }
+                            else
+                                r = 1 << r;
+
+                            region |= r;
+                            if (p[2] != '-')
+                            {
+                                if (p[2] != NUL)
+                                    EMSG2(_("E753: Invalid character in \"%s\""),
+                                                                       p - 1);
+                                break;
+                            }
+                            p += 3;
+                        }
+
+                        /* No usable region name at all: treat the following
+                         * words as valid everywhere instead of nowhere. */
+                        if (region == 0)
+                            region = REGION_ALL;
+                    }
+                }
+                else
+                {
+                    char_u      *key;   /* where the folded word is stored */
+
+                    /* add the word */
+                    if (c == '>')
+                        c = region | RARE_MASK;
+                    else
+                    {
+                        if (c != ' ')
+                            EMSG2(_("E753: Invalid character in \"%s\""), p);
+                        c = region;
+                    }
+#ifdef FEAT_MBYTE
+                    if (MB_ISUPPER(mb_ptr2char(p + 1)))
+#else
+                    if (MB_ISUPPER(p[1]))
+#endif
+                        c |= CASE_MASK;
+                    /* The first byte of the line becomes the type byte. */
+                    *p++ = c;
+                    (void)str_foldcase(p, np - p, word, MAXWLEN + 1);
+                    n = STRLEN(word);
+                    key = p;
+                    if (n > np - p)
+                    {
+                        sblock_T *s;
+
+                        /* Folding case made word longer!  We need to allocate
+                         * memory for it. */
+                        s = (sblock_T *)alloc((unsigned)sizeof(sblock_T)
+                                                                     + n + 1);
+                        if (s == NULL)
+                            /* Out of memory: skip the word, it does not fit
+                             * in the space of the line. */
+                            key = NULL;
+                        else
+                        {
+                            s->sb_next = load_lp->sl_block;
+                            load_lp->sl_block = s;
+                            s->sb_data[0] = p[-1];
+                            key = s->sb_data + 1;
+                        }
+                    }
+
+                    if (key != NULL)
+                    {
+                        mch_memmove(key, word, n + 1);
+
+                        hash = hash_hash(key);
+                        hi = hash_lookup(&load_lp->sl_ht, key, hash);
+                        if (!HASHITEM_EMPTY(hi))
+                        {
+                            /* The word already exists: it is a duplicate
+                             * when the case and rare flags are the same,
+                             * otherwise add the flags to the existing
+                             * word. */
+                            c = hi->hi_key[-1];
+                            if ((c & (CASE_MASK | RARE_MASK))
+                                    == (key[-1] & (CASE_MASK | RARE_MASK)))
+                            {
+                                if (p_verbose > 0)
+                                    smsg((char_u *)_("Warning: duplicate word \"%s\" in %s"),
+                                                                  key, fname);
+                            }
+                            else
+                                hi->hi_key[-1] |= (key[-1]
+                                                  & (CASE_MASK | RARE_MASK));
+                        }
+                        else
+                            hash_add_item(&load_lp->sl_ht, hi, key, hash);
+                    }
+                }
+            }
+
+            /* Skip over the line terminator(s). */
+            while (l > 0 && *np < ' ')
+            {
+                ++np;
+                --l;
+            }
+        }
+    }
+
+    close(fd);
+}
+
+/*
+ * Parse 'spelllang' and set buf->b_langp accordingly.
+ * Returns an error message or NULL.
+ */
+    char_u *
+did_set_spelllang(buf)
+    buf_T       *buf;
+{
+    garray_T    ga;
+    char_u      *lang;
+    char_u      *e;
+    char_u      *region;
+    int         region_mask;
+    slang_T     *lp;
+    int         c;
+
+    /*
+     * Each item in the comma separated list must be a two-letter language
+     * name "xx", optionally followed by a region "xx_yy".  Anything else
+     * results in e_invarg.  A language that can't be loaded is skipped.
+     * The new list is built in "ga" and only stored in the buffer when
+     * everything worked, "buf->b_langp" is unchanged when an error is
+     * returned.
+     */
+    ga_init2(&ga, sizeof(langp_T), 2);
+
+    /* loop over comma separated languages. */
+    for (lang = buf->b_p_spl; *lang != NUL; lang = e)
+    {
+        e = vim_strchr(lang, ',');
+        if (e == NULL)
+            e = lang + STRLEN(lang);
+
+        if (e - lang == 2)
+            region = NULL;
+        else if (e - lang == 5 && lang[2] == '_')
+            region = lang + 3;
+        else
+        {
+            /* Also reject an empty item and a one-letter name: the code
+             * below uses lang[0] and lang[1] as the language name. */
+            ga_clear(&ga);
+            return e_invarg;
+        }
+
+        /* Use the language when it was already loaded. */
+        for (lp = first_lang; lp != NULL; lp = lp->sl_next)
+            if (STRNICMP(lp->sl_name, lang, 2) == 0)
+                break;
+
+        if (lp == NULL)
+            /* Not found, load the language. */
+            lp = spell_load_lang(lang);
+
+        if (lp != NULL)
+        {
+            if (region == NULL)
+                region_mask = REGION_ALL;
+            else
+            {
+                /* find region in sl_regions */
+                c = find_region(lp->sl_regions, region);
+                if (c == REGION_ALL)
+                {
+                    /* Temporarily terminate the item to be able to use it
+                     * in the message, then accept all regions. */
+                    c = lang[5];
+                    lang[5] = NUL;
+                    smsg((char_u *)_("Warning: region %s not supported"), lang);
+                    lang[5] = c;
+                    region_mask = REGION_ALL;
+                }
+                else
+                    region_mask = 1 << c;
+            }
+
+            if (ga_grow(&ga, 1) == FAIL)
+            {
+                ga_clear(&ga);
+                return e_outofmem;
+            }
+            LANGP_ENTRY(ga, ga.ga_len)->lp_slang = lp;
+            LANGP_ENTRY(ga, ga.ga_len)->lp_region = region_mask;
+            ++ga.ga_len;
+        }
+
+        if (*e == ',')
+            ++e;
+    }
+
+    /* Add a NULL entry to mark the end of the list. */
+    if (ga_grow(&ga, 1) == FAIL)
+    {
+        ga_clear(&ga);
+        return e_outofmem;
+    }
+    LANGP_ENTRY(ga, ga.ga_len)->lp_slang = NULL;
+    ++ga.ga_len;
+
+    /* Everything is fine, store the new b_langp value. */
+    ga_clear(&buf->b_langp);
+    buf->b_langp = ga;
+
+    return NULL;
+}
+
+/*
+ * Find the region "region[2]" in "rp" (points to "sl_regions").
+ * Each region is simply stored as the two characters of its name.
+ * Returns the index if found, REGION_ALL if not found.
+ */
+    static int
+find_region(rp, region)
+    char_u      *rp;
+    char_u      *region;
+{
+    int         i;
+
+    /* The names are two bytes each, the list ends with a NUL. */
+    for (i = 0; ; i += 2)
+    {
+        if (rp[i] == NUL)
+            return REGION_ALL;
+        if (rp[i] == region[0] && rp[i + 1] == region[1])
+            break;
+    }
+    return i / 2;
+}
+
+# if defined(FEAT_MBYTE) || defined(PROTO)
+/*
+ * Clear all spelling tables and reload them.
+ * Used after 'encoding' is set.
+ * All loaded languages are freed, then 'spelllang' is parsed again for every
+ * buffer where it is not empty, which loads the languages that are used.
+ */
+    void
+spell_reload()
+{
+    buf_T       *buf;
+    slang_T     *lp;
+    sblock_T    *sp;
+
+    /* Unload all allocated memory. */
+    while (first_lang != NULL)
+    {
+        lp = first_lang;
+        first_lang = lp->sl_next;
+
+        hash_clear(&lp->sl_ht);
+        ga_clear(&lp->sl_match);
+        ga_clear(&lp->sl_add);
+        while (lp->sl_block != NULL)
+        {
+            sp = lp->sl_block;
+            lp->sl_block = sp->sb_next;
+            vim_free(sp);
+        }
+
+        /* Also free the language itself, it was allocated in
+         * spell_load_lang(). */
+        vim_free(lp);
+    }
+
+    /* Go through all buffers and handle 'spelllang'.  The old list is
+     * cleared first, it points to the languages that were just freed. */
+    for (buf = firstbuf; buf != NULL; buf = buf->b_next)
+    {
+        ga_clear(&buf->b_langp);
+        if (*buf->b_p_spl != NUL)
+            did_set_spelllang(buf);
+    }
+}
+# endif
+
+#endif /* FEAT_SYN_HL */