Mercurial > vim
changeset 5901:10fc95f48546 v7.4.293
updated for version 7.4.293
Problem: It is not possible to ignore composing characters at a specific
point in a pattern.
Solution: Add the %C item.
author | Bram Moolenaar <bram@vim.org> |
---|---|
date | Tue, 13 May 2014 19:37:29 +0200 |
parents | 20c0da43f879 |
children | 3a1b9743d014 |
files | runtime/doc/pattern.txt src/regexp.c src/regexp_nfa.c src/version.c |
diffstat | 4 files changed, 69 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
--- a/runtime/doc/pattern.txt +++ b/runtime/doc/pattern.txt @@ -545,6 +545,7 @@ Character classes {not in Vi}: */char |/\%u| \%u \%u match specified multibyte character (eg \%u20ac) |/\%U| \%U \%U match specified large multibyte character (eg \%U12345678) +|/\%C| \%C \%C match any composing characters Example matches ~ \<\I\i* or @@ -1207,12 +1208,18 @@ will probably never match. 8. Composing characters *patterns-composing* */\Z* -When "\Z" appears anywhere in the pattern, composing characters are ignored. -Thus only the base characters need to match, the composing characters may be -different and the number of composing characters may differ. Only relevant -when 'encoding' is "utf-8". +When "\Z" appears anywhere in the pattern, all composing characters are +ignored. Thus only the base characters need to match, the composing +characters may be different and the number of composing characters may differ. +Only relevant when 'encoding' is "utf-8". Exception: If the pattern starts with one or more composing characters, these must match. + */\%C* +Use "\%C" to skip any composing characters. For example, the pattern "a" does +not match in "càt" (where the a has the composing character 0x0300), but +"a\%C" does. Note that this does not match "cát" (where the á is character +0xe1, it does not have a composing character). It does match "cat" (where +the a is just an a). When a composing character appears at the start of the pattern or after an item that doesn't include the composing character, a match is found at any
--- a/src/regexp.c +++ b/src/regexp.c @@ -244,6 +244,7 @@ #define RE_MARK 207 /* mark cmp Match mark position */ #define RE_VISUAL 208 /* Match Visual area */ +#define RE_COMPOSING 209 /* any composing characters */ /* * Magic characters have a special meaning, they don't match literally. @@ -2208,6 +2209,10 @@ regatom(flagp) ret = regnode(RE_VISUAL); break; + case 'C': + ret = regnode(RE_COMPOSING); + break; + /* \%[abc]: Emit as a list of branches, all ending at the last * branch which matches nothing. */ case '[': @@ -4710,11 +4715,13 @@ regmatch(scan) status = RA_NOMATCH; } #ifdef FEAT_MBYTE - /* Check for following composing character. */ + /* Check for following composing character, unless %C + * follows (skips over all composing chars). */ if (status != RA_NOMATCH && enc_utf8 && UTF_COMPOSINGLIKE(reginput, reginput + len) - && !ireg_icombine) + && !ireg_icombine + && OP(next) != RE_COMPOSING) { /* raaron: This code makes a composing character get * ignored, which is the correct behavior (sometimes) @@ -4791,6 +4798,16 @@ regmatch(scan) status = RA_NOMATCH; break; #endif + case RE_COMPOSING: +#ifdef FEAT_MBYTE + if (enc_utf8) + { + /* Skip composing characters. */ + while (utf_iscomposing(utf_ptr2char(reginput))) + mb_cptr_adv(reginput); + } +#endif + break; case NOTHING: break;
--- a/src/regexp_nfa.c +++ b/src/regexp_nfa.c @@ -81,6 +81,7 @@ enum NFA_COMPOSING, /* Next nodes in NFA are part of the composing multibyte char */ NFA_END_COMPOSING, /* End of a composing char in the NFA */ + NFA_ANY_COMPOSING, /* \%C: Any composing characters. */ NFA_OPT_CHARS, /* \%[abc] */ /* The following are used only in the postfix form, not in the NFA */ @@ -1418,6 +1419,10 @@ nfa_regatom() EMIT(NFA_VISUAL); break; + case 'C': + EMIT(NFA_ANY_COMPOSING); + break; + case '[': { int n; @@ -2429,6 +2434,7 @@ nfa_set_code(c) case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break; case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break; case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break; + case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break; case NFA_STAR: STRCPY(code, "NFA_STAR "); break; case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break; @@ -2967,6 +2973,7 @@ nfa_max_width(startstate, depth) case NFA_NLOWER_IC: case NFA_UPPER_IC: case NFA_NUPPER_IC: + case NFA_ANY_COMPOSING: /* possibly non-ascii */ #ifdef FEAT_MBYTE if (has_mbyte) @@ -4152,6 +4159,7 @@ match_follows(startstate, depth) continue; case NFA_ANY: + case NFA_ANY_COMPOSING: case NFA_IDENT: case NFA_SIDENT: case NFA_KWORD: @@ -4395,7 +4403,7 @@ skip_add: switch (state->c) { case NFA_MATCH: - nfa_match = TRUE; +// nfa_match = TRUE; break; case NFA_SPLIT: @@ -5151,6 +5159,7 @@ failure_chance(state, depth) case NFA_MATCH: case NFA_MCLOSE: + case NFA_ANY_COMPOSING: /* empty match works always */ return 0; @@ -5573,6 +5582,12 @@ nfa_regmatch(prog, start, submatch, m) { case NFA_MATCH: { +#ifdef FEAT_MBYTE + /* If the match ends before a composing characters and + * ireg_icombine is not set, that is not really a match. 
*/ + if (enc_utf8 && !ireg_icombine && utf_iscomposing(curc)) + break; +#endif nfa_match = TRUE; copy_sub(&submatch->norm, &t->subs.norm); #ifdef FEAT_SYN_HL @@ -6120,6 +6135,23 @@ nfa_regmatch(prog, start, submatch, m) } break; + case NFA_ANY_COMPOSING: + /* On a composing character skip over it. Otherwise do + * nothing. Always matches. */ +#ifdef FEAT_MBYTE + if (enc_utf8 && utf_iscomposing(curc)) + { + add_off = clen; + } + else +#endif + { + add_here = TRUE; + add_off = 0; + } + add_state = t->state->out; + break; + /* * Character classes like \a for alpha, \d for digit etc. */ @@ -6484,12 +6516,10 @@ nfa_regmatch(prog, start, submatch, m) if (!result && ireg_ic) result = MB_TOLOWER(c) == MB_TOLOWER(curc); #ifdef FEAT_MBYTE - /* If there is a composing character which is not being - * ignored there can be no match. Match with composing - * character uses NFA_COMPOSING above. */ - if (result && enc_utf8 && !ireg_icombine - && clen != utf_char2len(curc)) - result = FALSE; + /* If ireg_icombine is not set only skip over the character + * itself. When it is set skip over composing characters. */ + if (result && enc_utf8 && !ireg_icombine) + clen = utf_char2len(curc); #endif ADD_STATE_IF_MATCH(t->state); break;