# HG changeset patch # User Bram Moolenaar # Date 1376474809 -7200 # Node ID 3e9107b86b68d83bfa94e43afffbf17623afe55e # Parent a643d80b65071c4713309430a0c29da35ff90d45 updated for version 7.4.001 Problem: Character classes such as [a-z] do not react to 'ignorecase'. Breaks man page highlighting. (Mario Grgic) Solution: Add separate items for classes that react to 'ignorecase'. Clean up logic handling character classes. Add more tests. diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c --- a/src/regexp_nfa.c +++ b/src/regexp_nfa.c @@ -29,6 +29,9 @@ # define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log" #endif +/* Added to NFA_ANY - NFA_NUPPER_IC to include a NL. */ +#define NFA_ADD_NL 31 + enum { NFA_SPLIT = -1024, @@ -183,6 +186,13 @@ enum NFA_NLOWER, /* Match non-lowercase char */ NFA_UPPER, /* Match uppercase char */ NFA_NUPPER, /* Match non-uppercase char */ + NFA_LOWER_IC, /* Match [a-z] */ + NFA_NLOWER_IC, /* Match [^a-z] */ + NFA_UPPER_IC, /* Match [A-Z] */ + NFA_NUPPER_IC, /* Match [^A-Z] */ + + NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL, + NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL, NFA_CURSOR, /* Match cursor pos */ NFA_LNUM, /* Match line number */ @@ -199,9 +209,6 @@ enum NFA_MARK_LT, /* Match < mark */ NFA_VISUAL, /* Match Visual area */ - NFA_FIRST_NL = NFA_ANY + ADD_NL, - NFA_LAST_NL = NFA_NUPPER + ADD_NL, - /* Character classes [:alnum:] etc */ NFA_CLASS_ALNUM, NFA_CLASS_ALPHA, @@ -578,6 +585,8 @@ realloc_post_list() * On failure, return 0 (=FAIL) * Start points to the first char of the range, while end should point * to the closing brace. + * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may + * need to be interpreted as [a-zA-Z]. 
*/ static int nfa_recognize_char_class(start, end, extra_newl) @@ -681,7 +690,7 @@ nfa_recognize_char_class(start, end, ext return FAIL; if (newl == TRUE) - extra_newl = ADD_NL; + extra_newl = NFA_ADD_NL; switch (config) { @@ -710,13 +719,13 @@ nfa_recognize_char_class(start, end, ext case CLASS_not | CLASS_az | CLASS_AZ: return extra_newl + NFA_NALPHA; case CLASS_az: - return extra_newl + NFA_LOWER; + return extra_newl + NFA_LOWER_IC; case CLASS_not | CLASS_az: - return extra_newl + NFA_NLOWER; + return extra_newl + NFA_NLOWER_IC; case CLASS_AZ: - return extra_newl + NFA_UPPER; + return extra_newl + NFA_UPPER_IC; case CLASS_not | CLASS_AZ: - return extra_newl + NFA_NUPPER; + return extra_newl + NFA_NUPPER_IC; } return FAIL; } @@ -914,7 +923,7 @@ nfa_regatom() break; } - extra = ADD_NL; + extra = NFA_ADD_NL; /* "\_[" is collection plus newline */ if (c == '[') @@ -970,7 +979,7 @@ nfa_regatom() } #endif EMIT(nfa_classcodes[p - classchars]); - if (extra == ADD_NL) + if (extra == NFA_ADD_NL) { EMIT(NFA_NEWL); EMIT(NFA_OR); @@ -1240,21 +1249,21 @@ collection: { /* * Try to reverse engineer character classes. For example, - * recognize that [0-9] stands for \d and [A-Za-z_] with \h, + * recognize that [0-9] stands for \d and [A-Za-z_] for \h, * and perform the necessary substitutions in the NFA. */ result = nfa_recognize_char_class(regparse, endp, - extra == ADD_NL); + extra == NFA_ADD_NL); if (result != FAIL) { - if (result >= NFA_DIGIT && result <= NFA_NUPPER) - EMIT(result); - else /* must be char class + newline */ + if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL) { - EMIT(result - ADD_NL); + EMIT(result - NFA_ADD_NL); EMIT(NFA_NEWL); EMIT(NFA_OR); } + else + EMIT(result); regparse = endp; mb_ptr_adv(regparse); return OK; @@ -1504,7 +1513,7 @@ collection: * collection, add an OR below. But not for negated * range. 
*/ if (!negated) - extra = ADD_NL; + extra = NFA_ADD_NL; } else { @@ -1537,7 +1546,7 @@ collection: EMIT(NFA_END_COLL); /* \_[] also matches \n but it's not negated */ - if (extra == ADD_NL) + if (extra == NFA_ADD_NL) { EMIT(reg_string ? NL : NFA_NEWL); EMIT(NFA_OR); @@ -2011,7 +2020,7 @@ nfa_set_code(c) if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL) { addnl = TRUE; - c -= ADD_NL; + c -= NFA_ADD_NL; } STRCPY(code, ""); @@ -2217,6 +2226,10 @@ nfa_set_code(c) case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break; case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break; case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break; + case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break; + case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break; + case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break; + case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break; default: STRCPY(code, "CHAR(x)"); @@ -2687,6 +2700,10 @@ nfa_max_width(startstate, depth) case NFA_NLOWER: case NFA_UPPER: case NFA_NUPPER: + case NFA_LOWER_IC: + case NFA_NLOWER_IC: + case NFA_UPPER_IC: + case NFA_NUPPER_IC: /* possibly non-ascii */ #ifdef FEAT_MBYTE if (has_mbyte) @@ -3841,6 +3858,10 @@ match_follows(startstate, depth) case NFA_NLOWER: case NFA_UPPER: case NFA_NUPPER: + case NFA_LOWER_IC: + case NFA_NLOWER_IC: + case NFA_UPPER_IC: + case NFA_NUPPER_IC: case NFA_START_COLL: case NFA_START_NEG_COLL: case NFA_NEWL: @@ -5872,6 +5893,28 @@ nfa_regmatch(prog, start, submatch, m) ADD_STATE_IF_MATCH(t->state); break; + case NFA_LOWER_IC: /* [a-z] */ + result = ri_lower(curc) || (ireg_ic && ri_upper(curc)); + ADD_STATE_IF_MATCH(t->state); + break; + + case NFA_NLOWER_IC: /* [^a-z] */ + result = curc != NUL + && !(ri_lower(curc) || (ireg_ic && ri_upper(curc))); + ADD_STATE_IF_MATCH(t->state); + break; + + case NFA_UPPER_IC: /* [A-Z] */ + result = ri_upper(curc) || (ireg_ic && ri_lower(curc)); + ADD_STATE_IF_MATCH(t->state); + break; + + case NFA_NUPPER_IC: /* ^[A-Z] */ + result = curc != NUL + && !(ri_upper(curc) || (ireg_ic 
&& ri_lower(curc))); + ADD_STATE_IF_MATCH(t->state); + break; + case NFA_BACKREF1: case NFA_BACKREF2: case NFA_BACKREF3: diff --git a/src/testdir/test64.in b/src/testdir/test64.in --- a/src/testdir/test64.in +++ b/src/testdir/test64.in @@ -289,15 +289,29 @@ STARTTEST :call add(tl, [2, '.a\%$', " a\n "]) :call add(tl, [2, '.a\%$', " a\n_a", "_a"]) :" -:"""" Test recognition of some character classes -:call add(tl, [2, '[0-9]', '8', '8']) -:call add(tl, [2, '[^0-9]', '8']) -:call add(tl, [2, '[0-9a-fA-F]*', '0a7', '0a7']) -:call add(tl, [2, '[^0-9A-Fa-f]\+', '0a7']) -:call add(tl, [2, '[a-z_A-Z0-9]\+', 'aso_sfoij', 'aso_sfoij']) -:call add(tl, [2, '[a-z]', 'a', 'a']) -:call add(tl, [2, '[a-zA-Z]', 'a', 'a']) -:call add(tl, [2, '[A-Z]', 'a']) +:"""" Test recognition of character classes +:call add(tl, [2, '[0-7]\+', 'x0123456789x', '01234567']) +:call add(tl, [2, '[^0-7]\+', '0a;X+% 897', 'a;X+% 89']) +:call add(tl, [2, '[0-9]\+', 'x0123456789x', '0123456789']) +:call add(tl, [2, '[^0-9]\+', '0a;X+% 9', 'a;X+% ']) +:call add(tl, [2, '[0-9a-fA-F]\+', 'x0189abcdefg', '0189abcdef']) +:call add(tl, [2, '[^0-9A-Fa-f]\+', '0189g;X+% ab', 'g;X+% ']) +:call add(tl, [2, '[a-z_A-Z0-9]\+', ';+aso_SfOij ', 'aso_SfOij']) +:call add(tl, [2, '[^a-z_A-Z0-9]\+', 'aSo_;+% sfOij', ';+% ']) +:call add(tl, [2, '[a-z_A-Z]\+', '0abyz_ABYZ;', 'abyz_ABYZ']) +:call add(tl, [2, '[^a-z_A-Z]\+', 'abAB_09;+% yzYZ', '09;+% ']) +:call add(tl, [2, '[a-z]\+', '0abcxyz1', 'abcxyz']) +:call add(tl, [2, '[a-z]\+', 'AabxyzZ', 'abxyz']) +:call add(tl, [2, '[^a-z]\+', 'a;X09+% x', ';X09+% ']) +:call add(tl, [2, '[^a-z]\+', 'abX0;%yz', 'X0;%']) +:call add(tl, [2, '[a-zA-Z]\+', '0abABxzXZ9', 'abABxzXZ']) +:call add(tl, [2, '[^a-zA-Z]\+', 'ab09_;+ XZ', '09_;+ ']) +:call add(tl, [2, '[A-Z]\+', 'aABXYZz', 'ABXYZ']) +:call add(tl, [2, '[^A-Z]\+', 'ABx0;%YZ', 'x0;%']) +:call add(tl, [2, '[a-z]\+\c', '0abxyzABXYZ;', 'abxyzABXYZ']) +:call add(tl, [2, '[A-Z]\+\c', '0abABxzXZ9', 'abABxzXZ']) +:call add(tl, [2, 
'\c[^a-z]\+', 'ab09_;+ XZ', '09_;+ ']) +:call add(tl, [2, '\c[^A-Z]\+', 'ab09_;+ XZ', '09_;+ ']) :call add(tl, [2, '\C[^A-Z]\+', 'ABCOIJDEOIFNSD jsfoij sa', ' jsfoij sa']) :" :"""" Tests for \z features diff --git a/src/testdir/test64.ok b/src/testdir/test64.ok --- a/src/testdir/test64.ok +++ b/src/testdir/test64.ok @@ -650,30 +650,72 @@ OK 2 - .a\%$ OK 0 - .a\%$ OK 1 - .a\%$ OK 2 - .a\%$ -OK 0 - [0-9] -OK 1 - [0-9] -OK 2 - [0-9] -OK 0 - [^0-9] -OK 1 - [^0-9] -OK 2 - [^0-9] -OK 0 - [0-9a-fA-F]* -OK 1 - [0-9a-fA-F]* -OK 2 - [0-9a-fA-F]* +OK 0 - [0-7]\+ +OK 1 - [0-7]\+ +OK 2 - [0-7]\+ +OK 0 - [^0-7]\+ +OK 1 - [^0-7]\+ +OK 2 - [^0-7]\+ +OK 0 - [0-9]\+ +OK 1 - [0-9]\+ +OK 2 - [0-9]\+ +OK 0 - [^0-9]\+ +OK 1 - [^0-9]\+ +OK 2 - [^0-9]\+ +OK 0 - [0-9a-fA-F]\+ +OK 1 - [0-9a-fA-F]\+ +OK 2 - [0-9a-fA-F]\+ OK 0 - [^0-9A-Fa-f]\+ OK 1 - [^0-9A-Fa-f]\+ OK 2 - [^0-9A-Fa-f]\+ OK 0 - [a-z_A-Z0-9]\+ OK 1 - [a-z_A-Z0-9]\+ OK 2 - [a-z_A-Z0-9]\+ -OK 0 - [a-z] -OK 1 - [a-z] -OK 2 - [a-z] -OK 0 - [a-zA-Z] -OK 1 - [a-zA-Z] -OK 2 - [a-zA-Z] -OK 0 - [A-Z] -OK 1 - [A-Z] -OK 2 - [A-Z] +OK 0 - [^a-z_A-Z0-9]\+ +OK 1 - [^a-z_A-Z0-9]\+ +OK 2 - [^a-z_A-Z0-9]\+ +OK 0 - [a-z_A-Z]\+ +OK 1 - [a-z_A-Z]\+ +OK 2 - [a-z_A-Z]\+ +OK 0 - [^a-z_A-Z]\+ +OK 1 - [^a-z_A-Z]\+ +OK 2 - [^a-z_A-Z]\+ +OK 0 - [a-z]\+ +OK 1 - [a-z]\+ +OK 2 - [a-z]\+ +OK 0 - [a-z]\+ +OK 1 - [a-z]\+ +OK 2 - [a-z]\+ +OK 0 - [^a-z]\+ +OK 1 - [^a-z]\+ +OK 2 - [^a-z]\+ +OK 0 - [^a-z]\+ +OK 1 - [^a-z]\+ +OK 2 - [^a-z]\+ +OK 0 - [a-zA-Z]\+ +OK 1 - [a-zA-Z]\+ +OK 2 - [a-zA-Z]\+ +OK 0 - [^a-zA-Z]\+ +OK 1 - [^a-zA-Z]\+ +OK 2 - [^a-zA-Z]\+ +OK 0 - [A-Z]\+ +OK 1 - [A-Z]\+ +OK 2 - [A-Z]\+ +OK 0 - [^A-Z]\+ +OK 1 - [^A-Z]\+ +OK 2 - [^A-Z]\+ +OK 0 - [a-z]\+\c +OK 1 - [a-z]\+\c +OK 2 - [a-z]\+\c +OK 0 - [A-Z]\+\c +OK 1 - [A-Z]\+\c +OK 2 - [A-Z]\+\c +OK 0 - \c[^a-z]\+ +OK 1 - \c[^a-z]\+ +OK 2 - \c[^a-z]\+ +OK 0 - \c[^A-Z]\+ +OK 1 - \c[^A-Z]\+ +OK 2 - \c[^A-Z]\+ OK 0 - \C[^A-Z]\+ OK 1 - \C[^A-Z]\+ OK 2 - \C[^A-Z]\+ diff --git a/src/version.c 
b/src/version.c --- a/src/version.c +++ b/src/version.c @@ -728,6 +728,8 @@ static char *(features[]) = static int included_patches[] = { /* Add new patch number below this line */ /**/ + 1, +/**/ 0 };