# HG changeset patch # User Christian Brabandt # Date 1704405604 -3600 # Node ID 90063f44c99a8dae3c3af5c6d186e522d9324514 # Parent 2ff0721f8cc240b5d60d04ec32d40b0aa1c96cc7 patch 9.1.0011: regexp cannot match combining chars in collection Commit: https://github.com/vim/vim/commit/d2cc51f9a1a5a30ef5d2e732f49d7f495cae24cf Author: Christian Brabandt Date: Thu Jan 4 22:54:08 2024 +0100 patch 9.1.0011: regexp cannot match combining chars in collection Problem: regexp cannot match combining chars in collection Solution: Check for combining characters in regex collections for the NFA and BT Regex Engine Also, while at it, make debug mode work again. fixes #10286 closes: #12871 Signed-off-by: Christian Brabandt diff --git a/src/regexp.c b/src/regexp.c --- a/src/regexp.c +++ b/src/regexp.c @@ -2686,7 +2686,10 @@ static regengine_T bt_regengine = bt_regcomp, bt_regfree, bt_regexec_nl, - bt_regexec_multi, + bt_regexec_multi +#ifdef DEBUG + ,(char_u *)"" +#endif }; #include "regexp_nfa.c" @@ -2696,7 +2699,10 @@ static regengine_T nfa_regengine = nfa_regcomp, nfa_regfree, nfa_regexec_nl, - nfa_regexec_multi, + nfa_regexec_multi +#ifdef DEBUG + ,(char_u *)"" +#endif }; // Which regexp engine to use? Needed for vim_regcomp(). diff --git a/src/regexp.h b/src/regexp.h --- a/src/regexp.h +++ b/src/regexp.h @@ -178,7 +178,9 @@ struct regengine int (*regexec_nl)(regmatch_T *, char_u *, colnr_T, int); // bt_regexec_mult or nfa_regexec_mult long (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, int *); - //char_u *expr; +#ifdef DEBUG + char_u *expr; +#endif }; // Flags used by vim_regsub() and vim_regsub_both() diff --git a/src/regexp_bt.c b/src/regexp_bt.c --- a/src/regexp_bt.c +++ b/src/regexp_bt.c @@ -3743,13 +3743,38 @@ regmatch( case ANYOF: case ANYBUT: - if (c == NUL) - status = RA_NOMATCH; - else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF)) - status = RA_NOMATCH; - else - ADVANCE_REGINPUT(); - break; + { + char_u *q = OPERAND(scan); + + if (c == NUL) + status = RA_NOMATCH; + else if ((cstrchr(q, c) == NULL) == (op == ANYOF)) + status = RA_NOMATCH; + else + { + // Check following combining characters + int len = 0; + int i; + + if (enc_utf8) + len = utfc_ptr2len(q) - utf_ptr2len(q); + + MB_CPTR_ADV(rex.input); + MB_CPTR_ADV(q); + + if (!enc_utf8 || len == 0) + break; + + for (i = 0; i < len; ++i) + if (q[i] != rex.input[i]) + { + status = RA_NOMATCH; + break; + } + rex.input += len; + } + break; + } case MULTIBYTECODE: if (has_mbyte) diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c --- a/src/regexp_nfa.c +++ b/src/regexp_nfa.c @@ -1764,6 +1764,7 @@ collection: endp = skip_anyof(p); if (*endp == ']') { + int plen; /* * Try to reverse engineer character classes. For example, * recognize that [0-9] stands for \d and [A-Za-z_] for \h, @@ -2033,13 +2034,43 @@ collection: else { if (got_coll_char == TRUE && startc == 0) + { EMIT(0x0a); + EMIT(NFA_CONCAT); + } else + { EMIT(startc); - EMIT(NFA_CONCAT); + if (!(enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))))) + { + EMIT(NFA_CONCAT); + } + } } } + if (enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse)))) + { + int i = utf_ptr2len(regparse); + + c = utf_ptr2char(regparse + i); + + // Add composing characters + for (;;) + { + if (c == 0) + // \x00 is translated to \x0a, start at \x01. + EMIT(1); + else + EMIT(c); + EMIT(NFA_CONCAT); + if ((i += utf_char2len(c)) >= plen) + break; + c = utf_ptr2char(regparse + i); + } + EMIT(NFA_COMPOSING); + EMIT(NFA_CONCAT); + } MB_PTR_ADV(regparse); } // while (p < endp) @@ -6418,6 +6449,84 @@ nfa_regmatch( result_if_matched = (t->state->c == NFA_START_COLL); for (;;) { + if (state->c == NFA_COMPOSING) + { + int mc = curc; + int len = 0; + nfa_state_T *end; + nfa_state_T *sta; + int cchars[MAX_MCO]; + int ccount = 0; + int j; + + sta = t->state->out->out; + len = 0; + if (utf_iscomposing(sta->c)) + { + // Only match composing character(s), ignore base + // character. Used for ".{composing}" and "{composing}" + // (no preceding character). + len += mb_char2len(mc); + } + if (rex.reg_icombine && len == 0) + { + // If \Z was present, then ignore composing characters. + // When ignoring the base character this always matches. + if (sta->c != curc) + result = FAIL; + else + result = OK; + while (sta->c != NFA_END_COMPOSING) + sta = sta->out; + } + // Check base character matches first, unless ignored. + else if (len > 0 || mc == sta->c) +// if (len > 0 || mc == sta->c) + { + if (len == 0) + { + len += mb_char2len(mc); + sta = sta->out; + } + + // We don't care about the order of composing characters. + // Get them into cchars[] first. + while (len < clen) + { + mc = mb_ptr2char(rex.input + len); + cchars[ccount++] = mc; + len += mb_char2len(mc); + if (ccount == MAX_MCO) + break; + } + + // Check that each composing char in the pattern matches a + // composing char in the text. We do not check if all + // composing chars are matched. + result = OK; + while (sta->c != NFA_END_COMPOSING) + { + for (j = 0; j < ccount; ++j) + if (cchars[j] == sta->c) + break; + if (j == ccount) + { + result = FAIL; + break; + } + sta = sta->out; + } + } + else + result = FAIL; + + if (t->state->out->out1->c == NFA_END_COMPOSING) + { + end = t->state->out->out1; + ADD_STATE_IF_MATCH(end); + } + break; + } if (state->c == NFA_END_COLL) { result = !result_if_matched; diff --git a/src/testdir/test_regexp_utf8.vim b/src/testdir/test_regexp_utf8.vim --- a/src/testdir/test_regexp_utf8.vim +++ b/src/testdir/test_regexp_utf8.vim @@ -575,5 +575,16 @@ func Test_match_too_complicated() set regexpengine=0 endfunc +func Test_combining_chars_in_collection() + new + for i in range(0,2) + exe "set re=".i + put =['ɔ̃', 'ɔ', '̃ ã', 'abcd'] + :%s/[ɔ̃]// + call assert_equal(['', '', 'ɔ', '̃ ã', 'abcd'], getline(1,'$')) + %d + endfor + bw! +endfunc " vim: shiftwidth=2 sts=2 expandtab diff --git a/src/version.c b/src/version.c --- a/src/version.c +++ b/src/version.c @@ -705,6 +705,8 @@ static char *(features[]) = static int included_patches[] = { /* Add new patch number below this line */ /**/ + 11, +/**/ 10, /**/ 9,