# HG changeset patch # User Christian Brabandt # Date 1692557104 -7200 # Node ID 49d43532787f90e934ef427fd74a0c0ec5b9db0a # Parent 9e8d264487789a99e38d11f023e9c770790f844c patch 9.0.1771: regex: combining chars in collections not handled Commit: https://github.com/vim/vim/commit/ca22fc36a4e8a315f199893ee8ff6253573f5fbe Author: Christian Brabandt Date: Sun Aug 20 20:34:22 2023 +0200 patch 9.0.1771: regex: combining chars in collections not handled Problem: regex: combining chars in collections not handled Solution: Check for following combining characters for NFA and BT engine closes: #10459 closes: #10286 Signed-off-by: Christian Brabandt diff --git a/src/regexp_bt.c b/src/regexp_bt.c --- a/src/regexp_bt.c +++ b/src/regexp_bt.c @@ -3743,13 +3743,38 @@ regmatch( case ANYOF: case ANYBUT: - if (c == NUL) - status = RA_NOMATCH; - else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF)) - status = RA_NOMATCH; - else - ADVANCE_REGINPUT(); - break; + { + char_u *q = OPERAND(scan); + + if (c == NUL) + status = RA_NOMATCH; + else if ((cstrchr(q, c) == NULL) == (op == ANYOF)) + status = RA_NOMATCH; + else + { + // Check following combining characters + int len = 0; + int i; + + if (enc_utf8) + len = utfc_ptr2len(q) - utf_ptr2len(q); + + MB_CPTR_ADV(rex.input); + MB_CPTR_ADV(q); + + if (!enc_utf8 || len == 0) + break; + + for (i = 0; i < len; ++i) + if (q[i] != rex.input[i]) + { + status = RA_NOMATCH; + break; + } + rex.input += len; + } + break; + } case MULTIBYTECODE: if (has_mbyte) diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c --- a/src/regexp_nfa.c +++ b/src/regexp_nfa.c @@ -1764,6 +1764,7 @@ collection: endp = skip_anyof(p); if (*endp == ']') { + int plen; /* * Try to reverse engineer character classes. For example, * recognize that [0-9] stands for \d and [A-Za-z_] for \h, @@ -2035,11 +2036,34 @@ collection: if (got_coll_char == TRUE && startc == 0) EMIT(0x0a); else + { EMIT(startc); - EMIT(NFA_CONCAT); + if (!(enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))))) + { + EMIT(NFA_CONCAT); + } + } } } + if (enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse)))) + { + int i = utf_ptr2len(regparse); + + c = utf_ptr2char(regparse + i); + + // Add composing characters + for (;;) + { + EMIT(c); + EMIT(NFA_CONCAT); + if ((i += utf_char2len(c)) >= plen) + break; + c = utf_ptr2char(regparse + i); + } + EMIT(NFA_COMPOSING); + EMIT(NFA_CONCAT); + } MB_PTR_ADV(regparse); } // while (p < endp) @@ -6418,6 +6442,84 @@ nfa_regmatch( result_if_matched = (t->state->c == NFA_START_COLL); for (;;) { + if (state->c == NFA_COMPOSING) + { + int mc = curc; + int len = 0; + nfa_state_T *end; + nfa_state_T *sta; + int cchars[MAX_MCO]; + int ccount = 0; + int j; + + sta = t->state->out->out; + len = 0; + if (utf_iscomposing(sta->c)) + { + // Only match composing character(s), ignore base + // character. Used for ".{composing}" and "{composing}" + // (no preceding character). + len += mb_char2len(mc); + } + if (rex.reg_icombine && len == 0) + { + // If \Z was present, then ignore composing characters. + // When ignoring the base character this always matches. + if (sta->c != curc) + result = FAIL; + else + result = OK; + while (sta->c != NFA_END_COMPOSING) + sta = sta->out; + } + // Check base character matches first, unless ignored. + else if (len > 0 || mc == sta->c) +// if (len > 0 || mc == sta->c) + { + if (len == 0) + { + len += mb_char2len(mc); + sta = sta->out; + } + + // We don't care about the order of composing characters. + // Get them into cchars[] first. + while (len < clen) + { + mc = mb_ptr2char(rex.input + len); + cchars[ccount++] = mc; + len += mb_char2len(mc); + if (ccount == MAX_MCO) + break; + } + + // Check that each composing char in the pattern matches a + // composing char in the text. We do not check if all + // composing chars are matched. + result = OK; + while (sta->c != NFA_END_COMPOSING) + { + for (j = 0; j < ccount; ++j) + if (cchars[j] == sta->c) + break; + if (j == ccount) + { + result = FAIL; + break; + } + sta = sta->out; + } + } + else + result = FAIL; + + if (t->state->out->out1->c == NFA_END_COMPOSING) + { + end = t->state->out->out1; + ADD_STATE_IF_MATCH(end); + } + break; + } if (state->c == NFA_END_COLL) { result = !result_if_matched; diff --git a/src/testdir/test_regexp_utf8.vim b/src/testdir/test_regexp_utf8.vim --- a/src/testdir/test_regexp_utf8.vim +++ b/src/testdir/test_regexp_utf8.vim @@ -575,5 +575,16 @@ func Test_match_too_complicated() set regexpengine=0 endfunc +func Test_combining_chars_in_collection() + new + for i in range(0,2) + exe "set re=".i + put =['ɔ̃', 'ɔ', '̃ ã', 'abcd'] + :%s/[ɔ̃]// + call assert_equal(['', '', 'ɔ', '̃ ã', 'abcd'], getline(1,'$')) + %d + endfor + bw! +endfunc " vim: shiftwidth=2 sts=2 expandtab diff --git a/src/version.c b/src/version.c --- a/src/version.c +++ b/src/version.c @@ -696,6 +696,8 @@ static char *(features[]) = static int included_patches[] = { /* Add new patch number below this line */ /**/ + 1771, +/**/ 1770, /**/ 1769,