Mercurial > vim
diff src/regexp_nfa.c @ 34084:90063f44c99a v9.1.0011
patch 9.1.0011: regexp cannot match combining chars in collection
Commit: https://github.com/vim/vim/commit/d2cc51f9a1a5a30ef5d2e732f49d7f495cae24cf
Author: Christian Brabandt <cb@256bit.org>
Date: Thu Jan 4 22:54:08 2024 +0100
patch 9.1.0011: regexp cannot match combining chars in collection
Problem: regexp cannot match combining chars in collection
Solution: Check for combining characters in regex collections for the
NFA and BT regex engines.
Also, while at it, make debug mode work again.
fixes #10286
closes: #12871
Signed-off-by: Christian Brabandt <cb@256bit.org>
author | Christian Brabandt <cb@256bit.org> |
---|---|
date | Thu, 04 Jan 2024 23:00:04 +0100 |
parents | d415dfae6977 |
children | df52075b12cd |
line wrap: on
line diff
--- a/src/regexp_nfa.c +++ b/src/regexp_nfa.c @@ -1764,6 +1764,7 @@ collection: endp = skip_anyof(p); if (*endp == ']') { + int plen; /* * Try to reverse engineer character classes. For example, * recognize that [0-9] stands for \d and [A-Za-z_] for \h, @@ -2033,13 +2034,43 @@ collection: else { if (got_coll_char == TRUE && startc == 0) + { EMIT(0x0a); + EMIT(NFA_CONCAT); + } else + { EMIT(startc); - EMIT(NFA_CONCAT); + if (!(enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))))) + { + EMIT(NFA_CONCAT); + } + } } } + if (enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse)))) + { + int i = utf_ptr2len(regparse); + + c = utf_ptr2char(regparse + i); + + // Add composing characters + for (;;) + { + if (c == 0) + // \x00 is translated to \x0a, start at \x01. + EMIT(1); + else + EMIT(c); + EMIT(NFA_CONCAT); + if ((i += utf_char2len(c)) >= plen) + break; + c = utf_ptr2char(regparse + i); + } + EMIT(NFA_COMPOSING); + EMIT(NFA_CONCAT); + } MB_PTR_ADV(regparse); } // while (p < endp) @@ -6418,6 +6449,84 @@ nfa_regmatch( result_if_matched = (t->state->c == NFA_START_COLL); for (;;) { + if (state->c == NFA_COMPOSING) + { + int mc = curc; + int len = 0; + nfa_state_T *end; + nfa_state_T *sta; + int cchars[MAX_MCO]; + int ccount = 0; + int j; + + sta = t->state->out->out; + len = 0; + if (utf_iscomposing(sta->c)) + { + // Only match composing character(s), ignore base + // character. Used for ".{composing}" and "{composing}" + // (no preceding character). + len += mb_char2len(mc); + } + if (rex.reg_icombine && len == 0) + { + // If \Z was present, then ignore composing characters. + // When ignoring the base character this always matches. + if (sta->c != curc) + result = FAIL; + else + result = OK; + while (sta->c != NFA_END_COMPOSING) + sta = sta->out; + } + // Check base character matches first, unless ignored. 
+ else if (len > 0 || mc == sta->c) +// if (len > 0 || mc == sta->c) + { + if (len == 0) + { + len += mb_char2len(mc); + sta = sta->out; + } + + // We don't care about the order of composing characters. + // Get them into cchars[] first. + while (len < clen) + { + mc = mb_ptr2char(rex.input + len); + cchars[ccount++] = mc; + len += mb_char2len(mc); + if (ccount == MAX_MCO) + break; + } + + // Check that each composing char in the pattern matches a + // composing char in the text. We do not check if all + // composing chars are matched. + result = OK; + while (sta->c != NFA_END_COMPOSING) + { + for (j = 0; j < ccount; ++j) + if (cchars[j] == sta->c) + break; + if (j == ccount) + { + result = FAIL; + break; + } + sta = sta->out; + } + } + else + result = FAIL; + + if (t->state->out->out1->c == NFA_END_COMPOSING) + { + end = t->state->out->out1; + ADD_STATE_IF_MATCH(end); + } + break; + } if (state->c == NFA_END_COLL) { result = !result_if_matched;