# HG changeset patch # User Bram Moolenaar # Date 1370723403 -7200 # Node ID 3dbd251777de232c168d87650acda5fec408146c # Parent 33a813d17d7552696fa3669f5ad23e58945513ae updated for version 7.3.1150 Problem: New regexpengine: Slow when a look-behind match does not have a width specified. Solution: Try to compute the maximum width. diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c --- a/src/regexp_nfa.c +++ b/src/regexp_nfa.c @@ -38,19 +38,19 @@ enum NFA_START_COLL, /* [abc] start */ NFA_END_COLL, /* [abc] end */ NFA_START_NEG_COLL, /* [^abc] start */ - NFA_END_NEG_COLL, /* [^abc] end (only used in postfix) */ - NFA_RANGE, /* range of the two previous items (only - * used in postfix) */ + NFA_END_NEG_COLL, /* [^abc] end (postfix only) */ + NFA_RANGE, /* range of the two previous items + * (postfix only) */ NFA_RANGE_MIN, /* low end of a range */ NFA_RANGE_MAX, /* high end of a range */ - NFA_CONCAT, /* concatenate two previous items (only - * used in postfix) */ - NFA_OR, - NFA_STAR, /* greedy * */ - NFA_STAR_NONGREEDY, /* non-greedy * */ - NFA_QUEST, /* greedy \? */ - NFA_QUEST_NONGREEDY, /* non-greedy \? */ + NFA_CONCAT, /* concatenate two previous items (postfix + * only) */ + NFA_OR, /* \| (postfix only) */ + NFA_STAR, /* greedy * (posfix only) */ + NFA_STAR_NONGREEDY, /* non-greedy * (postfix only) */ + NFA_QUEST, /* greedy \? (postfix only) */ + NFA_QUEST_NONGREEDY, /* non-greedy \? (postfix only) */ NFA_BOL, /* ^ Begin line */ NFA_EOL, /* $ End line */ @@ -153,8 +153,6 @@ enum /* NFA_FIRST_NL */ NFA_ANY, /* Match any one character. */ - NFA_ANYOF, /* Match any character in this string. */ - NFA_ANYBUT, /* Match any character not in this string. */ NFA_IDENT, /* Match identifier char */ NFA_SIDENT, /* Match identifier char but no digit */ NFA_KWORD, /* Match keyword char */ @@ -496,8 +494,8 @@ nfa_get_regstart(start, depth) /* * Figure out if the NFA state list contains just literal text and nothing - * else. If so return a string with what must match after regstart. - * Otherwise return NULL. + * else. If so return a string in allocated memory with what must match after + * regstart. Otherwise return NULL. */ static char_u * nfa_get_match_text(start) @@ -2578,6 +2576,225 @@ st_pop(p, stack) } /* + * Estimate the maximum byte length of anything matching "state". + * When unknown or unlimited return -1. + */ + static int +nfa_max_width(startstate, depth) + nfa_state_T *startstate; + int depth; +{ + int l, r; + nfa_state_T *state = startstate; + int len = 0; + + /* detect looping in a NFA_SPLIT */ + if (depth > 4) + return -1; + + for (;;) + { + switch (state->c) + { + case NFA_END_INVISIBLE: + case NFA_END_INVISIBLE_NEG: + /* the end, return what we have */ + return len; + + case NFA_SPLIT: + /* two alternatives, use the maximum */ + l = nfa_max_width(state->out, depth + 1); + r = nfa_max_width(state->out1, depth + 1); + if (l < 0 || r < 0) + return -1; + return len + (l > r ? l : r); + + case NFA_ANY: + case NFA_START_COLL: + case NFA_START_NEG_COLL: + /* matches some character, including composing chars */ +#ifdef FEAT_MBYTE + if (enc_utf8) + len += MB_MAXBYTES; + else if (has_mbyte) + len += 2; + else +#endif + ++len; + if (state->c != NFA_ANY) + { + /* skip over the characters */ + state = state->out1->out; + continue; + } + break; + + case NFA_DIGIT: + case NFA_WHITE: + case NFA_HEX: + case NFA_OCTAL: + /* ascii */ + ++len; + break; + + case NFA_IDENT: + case NFA_SIDENT: + case NFA_KWORD: + case NFA_SKWORD: + case NFA_FNAME: + case NFA_SFNAME: + case NFA_PRINT: + case NFA_SPRINT: + case NFA_NWHITE: + case NFA_NDIGIT: + case NFA_NHEX: + case NFA_NOCTAL: + case NFA_WORD: + case NFA_NWORD: + case NFA_HEAD: + case NFA_NHEAD: + case NFA_ALPHA: + case NFA_NALPHA: + case NFA_LOWER: + case NFA_NLOWER: + case NFA_UPPER: + case NFA_NUPPER: + /* possibly non-ascii */ +#ifdef FEAT_MBYTE + if (has_mbyte) + len += 3; + else +#endif + ++len; + break; + + case NFA_START_INVISIBLE: + case NFA_START_INVISIBLE_NEG: + case NFA_START_INVISIBLE_BEFORE: + case NFA_START_INVISIBLE_BEFORE_NEG: + /* zero-width, out1 points to the END state */ + state = state->out1->out; + continue; + + case NFA_BACKREF1: + case NFA_BACKREF2: + case NFA_BACKREF3: + case NFA_BACKREF4: + case NFA_BACKREF5: + case NFA_BACKREF6: + case NFA_BACKREF7: + case NFA_BACKREF8: + case NFA_BACKREF9: +#ifdef FEAT_SYN_HL + case NFA_ZREF1: + case NFA_ZREF2: + case NFA_ZREF3: + case NFA_ZREF4: + case NFA_ZREF5: + case NFA_ZREF6: + case NFA_ZREF7: + case NFA_ZREF8: + case NFA_ZREF9: +#endif + case NFA_NEWL: + case NFA_SKIP: + /* unknown width */ + return -1; + + case NFA_BOL: + case NFA_EOL: + case NFA_BOF: + case NFA_EOF: + case NFA_BOW: + case NFA_EOW: + case NFA_MOPEN: + case NFA_MOPEN1: + case NFA_MOPEN2: + case NFA_MOPEN3: + case NFA_MOPEN4: + case NFA_MOPEN5: + case NFA_MOPEN6: + case NFA_MOPEN7: + case NFA_MOPEN8: + case NFA_MOPEN9: +#ifdef FEAT_SYN_HL + case NFA_ZOPEN: + case NFA_ZOPEN1: + case NFA_ZOPEN2: + case NFA_ZOPEN3: + case NFA_ZOPEN4: + case NFA_ZOPEN5: + case NFA_ZOPEN6: + case NFA_ZOPEN7: + case NFA_ZOPEN8: + case NFA_ZOPEN9: + case NFA_ZCLOSE: + case NFA_ZCLOSE1: + case NFA_ZCLOSE2: + case NFA_ZCLOSE3: + case NFA_ZCLOSE4: + case NFA_ZCLOSE5: + case NFA_ZCLOSE6: + case NFA_ZCLOSE7: + case NFA_ZCLOSE8: + case NFA_ZCLOSE9: +#endif + case NFA_MCLOSE: + case NFA_MCLOSE1: + case NFA_MCLOSE2: + case NFA_MCLOSE3: + case NFA_MCLOSE4: + case NFA_MCLOSE5: + case NFA_MCLOSE6: + case NFA_MCLOSE7: + case NFA_MCLOSE8: + case NFA_MCLOSE9: + case NFA_NOPEN: + case NFA_NCLOSE: + + case NFA_LNUM_GT: + case NFA_LNUM_LT: + case NFA_COL_GT: + case NFA_COL_LT: + case NFA_VCOL_GT: + case NFA_VCOL_LT: + case NFA_MARK_GT: + case NFA_MARK_LT: + case NFA_VISUAL: + case NFA_LNUM: + case NFA_CURSOR: + case NFA_COL: + case NFA_VCOL: + case NFA_MARK: + + case NFA_ZSTART: + case NFA_ZEND: + case NFA_OPT_CHARS: + case NFA_SKIP_CHAR: + case NFA_START_PATTERN: + case NFA_END_PATTERN: + case NFA_COMPOSING: + case NFA_END_COMPOSING: + /* zero-width */ + break; + + default: + if (state->c < 0) + /* don't know what this is */ + return -1; + /* normal character */ + len += MB_CHAR2LEN(state->c); + break; + } + + /* normal way to continue */ + state = state->out; + } + + /* unrecognized */ + return -1; +} +/* * Convert a postfix form into its equivalent NFA. * Return the NFA start state on success, NULL otherwise. */ @@ -2856,8 +3073,6 @@ post2nfa(postfix, end, nfa_calc_size) s = alloc_state(start_state, e.start, s1); if (s == NULL) goto theend; - if (before) - s->val = n; /* store the count */ if (pattern) { /* NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows. */ @@ -2871,6 +3086,14 @@ post2nfa(postfix, end, nfa_calc_size) { patch(e.out, s1); PUSH(frag(s, list1(&s1->out))); + if (before) + { + if (n <= 0) + /* See if we can guess the maximum width, it avoids a + * lot of pointless tries. */ + n = nfa_max_width(e.start, 0); + s->val = n; /* store the count */ + } } break; } @@ -4088,9 +4311,8 @@ recursive_regmatch(state, prog, submatch /* Go back the specified number of bytes, or as far as the * start of the previous line, to try matching "\@<=" or - * not matching "\@val <= 0) { if (REG_MULTI) @@ -4386,7 +4608,7 @@ skip_to_start(c, colp) /* * Check for a match with match_text. - * Called after skip_to_start() has find regstart. + * Called after skip_to_start() has found regstart. * Returns zero for no match, 1 for a match. */ static long @@ -4736,7 +4958,6 @@ nfa_regmatch(prog, start, submatch, m) #ifdef FEAT_SYN_HL || (cout >= NFA_ZCLOSE && cout <= NFA_ZCLOSE9) #endif - || cout == NFA_NCLOSE || t->pim != NULL || (t->state->c != NFA_START_INVISIBLE_BEFORE && t->state->c != NFA_START_INVISIBLE_BEFORE_NEG diff --git a/src/version.c b/src/version.c --- a/src/version.c +++ b/src/version.c @@ -729,6 +729,8 @@ static char *(features[]) = static int included_patches[] = { /* Add new patch number below this line */ /**/ + 1150, +/**/ 1149, /**/ 1148,