comparison src/regexp.c @ 4444:ccecb03e5e8b v7.3.970

updated for version 7.3.970 Problem: Syntax highlighting can be slow. Solution: Include the NFA regexp engine. Add the 'regexpengine' option to select which one is used. (various authors, including Ken Takata, Andrei Aiordachioaie, Russ Cox, Xiaozhou Liua, Ian Young)
author Bram Moolenaar <bram@vim.org>
date Sun, 19 May 2013 19:40:29 +0200
parents 7faeece39228
children fe8a0a6a1c2a
comparison
equal deleted inserted replaced
4443:34f806b8147f 4444:ccecb03e5e8b
36 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert 36 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
37 * Webb, Ciaran McCreesh and Bram Moolenaar. 37 * Webb, Ciaran McCreesh and Bram Moolenaar.
38 * Named character class support added by Walter Briscoe (1998 Jul 01) 38 * Named character class support added by Walter Briscoe (1998 Jul 01)
39 */ 39 */
40 40
41 /* Uncomment the first if you do not want to see debugging logs or files
42 * related to regular expressions, even when compiling with -DDEBUG.
43 * Uncomment the second to get the regexp debugging. */
44 /* #undef DEBUG */
45 /* #define DEBUG */
46
41 #include "vim.h" 47 #include "vim.h"
42 48
43 #undef DEBUG 49 #ifdef DEBUG
50 /* show/save debugging data when BT engine is used */
51 # define BT_REGEXP_DUMP
52 /* save the debugging data to a file instead of displaying it */
53 # define BT_REGEXP_LOG
54 #endif
44 55
45 /* 56 /*
46 * The "internal use only" fields in regexp.h are present to pass info from 57 * The "internal use only" fields in regexp.h are present to pass info from
47 * compile to execute that permits the execute phase to run lots faster on 58 * compile to execute that permits the execute phase to run lots faster on
48 * simple cases. They are: 59 * simple cases. They are:
324 #define UCHARAT(p) ((int)*(char_u *)(p)) 335 #define UCHARAT(p) ((int)*(char_u *)(p))
325 336
326 /* Used for an error (down from) vim_regcomp(): give the error message, set 337 /* Used for an error (down from) vim_regcomp(): give the error message, set
327 * rc_did_emsg and return NULL */ 338 * rc_did_emsg and return NULL */
328 #define EMSG_RET_NULL(m) return (EMSG(m), rc_did_emsg = TRUE, (void *)NULL) 339 #define EMSG_RET_NULL(m) return (EMSG(m), rc_did_emsg = TRUE, (void *)NULL)
329 #define EMSG_M_RET_NULL(m, c) return (EMSG2((m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
330 #define EMSG_RET_FAIL(m) return (EMSG(m), rc_did_emsg = TRUE, FAIL) 340 #define EMSG_RET_FAIL(m) return (EMSG(m), rc_did_emsg = TRUE, FAIL)
331 #define EMSG_ONE_RET_NULL EMSG_M_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL) 341 #define EMSG2_RET_NULL(m, c) return (EMSG2((m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
342 #define EMSG2_RET_FAIL(m, c) return (EMSG2((m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL)
343 #define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
332 344
333 #define MAX_LIMIT (32767L << 16L) 345 #define MAX_LIMIT (32767L << 16L)
334 346
335 static int re_multi_type __ARGS((int)); 347 static int re_multi_type __ARGS((int));
336 static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n)); 348 static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n));
337 static char_u *cstrchr __ARGS((char_u *, int)); 349 static char_u *cstrchr __ARGS((char_u *, int));
338 350
351 #ifdef BT_REGEXP_DUMP
352 static void regdump __ARGS((char_u *, bt_regprog_T *));
353 #endif
339 #ifdef DEBUG 354 #ifdef DEBUG
340 static void regdump __ARGS((char_u *, regprog_T *));
341 static char_u *regprop __ARGS((char_u *)); 355 static char_u *regprop __ARGS((char_u *));
342 #endif 356 #endif
357
358 static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
359 static char_u e_unmatchedpp[] = N_("E53: Unmatched %s%%(");
360 static char_u e_unmatchedp[] = N_("E54: Unmatched %s(");
361 static char_u e_unmatchedpar[] = N_("E55: Unmatched %s)");
343 362
344 #define NOT_MULTI 0 363 #define NOT_MULTI 0
345 #define MULTI_ONE 1 364 #define MULTI_ONE 1
346 #define MULTI_MULT 2 365 #define MULTI_MULT 2
347 /* 366 /*
628 /* p s u v w x z { | ~ */ 647 /* p s u v w x z { | ~ */
629 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1 648 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
630 }; 649 };
631 #endif 650 #endif
632 651
633 static int curchr; 652 static int curchr; /* currently parsed character */
653 /* Previous character. Note: prevchr is sometimes -1 when we are not at the
654 * start, eg in /[ ^I]^ the pattern was never found even if it existed,
655 * because ^ was taken to be magic -- webb */
656 static int prevchr;
657 static int prevprevchr; /* previous-previous character */
658 static int nextchr; /* used for ungetchr() */
634 659
635 /* arguments for reg() */ 660 /* arguments for reg() */
636 #define REG_NOPAREN 0 /* toplevel reg() */ 661 #define REG_NOPAREN 0 /* toplevel reg() */
637 #define REG_PAREN 1 /* \(\) */ 662 #define REG_PAREN 1 /* \(\) */
638 #define REG_ZPAREN 2 /* \z(\) */ 663 #define REG_ZPAREN 2 /* \z(\) */
678 static char_u *re_put_long __ARGS((char_u *pr, long_u val)); 703 static char_u *re_put_long __ARGS((char_u *pr, long_u val));
679 static int read_limits __ARGS((long *, long *)); 704 static int read_limits __ARGS((long *, long *));
680 static void regtail __ARGS((char_u *, char_u *)); 705 static void regtail __ARGS((char_u *, char_u *));
681 static void regoptail __ARGS((char_u *, char_u *)); 706 static void regoptail __ARGS((char_u *, char_u *));
682 707
708 static regengine_T bt_regengine;
709 static regengine_T nfa_regengine;
710
683 /* 711 /*
684 * Return TRUE if compiled regular expression "prog" can match a line break. 712 * Return TRUE if compiled regular expression "prog" can match a line break.
685 */ 713 */
686 int 714 int
687 re_multiline(prog) 715 re_multiline(prog)
760 #endif 788 #endif
761 789
762 /* 790 /*
763 * Produce the bytes for equivalence class "c". 791 * Produce the bytes for equivalence class "c".
764 * Currently only handles latin1, latin9 and utf-8. 792 * Currently only handles latin1, latin9 and utf-8.
793 * NOTE: When changing this function, also change nfa_emit_equi_class()
765 */ 794 */
766 static void 795 static void
767 reg_equi_class(c) 796 reg_equi_class(c)
768 int c; 797 int c;
769 { 798 {
1237 } 1266 }
1238 } 1267 }
1239 return p; 1268 return p;
1240 } 1269 }
1241 1270
1242 /* 1271 static regprog_T *bt_regcomp __ARGS((char_u *expr, int re_flags));
1243 * vim_regcomp() - compile a regular expression into internal code 1272
1273 /*
1274 * bt_regcomp() - compile a regular expression into internal code for the
1275 * traditional back track matcher.
1244 * Returns the program in allocated space. Returns NULL for an error. 1276 * Returns the program in allocated space. Returns NULL for an error.
1245 * 1277 *
1246 * We can't allocate space until we know how big the compiled form will be, 1278 * We can't allocate space until we know how big the compiled form will be,
1247 * but we can't compile it (and thus know how big it is) until we've got a 1279 * but we can't compile it (and thus know how big it is) until we've got a
1248 * place to put the code. So we cheat: we compile it twice, once with code 1280 * place to put the code. So we cheat: we compile it twice, once with code
1257 * 1289 *
1258 * Beware that the optimization-preparation code in here knows about some 1290 * Beware that the optimization-preparation code in here knows about some
1259 * of the structure of the compiled regexp. 1291 * of the structure of the compiled regexp.
1260 * "re_flags": RE_MAGIC and/or RE_STRING. 1292 * "re_flags": RE_MAGIC and/or RE_STRING.
1261 */ 1293 */
1262 regprog_T * 1294 static regprog_T *
1263 vim_regcomp(expr, re_flags) 1295 bt_regcomp(expr, re_flags)
1264 char_u *expr; 1296 char_u *expr;
1265 int re_flags; 1297 int re_flags;
1266 { 1298 {
1267 regprog_T *r; 1299 bt_regprog_T *r;
1268 char_u *scan; 1300 char_u *scan;
1269 char_u *longest; 1301 char_u *longest;
1270 int len; 1302 int len;
1271 int flags; 1303 int flags;
1272 1304
1289 if (regsize >= 65536L - 256L) 1321 if (regsize >= 65536L - 256L)
1290 EMSG_RET_NULL(_("E339: Pattern too long")); 1322 EMSG_RET_NULL(_("E339: Pattern too long"));
1291 #endif 1323 #endif
1292 1324
1293 /* Allocate space. */ 1325 /* Allocate space. */
1294 r = (regprog_T *)lalloc(sizeof(regprog_T) + regsize, TRUE); 1326 r = (bt_regprog_T *)lalloc(sizeof(bt_regprog_T) + regsize, TRUE);
1295 if (r == NULL) 1327 if (r == NULL)
1296 return NULL; 1328 return NULL;
1297 1329
1298 /* 1330 /*
1299 * Second pass: emit code. 1331 * Second pass: emit code.
1384 } 1416 }
1385 r->regmust = longest; 1417 r->regmust = longest;
1386 r->regmlen = len; 1418 r->regmlen = len;
1387 } 1419 }
1388 } 1420 }
1389 #ifdef DEBUG 1421 #ifdef BT_REGEXP_DUMP
1390 regdump(expr, r); 1422 regdump(expr, r);
1391 #endif 1423 #endif
1392 return r; 1424 r->engine = &bt_regengine;
1425 return (regprog_T *)r;
1393 } 1426 }
1394 1427
1395 /* 1428 /*
1396 * Setup to parse the regexp. Used once to get the length and once to do it. 1429 * Setup to parse the regexp. Used once to get the length and once to do it.
1397 */ 1430 */
1434 return had_eol; 1467 return had_eol;
1435 } 1468 }
1436 #endif 1469 #endif
1437 1470
1438 /* 1471 /*
1439 * reg - regular expression, i.e. main body or parenthesized thing 1472 * Parse regular expression, i.e. main body or parenthesized thing.
1440 * 1473 *
1441 * Caller must absorb opening parenthesis. 1474 * Caller must absorb opening parenthesis.
1442 * 1475 *
1443 * Combining parenthesis handling with the base level of regular expression 1476 * Combining parenthesis handling with the base level of regular expression
1444 * is a trifle forced, but the need to tie the tails of the branches to what 1477 * is a trifle forced, but the need to tie the tails of the branches to what
1471 #endif 1504 #endif
1472 if (paren == REG_PAREN) 1505 if (paren == REG_PAREN)
1473 { 1506 {
1474 /* Make a MOPEN node. */ 1507 /* Make a MOPEN node. */
1475 if (regnpar >= NSUBEXP) 1508 if (regnpar >= NSUBEXP)
1476 EMSG_M_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL); 1509 EMSG2_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
1477 parno = regnpar; 1510 parno = regnpar;
1478 ++regnpar; 1511 ++regnpar;
1479 ret = regnode(MOPEN + parno); 1512 ret = regnode(MOPEN + parno);
1480 } 1513 }
1481 else if (paren == REG_NPAREN) 1514 else if (paren == REG_NPAREN)
1532 if (paren == REG_ZPAREN) 1565 if (paren == REG_ZPAREN)
1533 EMSG_RET_NULL(_("E52: Unmatched \\z(")); 1566 EMSG_RET_NULL(_("E52: Unmatched \\z("));
1534 else 1567 else
1535 #endif 1568 #endif
1536 if (paren == REG_NPAREN) 1569 if (paren == REG_NPAREN)
1537 EMSG_M_RET_NULL(_("E53: Unmatched %s%%("), reg_magic == MAGIC_ALL); 1570 EMSG2_RET_NULL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
1538 else 1571 else
1539 EMSG_M_RET_NULL(_("E54: Unmatched %s("), reg_magic == MAGIC_ALL); 1572 EMSG2_RET_NULL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
1540 } 1573 }
1541 else if (paren == REG_NOPAREN && peekchr() != NUL) 1574 else if (paren == REG_NOPAREN && peekchr() != NUL)
1542 { 1575 {
1543 if (curchr == Magic(')')) 1576 if (curchr == Magic(')'))
1544 EMSG_M_RET_NULL(_("E55: Unmatched %s)"), reg_magic == MAGIC_ALL); 1577 EMSG2_RET_NULL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
1545 else 1578 else
1546 EMSG_RET_NULL(_(e_trailing)); /* "Can't happen". */ 1579 EMSG_RET_NULL(_(e_trailing)); /* "Can't happen". */
1547 /* NOTREACHED */ 1580 /* NOTREACHED */
1548 } 1581 }
1549 /* 1582 /*
1554 had_endbrace[parno] = TRUE; /* have seen the close paren */ 1587 had_endbrace[parno] = TRUE; /* have seen the close paren */
1555 return ret; 1588 return ret;
1556 } 1589 }
1557 1590
1558 /* 1591 /*
1559 * Handle one alternative of an | operator. 1592 * Parse one alternative of an | operator.
1560 * Implements the & operator. 1593 * Implements the & operator.
1561 */ 1594 */
1562 static char_u * 1595 static char_u *
1563 regbranch(flagp) 1596 regbranch(flagp)
1564 int *flagp; 1597 int *flagp;
1597 1630
1598 return ret; 1631 return ret;
1599 } 1632 }
1600 1633
1601 /* 1634 /*
1602 * Handle one alternative of an | or & operator. 1635 * Parse one alternative of an | or & operator.
1603 * Implements the concatenation operator. 1636 * Implements the concatenation operator.
1604 */ 1637 */
1605 static char_u * 1638 static char_u *
1606 regconcat(flagp) 1639 regconcat(flagp)
1607 int *flagp; 1640 int *flagp;
1677 first = regnode(NOTHING); 1710 first = regnode(NOTHING);
1678 return first; 1711 return first;
1679 } 1712 }
1680 1713
1681 /* 1714 /*
1682 * regpiece - something followed by possible [*+=] 1715 * Parse something followed by possible [*+=].
1683 * 1716 *
1684 * Note that the branching code sequences used for = and the general cases 1717 * Note that the branching code sequences used for = and the general cases
1685 * of * and + are somewhat optimized: they use the same NOTHING node as 1718 * of * and + are somewhat optimized: they use the same NOTHING node as
1686 * both the endmarker for their branch list and the body of the last branch. 1719 * both the endmarker for their branch list and the body of the last branch.
1687 * It might seem that this node could be dispensed with entirely, but the 1720 * It might seem that this node could be dispensed with entirely, but the
1757 case '=': lop = BEHIND; break; /* \@<= */ 1790 case '=': lop = BEHIND; break; /* \@<= */
1758 case '!': lop = NOBEHIND; break; /* \@<! */ 1791 case '!': lop = NOBEHIND; break; /* \@<! */
1759 } 1792 }
1760 } 1793 }
1761 if (lop == END) 1794 if (lop == END)
1762 EMSG_M_RET_NULL(_("E59: invalid character after %s@"), 1795 EMSG2_RET_NULL(_("E59: invalid character after %s@"),
1763 reg_magic == MAGIC_ALL); 1796 reg_magic == MAGIC_ALL);
1764 /* Look behind must match with behind_pos. */ 1797 /* Look behind must match with behind_pos. */
1765 if (lop == BEHIND || lop == NOBEHIND) 1798 if (lop == BEHIND || lop == NOBEHIND)
1766 { 1799 {
1767 regtail(ret, regnode(BHPOS)); 1800 regtail(ret, regnode(BHPOS));
1791 reginsert_limits(BRACE_LIMITS, minval, maxval, ret); 1824 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1792 } 1825 }
1793 else 1826 else
1794 { 1827 {
1795 if (num_complex_braces >= 10) 1828 if (num_complex_braces >= 10)
1796 EMSG_M_RET_NULL(_("E60: Too many complex %s{...}s"), 1829 EMSG2_RET_NULL(_("E60: Too many complex %s{...}s"),
1797 reg_magic == MAGIC_ALL); 1830 reg_magic == MAGIC_ALL);
1798 reginsert(BRACE_COMPLEX + num_complex_braces, ret); 1831 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
1799 regoptail(ret, regnode(BACK)); 1832 regoptail(ret, regnode(BACK));
1800 regoptail(ret, ret); 1833 regoptail(ret, ret);
1801 reginsert_limits(BRACE_LIMITS, minval, maxval, ret); 1834 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1818 } 1851 }
1819 1852
1820 return ret; 1853 return ret;
1821 } 1854 }
1822 1855
1823 /* 1856 /* When making changes to classchars also change nfa_classcodes. */
1824 * regatom - the lowest level 1857 static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
1858 static int classcodes[] = {
1859 ANY, IDENT, SIDENT, KWORD, SKWORD,
1860 FNAME, SFNAME, PRINT, SPRINT,
1861 WHITE, NWHITE, DIGIT, NDIGIT,
1862 HEX, NHEX, OCTAL, NOCTAL,
1863 WORD, NWORD, HEAD, NHEAD,
1864 ALPHA, NALPHA, LOWER, NLOWER,
1865 UPPER, NUPPER
1866 };
1867
1868 /*
1869 * Parse the lowest level.
1825 * 1870 *
1826 * Optimization: gobbles an entire sequence of ordinary characters so that 1871 * Optimization: gobbles an entire sequence of ordinary characters so that
1827 * it can turn them into a single node, which is smaller to store and 1872 * it can turn them into a single node, which is smaller to store and
1828 * faster to run. Don't do this when one_exactly is set. 1873 * faster to run. Don't do this when one_exactly is set.
1829 */ 1874 */
1834 char_u *ret; 1879 char_u *ret;
1835 int flags; 1880 int flags;
1836 int cpo_lit; /* 'cpoptions' contains 'l' flag */ 1881 int cpo_lit; /* 'cpoptions' contains 'l' flag */
1837 int cpo_bsl; /* 'cpoptions' contains '\' flag */ 1882 int cpo_bsl; /* 'cpoptions' contains '\' flag */
1838 int c; 1883 int c;
1839 static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
1840 static int classcodes[] = {ANY, IDENT, SIDENT, KWORD, SKWORD,
1841 FNAME, SFNAME, PRINT, SPRINT,
1842 WHITE, NWHITE, DIGIT, NDIGIT,
1843 HEX, NHEX, OCTAL, NOCTAL,
1844 WORD, NWORD, HEAD, NHEAD,
1845 ALPHA, NALPHA, LOWER, NLOWER,
1846 UPPER, NUPPER
1847 };
1848 char_u *p; 1884 char_u *p;
1849 int extra = 0; 1885 int extra = 0;
1850 1886
1851 *flagp = WORST; /* Tentatively. */ 1887 *flagp = WORST; /* Tentatively. */
1852 cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL; 1888 cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
2138 2174
2139 ret = NULL; 2175 ret = NULL;
2140 while ((c = getchr()) != ']') 2176 while ((c = getchr()) != ']')
2141 { 2177 {
2142 if (c == NUL) 2178 if (c == NUL)
2143 EMSG_M_RET_NULL(_("E69: Missing ] after %s%%["), 2179 EMSG2_RET_NULL(_("E69: Missing ] after %s%%["),
2144 reg_magic == MAGIC_ALL); 2180 reg_magic == MAGIC_ALL);
2145 br = regnode(BRANCH); 2181 br = regnode(BRANCH);
2146 if (ret == NULL) 2182 if (ret == NULL)
2147 ret = br; 2183 ret = br;
2148 else 2184 else
2154 one_exactly = FALSE; 2190 one_exactly = FALSE;
2155 if (lastnode == NULL) 2191 if (lastnode == NULL)
2156 return NULL; 2192 return NULL;
2157 } 2193 }
2158 if (ret == NULL) 2194 if (ret == NULL)
2159 EMSG_M_RET_NULL(_("E70: Empty %s%%[]"), 2195 EMSG2_RET_NULL(_("E70: Empty %s%%[]"),
2160 reg_magic == MAGIC_ALL); 2196 reg_magic == MAGIC_ALL);
2161 lastbranch = regnode(BRANCH); 2197 lastbranch = regnode(BRANCH);
2162 br = regnode(NOTHING); 2198 br = regnode(NOTHING);
2163 if (ret != JUST_CALC_SIZE) 2199 if (ret != JUST_CALC_SIZE)
2164 { 2200 {
2198 case 'U': i = gethexchrs(8); break; 2234 case 'U': i = gethexchrs(8); break;
2199 default: i = -1; break; 2235 default: i = -1; break;
2200 } 2236 }
2201 2237
2202 if (i < 0) 2238 if (i < 0)
2203 EMSG_M_RET_NULL( 2239 EMSG2_RET_NULL(
2204 _("E678: Invalid character after %s%%[dxouU]"), 2240 _("E678: Invalid character after %s%%[dxouU]"),
2205 reg_magic == MAGIC_ALL); 2241 reg_magic == MAGIC_ALL);
2206 #ifdef FEAT_MBYTE 2242 #ifdef FEAT_MBYTE
2207 if (use_multibytecode(i)) 2243 if (use_multibytecode(i))
2208 ret = regnode(MULTIBYTECODE); 2244 ret = regnode(MULTIBYTECODE);
2270 } 2306 }
2271 break; 2307 break;
2272 } 2308 }
2273 } 2309 }
2274 2310
2275 EMSG_M_RET_NULL(_("E71: Invalid character after %s%%"), 2311 EMSG2_RET_NULL(_("E71: Invalid character after %s%%"),
2276 reg_magic == MAGIC_ALL); 2312 reg_magic == MAGIC_ALL);
2277 } 2313 }
2278 } 2314 }
2279 break; 2315 break;
2280 2316
2565 skipchr(); /* let's be friends with the lexer again */ 2601 skipchr(); /* let's be friends with the lexer again */
2566 *flagp |= HASWIDTH | SIMPLE; 2602 *flagp |= HASWIDTH | SIMPLE;
2567 break; 2603 break;
2568 } 2604 }
2569 else if (reg_strict) 2605 else if (reg_strict)
2570 EMSG_M_RET_NULL(_("E769: Missing ] after %s["), 2606 EMSG2_RET_NULL(_(e_missingbracket), reg_magic > MAGIC_OFF);
2571 reg_magic > MAGIC_OFF);
2572 } 2607 }
2573 /* FALLTHROUGH */ 2608 /* FALLTHROUGH */
2574 2609
2575 default: 2610 default:
2576 { 2611 {
2657 || (enc_utf8 && utf_iscomposing(c))); 2692 || (enc_utf8 && utf_iscomposing(c)));
2658 } 2693 }
2659 #endif 2694 #endif
2660 2695
2661 /* 2696 /*
2662 * emit a node 2697 * Emit a node.
2663 * Return pointer to generated code. 2698 * Return pointer to generated code.
2664 */ 2699 */
2665 static char_u * 2700 static char_u *
2666 regnode(op) 2701 regnode(op)
2667 int op; 2702 int op;
2709 regcode += (*mb_char2bytes)(c, regcode); 2744 regcode += (*mb_char2bytes)(c, regcode);
2710 } 2745 }
2711 #endif 2746 #endif
2712 2747
2713 /* 2748 /*
2714 * reginsert - insert an operator in front of already-emitted operand 2749 * Insert an operator in front of already-emitted operand
2715 * 2750 *
2716 * Means relocating the operand. 2751 * Means relocating the operand.
2717 */ 2752 */
2718 static void 2753 static void
2719 reginsert(op, opnd) 2754 reginsert(op, opnd)
2740 *place++ = NUL; 2775 *place++ = NUL;
2741 *place = NUL; 2776 *place = NUL;
2742 } 2777 }
2743 2778
2744 /* 2779 /*
2745 * reginsert_limits - insert an operator in front of already-emitted operand. 2780 * Insert an operator in front of already-emitted operand.
2746 * The operator has the given limit values as operands. Also set next pointer. 2781 * The operator has the given limit values as operands. Also set next pointer.
2747 * 2782 *
2748 * Means relocating the operand. 2783 * Means relocating the operand.
2749 */ 2784 */
2750 static void 2785 static void
2792 *p++ = (char_u) (val & 0377); 2827 *p++ = (char_u) (val & 0377);
2793 return p; 2828 return p;
2794 } 2829 }
2795 2830
2796 /* 2831 /*
2797 * regtail - set the next-pointer at the end of a node chain 2832 * Set the next-pointer at the end of a node chain.
2798 */ 2833 */
2799 static void 2834 static void
2800 regtail(p, val) 2835 regtail(p, val)
2801 char_u *p; 2836 char_u *p;
2802 char_u *val; 2837 char_u *val;
2833 *(scan + 2) = (char_u) (offset & 0377); 2868 *(scan + 2) = (char_u) (offset & 0377);
2834 } 2869 }
2835 } 2870 }
2836 2871
2837 /* 2872 /*
2838 * regoptail - regtail on item after a BRANCH; nop if none 2873 * Like regtail, on item after a BRANCH; nop if none.
2839 */ 2874 */
2840 static void 2875 static void
2841 regoptail(p, val) 2876 regoptail(p, val)
2842 char_u *p; 2877 char_u *p;
2843 char_u *val; 2878 char_u *val;
2849 return; 2884 return;
2850 regtail(OPERAND(p), val); 2885 regtail(OPERAND(p), val);
2851 } 2886 }
2852 2887
2853 /* 2888 /*
2854 * getchr() - get the next character from the pattern. We know about 2889 * Functions for getting characters from the regexp input.
2855 * magic and such, so therefore we need a lexical analyzer. 2890 */
2856 */ 2891
2857
2858 /* static int curchr; */
2859 static int prevprevchr;
2860 static int prevchr;
2861 static int nextchr; /* used for ungetchr() */
2862 /*
2863 * Note: prevchr is sometimes -1 when we are not at the start,
2864 * eg in /[ ^I]^ the pattern was never found even if it existed, because ^ was
2865 * taken to be magic -- webb
2866 */
2867 static int at_start; /* True when on the first character */ 2892 static int at_start; /* True when on the first character */
2868 static int prev_at_start; /* True when on the second character */ 2893 static int prev_at_start; /* True when on the second character */
2869 2894
2895 /*
2896 * Start parsing at "str".
2897 */
2870 static void 2898 static void
2871 initchr(str) 2899 initchr(str)
2872 char_u *str; 2900 char_u *str;
2873 { 2901 {
2874 regparse = str; 2902 regparse = str;
2876 curchr = prevprevchr = prevchr = nextchr = -1; 2904 curchr = prevprevchr = prevchr = nextchr = -1;
2877 at_start = TRUE; 2905 at_start = TRUE;
2878 prev_at_start = FALSE; 2906 prev_at_start = FALSE;
2879 } 2907 }
2880 2908
2909 /*
2910 * Get the next character without advancing.
2911 */
2881 static int 2912 static int
2882 peekchr() 2913 peekchr()
2883 { 2914 {
2884 static int after_slash = FALSE; 2915 static int after_slash = FALSE;
2885 2916
3084 at_start = as; 3115 at_start = as;
3085 prevchr = pr; 3116 prevchr = pr;
3086 prevprevchr = prpr; 3117 prevprevchr = prpr;
3087 } 3118 }
3088 3119
3120 /*
3121 * Get the next character from the pattern. We know about magic and such, so
3122 * therefore we need a lexical analyzer.
3123 */
3089 static int 3124 static int
3090 getchr() 3125 getchr()
3091 { 3126 {
3092 int chr = peekchr(); 3127 int chr = peekchr();
3093 3128
3338 save_se_T save_start[NSUBEXP]; 3373 save_se_T save_start[NSUBEXP];
3339 save_se_T save_end[NSUBEXP]; 3374 save_se_T save_end[NSUBEXP];
3340 } regbehind_T; 3375 } regbehind_T;
3341 3376
3342 static char_u *reg_getline __ARGS((linenr_T lnum)); 3377 static char_u *reg_getline __ARGS((linenr_T lnum));
3343 static long vim_regexec_both __ARGS((char_u *line, colnr_T col, proftime_T *tm)); 3378 static long bt_regexec_both __ARGS((char_u *line, colnr_T col, proftime_T *tm));
3344 static long regtry __ARGS((regprog_T *prog, colnr_T col)); 3379 static long regtry __ARGS((bt_regprog_T *prog, colnr_T col));
3345 static void cleanup_subexpr __ARGS((void)); 3380 static void cleanup_subexpr __ARGS((void));
3346 #ifdef FEAT_SYN_HL 3381 #ifdef FEAT_SYN_HL
3347 static void cleanup_zsubexpr __ARGS((void)); 3382 static void cleanup_zsubexpr __ARGS((void));
3348 #endif 3383 #endif
3349 static void save_subexpr __ARGS((regbehind_T *bp)); 3384 static void save_subexpr __ARGS((regbehind_T *bp));
3396 static colnr_T ireg_maxcol; 3431 static colnr_T ireg_maxcol;
3397 3432
3398 /* 3433 /*
3399 * Sometimes need to save a copy of a line. Since alloc()/free() is very 3434 * Sometimes need to save a copy of a line. Since alloc()/free() is very
3400 * slow, we keep one allocated piece of memory and only re-allocate it when 3435 * slow, we keep one allocated piece of memory and only re-allocate it when
3401 * it's too small. It's freed in vim_regexec_both() when finished. 3436 * it's too small. It's freed in bt_regexec_both() when finished.
3402 */ 3437 */
3403 static char_u *reg_tofree = NULL; 3438 static char_u *reg_tofree = NULL;
3404 static unsigned reg_tofreelen; 3439 static unsigned reg_tofreelen;
3405 3440
3406 /* 3441 /*
3554 #endif 3589 #endif
3555 3590
3556 /* TRUE if using multi-line regexp. */ 3591 /* TRUE if using multi-line regexp. */
3557 #define REG_MULTI (reg_match == NULL) 3592 #define REG_MULTI (reg_match == NULL)
3558 3593
3594 static int bt_regexec __ARGS((regmatch_T *rmp, char_u *line, colnr_T col));
3595
3559 /* 3596 /*
3560 * Match a regexp against a string. 3597 * Match a regexp against a string.
3561 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). 3598 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
3562 * Uses curbuf for line count and 'iskeyword'. 3599 * Uses curbuf for line count and 'iskeyword'.
3563 * 3600 *
3564 * Return TRUE if there is a match, FALSE if not. 3601 * Return TRUE if there is a match, FALSE if not.
3565 */ 3602 */
3566 int 3603 static int
3567 vim_regexec(rmp, line, col) 3604 bt_regexec(rmp, line, col)
3568 regmatch_T *rmp; 3605 regmatch_T *rmp;
3569 char_u *line; /* string to match against */ 3606 char_u *line; /* string to match against */
3570 colnr_T col; /* column to start looking for match */ 3607 colnr_T col; /* column to start looking for match */
3571 { 3608 {
3572 reg_match = rmp; 3609 reg_match = rmp;
3578 ireg_ic = rmp->rm_ic; 3615 ireg_ic = rmp->rm_ic;
3579 #ifdef FEAT_MBYTE 3616 #ifdef FEAT_MBYTE
3580 ireg_icombine = FALSE; 3617 ireg_icombine = FALSE;
3581 #endif 3618 #endif
3582 ireg_maxcol = 0; 3619 ireg_maxcol = 0;
3583 return (vim_regexec_both(line, col, NULL) != 0); 3620 return (bt_regexec_both(line, col, NULL) != 0);
3584 } 3621 }
3585 3622
3586 #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \ 3623 #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \
3587 || defined(FIND_REPLACE_DIALOG) || defined(PROTO) 3624 || defined(FIND_REPLACE_DIALOG) || defined(PROTO)
3625
3626 static int bt_regexec_nl __ARGS((regmatch_T *rmp, char_u *line, colnr_T col));
3627
3588 /* 3628 /*
3589 * Like vim_regexec(), but consider a "\n" in "line" to be a line break. 3629 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
3590 */ 3630 */
3591 int 3631 static int
3592 vim_regexec_nl(rmp, line, col) 3632 bt_regexec_nl(rmp, line, col)
3593 regmatch_T *rmp; 3633 regmatch_T *rmp;
3594 char_u *line; /* string to match against */ 3634 char_u *line; /* string to match against */
3595 colnr_T col; /* column to start looking for match */ 3635 colnr_T col; /* column to start looking for match */
3596 { 3636 {
3597 reg_match = rmp; 3637 reg_match = rmp;
3603 ireg_ic = rmp->rm_ic; 3643 ireg_ic = rmp->rm_ic;
3604 #ifdef FEAT_MBYTE 3644 #ifdef FEAT_MBYTE
3605 ireg_icombine = FALSE; 3645 ireg_icombine = FALSE;
3606 #endif 3646 #endif
3607 ireg_maxcol = 0; 3647 ireg_maxcol = 0;
3608 return (vim_regexec_both(line, col, NULL) != 0); 3648 return (bt_regexec_both(line, col, NULL) != 0);
3609 } 3649 }
3610 #endif 3650 #endif
3651
3652 static long bt_regexec_multi __ARGS((regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum, colnr_T col, proftime_T *tm));
3611 3653
3612 /* 3654 /*
3613 * Match a regexp against multiple lines. 3655 * Match a regexp against multiple lines.
3614 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). 3656 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
3615 * Uses curbuf for line count and 'iskeyword'. 3657 * Uses curbuf for line count and 'iskeyword'.
3616 * 3658 *
3617 * Return zero if there is no match. Return number of lines contained in the 3659 * Return zero if there is no match. Return number of lines contained in the
3618 * match otherwise. 3660 * match otherwise.
3619 */ 3661 */
3620 long 3662 static long
3621 vim_regexec_multi(rmp, win, buf, lnum, col, tm) 3663 bt_regexec_multi(rmp, win, buf, lnum, col, tm)
3622 regmmatch_T *rmp; 3664 regmmatch_T *rmp;
3623 win_T *win; /* window in which to search or NULL */ 3665 win_T *win; /* window in which to search or NULL */
3624 buf_T *buf; /* buffer in which to search */ 3666 buf_T *buf; /* buffer in which to search */
3625 linenr_T lnum; /* nr of line to start looking for match */ 3667 linenr_T lnum; /* nr of line to start looking for match */
3626 colnr_T col; /* column to start looking for match */ 3668 colnr_T col; /* column to start looking for match */
3639 #ifdef FEAT_MBYTE 3681 #ifdef FEAT_MBYTE
3640 ireg_icombine = FALSE; 3682 ireg_icombine = FALSE;
3641 #endif 3683 #endif
3642 ireg_maxcol = rmp->rmm_maxcol; 3684 ireg_maxcol = rmp->rmm_maxcol;
3643 3685
3644 r = vim_regexec_both(NULL, col, tm); 3686 r = bt_regexec_both(NULL, col, tm);
3645 3687
3646 return r; 3688 return r;
3647 } 3689 }
3648 3690
3649 /* 3691 /*
3650 * Match a regexp against a string ("line" points to the string) or multiple 3692 * Match a regexp against a string ("line" points to the string) or multiple
3651 * lines ("line" is NULL, use reg_getline()). 3693 * lines ("line" is NULL, use reg_getline()).
3652 */ 3694 */
3653 static long 3695 static long
3654 vim_regexec_both(line, col, tm) 3696 bt_regexec_both(line, col, tm)
3655 char_u *line; 3697 char_u *line;
3656 colnr_T col; /* column to start looking for match */ 3698 colnr_T col; /* column to start looking for match */
3657 proftime_T *tm UNUSED; /* timeout limit or NULL */ 3699 proftime_T *tm UNUSED; /* timeout limit or NULL */
3658 { 3700 {
3659 regprog_T *prog; 3701 bt_regprog_T *prog;
3660 char_u *s; 3702 char_u *s;
3661 long retval = 0L; 3703 long retval = 0L;
3662 3704
3663 /* Create "regstack" and "backpos" if they are not allocated yet. 3705 /* Create "regstack" and "backpos" if they are not allocated yet.
3664 * We allocate *_INITIAL amount of bytes first and then set the grow size 3706 * We allocate *_INITIAL amount of bytes first and then set the grow size
3680 backpos.ga_growsize = BACKPOS_INITIAL * 8; 3722 backpos.ga_growsize = BACKPOS_INITIAL * 8;
3681 } 3723 }
3682 3724
3683 if (REG_MULTI) 3725 if (REG_MULTI)
3684 { 3726 {
3685 prog = reg_mmatch->regprog; 3727 prog = (bt_regprog_T *)reg_mmatch->regprog;
3686 line = reg_getline((linenr_T)0); 3728 line = reg_getline((linenr_T)0);
3687 reg_startpos = reg_mmatch->startpos; 3729 reg_startpos = reg_mmatch->startpos;
3688 reg_endpos = reg_mmatch->endpos; 3730 reg_endpos = reg_mmatch->endpos;
3689 } 3731 }
3690 else 3732 else
3691 { 3733 {
3692 prog = reg_match->regprog; 3734 prog = (bt_regprog_T *)reg_match->regprog;
3693 reg_startp = reg_match->startp; 3735 reg_startp = reg_match->startp;
3694 reg_endp = reg_match->endp; 3736 reg_endp = reg_match->endp;
3695 } 3737 }
3696 3738
3697 /* Be paranoid... */ 3739 /* Be paranoid... */
3929 * regtry - try match of "prog" with at regline["col"]. 3971 * regtry - try match of "prog" with at regline["col"].
3930 * Returns 0 for failure, number of lines contained in the match otherwise. 3972 * Returns 0 for failure, number of lines contained in the match otherwise.
3931 */ 3973 */
3932 static long 3974 static long
3933 regtry(prog, col) 3975 regtry(prog, col)
3934 regprog_T *prog; 3976 bt_regprog_T *prog;
3935 colnr_T col; 3977 colnr_T col;
3936 { 3978 {
3937 reginput = regline + col; 3979 reginput = regline + col;
3938 need_clear_subexpr = TRUE; 3980 need_clear_subexpr = TRUE;
3939 #ifdef FEAT_SYN_HL 3981 #ifdef FEAT_SYN_HL
4061 #define RA_BREAK 3 /* break inner loop */ 4103 #define RA_BREAK 3 /* break inner loop */
4062 #define RA_MATCH 4 /* successful match */ 4104 #define RA_MATCH 4 /* successful match */
4063 #define RA_NOMATCH 5 /* didn't match */ 4105 #define RA_NOMATCH 5 /* didn't match */
4064 4106
4065 /* Make "regstack" and "backpos" empty. They are allocated and freed in 4107 /* Make "regstack" and "backpos" empty. They are allocated and freed in
4066 * vim_regexec_both() to reduce malloc()/free() calls. */ 4108 * bt_regexec_both() to reduce malloc()/free() calls. */
4067 regstack.ga_len = 0; 4109 regstack.ga_len = 0;
4068 backpos.ga_len = 0; 4110 backpos.ga_len = 0;
4069 4111
4070 /* 4112 /*
4071 * Repeat until "regstack" is empty. 4113 * Repeat until "regstack" is empty.
4072 */ 4114 */
4073 for (;;) 4115 for (;;)
4074 { 4116 {
4075 /* Some patterns my cause a long time to match, even though they are not 4117 /* Some patterns may cause a long time to match, even though they are not
4076 * illegal. E.g., "\([a-z]\+\)\+Q". Allow breaking them with CTRL-C. */ 4118 * illegal. E.g., "\([a-z]\+\)\+Q". Allow breaking them with CTRL-C. */
4077 fast_breakcheck(); 4119 fast_breakcheck();
4078 4120
4079 #ifdef DEBUG 4121 #ifdef DEBUG
4080 if (scan != NULL && regnarrate) 4122 if (scan != NULL && regnarrate)
4081 { 4123 {
4082 mch_errmsg(regprop(scan)); 4124 mch_errmsg((char *)regprop(scan));
4083 mch_errmsg("(\n"); 4125 mch_errmsg("(\n");
4084 } 4126 }
4085 #endif 4127 #endif
4086 4128
4087 /* 4129 /*
4098 status = RA_CONT; 4140 status = RA_CONT;
4099 4141
4100 #ifdef DEBUG 4142 #ifdef DEBUG
4101 if (regnarrate) 4143 if (regnarrate)
4102 { 4144 {
4103 mch_errmsg(regprop(scan)); 4145 mch_errmsg((char *)regprop(scan));
4104 mch_errmsg("...\n"); 4146 mch_errmsg("...\n");
4105 # ifdef FEAT_SYN_HL 4147 # ifdef FEAT_SYN_HL
4106 if (re_extmatch_in != NULL) 4148 if (re_extmatch_in != NULL)
4107 { 4149 {
4108 int i; 4150 int i;
4110 mch_errmsg(_("External submatches:\n")); 4152 mch_errmsg(_("External submatches:\n"));
4111 for (i = 0; i < NSUBEXP; i++) 4153 for (i = 0; i < NSUBEXP; i++)
4112 { 4154 {
4113 mch_errmsg(" \""); 4155 mch_errmsg(" \"");
4114 if (re_extmatch_in->matches[i] != NULL) 4156 if (re_extmatch_in->matches[i] != NULL)
4115 mch_errmsg(re_extmatch_in->matches[i]); 4157 mch_errmsg((char *)re_extmatch_in->matches[i]);
4116 mch_errmsg("\"\n"); 4158 mch_errmsg("\"\n");
4117 } 4159 }
4118 } 4160 }
4119 # endif 4161 # endif
4120 } 4162 }
6089 * Return TRUE if it's wrong. 6131 * Return TRUE if it's wrong.
6090 */ 6132 */
6091 static int 6133 static int
6092 prog_magic_wrong() 6134 prog_magic_wrong()
6093 { 6135 {
6094 if (UCHARAT(REG_MULTI 6136 regprog_T *prog;
6095 ? reg_mmatch->regprog->program 6137
6096 : reg_match->regprog->program) != REGMAGIC) 6138 prog = REG_MULTI ? reg_mmatch->regprog : reg_match->regprog;
6139 if (prog->engine == &nfa_regengine)
6140 /* For NFA matcher we don't check the magic */
6141 return FALSE;
6142
6143 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
6097 { 6144 {
6098 EMSG(_(e_re_corr)); 6145 EMSG(_(e_re_corr));
6099 return TRUE; 6146 return TRUE;
6100 } 6147 }
6101 return FALSE; 6148 return FALSE;
6316 return val < n; 6363 return val < n;
6317 return val == n; 6364 return val == n;
6318 } 6365 }
6319 6366
6320 6367
6321 #ifdef DEBUG 6368 #ifdef BT_REGEXP_DUMP
6322 6369
6323 /* 6370 /*
6324 * regdump - dump a regexp onto stdout in vaguely comprehensible form 6371 * regdump - dump a regexp onto stdout in vaguely comprehensible form
6325 */ 6372 */
6326 static void 6373 static void
6327 regdump(pattern, r) 6374 regdump(pattern, r)
6328 char_u *pattern; 6375 char_u *pattern;
6329 regprog_T *r; 6376 bt_regprog_T *r;
6330 { 6377 {
6331 char_u *s; 6378 char_u *s;
6332 int op = EXACTLY; /* Arbitrary non-END op. */ 6379 int op = EXACTLY; /* Arbitrary non-END op. */
6333 char_u *next; 6380 char_u *next;
6334 char_u *end = NULL; 6381 char_u *end = NULL;
6335 6382 FILE *f;
6336 printf("\r\nregcomp(%s):\r\n", pattern); 6383
6384 #ifdef BT_REGEXP_LOG
6385 f = fopen("bt_regexp_log.log", "a");
6386 #else
6387 f = stdout;
6388 #endif
6389 if (f == NULL)
6390 return;
6391 fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n", pattern);
6337 6392
6338 s = r->program + 1; 6393 s = r->program + 1;
6339 /* 6394 /*
6340 * Loop until we find the END that isn't before a referred next (an END 6395 * Loop until we find the END that isn't before a referred next (an END
6341 * can also appear in a NOMATCH operand). 6396 * can also appear in a NOMATCH operand).
6342 */ 6397 */
6343 while (op != END || s <= end) 6398 while (op != END || s <= end)
6344 { 6399 {
6345 op = OP(s); 6400 op = OP(s);
6346 printf("%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */ 6401 fprintf(f, "%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */
6347 next = regnext(s); 6402 next = regnext(s);
6348 if (next == NULL) /* Next ptr. */ 6403 if (next == NULL) /* Next ptr. */
6349 printf("(0)"); 6404 fprintf(f, "(0)");
6350 else 6405 else
6351 printf("(%d)", (int)((s - r->program) + (next - s))); 6406 fprintf(f, "(%d)", (int)((s - r->program) + (next - s)));
6352 if (end < next) 6407 if (end < next)
6353 end = next; 6408 end = next;
6354 if (op == BRACE_LIMITS) 6409 if (op == BRACE_LIMITS)
6355 { 6410 {
6356 /* Two short ints */ 6411 /* Two short ints */
6357 printf(" minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s)); 6412 fprintf(f, " minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
6358 s += 8; 6413 s += 8;
6359 } 6414 }
6360 s += 3; 6415 s += 3;
6361 if (op == ANYOF || op == ANYOF + ADD_NL 6416 if (op == ANYOF || op == ANYOF + ADD_NL
6362 || op == ANYBUT || op == ANYBUT + ADD_NL 6417 || op == ANYBUT || op == ANYBUT + ADD_NL
6363 || op == EXACTLY) 6418 || op == EXACTLY)
6364 { 6419 {
6365 /* Literal string, where present. */ 6420 /* Literal string, where present. */
6421 fprintf(f, "\nxxxxxxxxx\n");
6366 while (*s != NUL) 6422 while (*s != NUL)
6367 printf("%c", *s++); 6423 fprintf(f, "%c", *s++);
6424 fprintf(f, "\nxxxxxxxxx\n");
6368 s++; 6425 s++;
6369 } 6426 }
6370 printf("\r\n"); 6427 fprintf(f, "\r\n");
6371 } 6428 }
6372 6429
6373 /* Header fields of interest. */ 6430 /* Header fields of interest. */
6374 if (r->regstart != NUL) 6431 if (r->regstart != NUL)
6375 printf("start `%s' 0x%x; ", r->regstart < 256 6432 fprintf(f, "start `%s' 0x%x; ", r->regstart < 256
6376 ? (char *)transchar(r->regstart) 6433 ? (char *)transchar(r->regstart)
6377 : "multibyte", r->regstart); 6434 : "multibyte", r->regstart);
6378 if (r->reganch) 6435 if (r->reganch)
6379 printf("anchored; "); 6436 fprintf(f, "anchored; ");
6380 if (r->regmust != NULL) 6437 if (r->regmust != NULL)
6381 printf("must have \"%s\"", r->regmust); 6438 fprintf(f, "must have \"%s\"", r->regmust);
6382 printf("\r\n"); 6439 fprintf(f, "\r\n");
6383 } 6440
6384 6441 #ifdef BT_REGEXP_LOG
6442 fclose(f);
6443 #endif
6444 }
6445 #endif /* BT_REGEXP_DUMP */
6446
6447 #ifdef DEBUG
6385 /* 6448 /*
6386 * regprop - printable representation of opcode 6449 * regprop - printable representation of opcode
6387 */ 6450 */
6388 static char_u * 6451 static char_u *
6389 regprop(op) 6452 regprop(op)
6390 char_u *op; 6453 char_u *op;
6391 { 6454 {
6392 char_u *p; 6455 char *p;
6393 static char_u buf[50]; 6456 static char buf[50];
6394 6457
6395 (void) strcpy(buf, ":"); 6458 STRCPY(buf, ":");
6396 6459
6397 switch (OP(op)) 6460 switch ((int) OP(op))
6398 { 6461 {
6399 case BOL: 6462 case BOL:
6400 p = "BOL"; 6463 p = "BOL";
6401 break; 6464 break;
6402 case EOL: 6465 case EOL:
6759 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op)); 6822 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
6760 p = NULL; 6823 p = NULL;
6761 break; 6824 break;
6762 } 6825 }
6763 if (p != NULL) 6826 if (p != NULL)
6764 (void) strcat(buf, p); 6827 STRCAT(buf, p);
6765 return buf; 6828 return (char_u *)buf;
6766 } 6829 }
6767 #endif 6830 #endif /* DEBUG */
6768 6831
6769 #ifdef FEAT_MBYTE 6832 #ifdef FEAT_MBYTE
6770 static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3)); 6833 static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3));
6771 6834
6772 typedef struct 6835 typedef struct
7665 } 7728 }
7666 7729
7667 return retval; 7730 return retval;
7668 } 7731 }
7669 #endif 7732 #endif
7733
7734 static regengine_T bt_regengine =
7735 {
7736 bt_regcomp,
7737 bt_regexec,
7738 #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \
7739 || defined(FIND_REPLACE_DIALOG) || defined(PROTO)
7740 bt_regexec_nl,
7741 #endif
7742 bt_regexec_multi
7743 #ifdef DEBUG
7744 ,(char_u *)""
7745 #endif
7746 };
7747
7748
7749 #include "regexp_nfa.c"
7750
7751 static regengine_T nfa_regengine =
7752 {
7753 nfa_regcomp,
7754 nfa_regexec,
7755 #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \
7756 || defined(FIND_REPLACE_DIALOG) || defined(PROTO)
7757 nfa_regexec_nl,
7758 #endif
7759 nfa_regexec_multi
7760 #ifdef DEBUG
7761 ,(char_u *)""
7762 #endif
7763 };
7764
7765 /* Which regexp engine to use? Needed for vim_regcomp().
7766 * Must match with 'regexpengine'. */
7767 static int regexp_engine = 0;
7768 #define AUTOMATIC_ENGINE 0
7769 #define BACKTRACKING_ENGINE 1
7770 #define NFA_ENGINE 2
7771 #ifdef DEBUG
7772 static char_u regname[][30] = {
7773 "AUTOMATIC Regexp Engine",
7774 "BACKTACKING Regexp Engine",
7775 "NFA Regexp Engine"
7776 };
7777 #endif
7778
7779 /*
7780 * Compile a regular expression into internal code.
7781 * Returns the program in allocated memory. Returns NULL for an error.
7782 */
7783 regprog_T *
7784 vim_regcomp(expr_arg, re_flags)
7785 char_u *expr_arg;
7786 int re_flags;
7787 {
7788 regprog_T *prog = NULL;
7789 char_u *expr = expr_arg;
7790
7791 syntax_error = FALSE;
7792 regexp_engine = p_re;
7793
7794 /* Check for prefix "\%#=", that sets the regexp engine */
7795 if (STRNCMP(expr, "\\%#=", 4) == 0)
7796 {
7797 int newengine = expr[4] - '0';
7798
7799 if (newengine == AUTOMATIC_ENGINE
7800 || newengine == BACKTRACKING_ENGINE
7801 || newengine == NFA_ENGINE)
7802 {
7803 regexp_engine = expr[4] - '0';
7804 expr += 5;
7805 #ifdef DEBUG
7806 EMSG3("New regexp mode selected (%d): %s", regexp_engine,
7807 regname[newengine]);
7808 #endif
7809 }
7810 else
7811 {
7812 EMSG(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
7813 regexp_engine = AUTOMATIC_ENGINE;
7814 }
7815 }
7816 #ifdef DEBUG
7817 bt_regengine.expr = expr;
7818 nfa_regengine.expr = expr;
7819 #endif
7820
7821 /*
7822 * First try the NFA engine, unless backtracking was requested.
7823 */
7824 if (regexp_engine != BACKTRACKING_ENGINE)
7825 prog = nfa_regengine.regcomp(expr, re_flags);
7826 else
7827 prog = bt_regengine.regcomp(expr, re_flags);
7828
7829 if (prog == NULL) /* error compiling regexp with initial engine */
7830 {
7831 #ifdef DEBUG
7832 if (regexp_engine != BACKTRACKING_ENGINE) /* debugging log for NFA */
7833 {
7834 FILE *f;
7835 f = fopen("debug.log", "a");
7836 if (f)
7837 {
7838 if (!syntax_error)
7839 fprintf(f, "NFA engine could not handle \"%s\"\n", expr);
7840 else
7841 fprintf(f, "Syntax error in \"%s\"\n", expr);
7842 fclose(f);
7843 }
7844 else
7845 EMSG("(NFA) Could not open \"debug.log\" to write !!!");
7846 /*
7847 if (syntax_error)
7848 EMSG("NFA Regexp: Syntax Error !");
7849 */
7850 }
7851 #endif
7852 /*
7853 * If NFA engine failed, then revert to the backtracking engine.
7854 * Except when there was a syntax error, which was properly handled by
7855 * NFA engine.
7856 */
7857 if (regexp_engine == AUTOMATIC_ENGINE)
7858 if (!syntax_error)
7859 prog = bt_regengine.regcomp(expr, re_flags);
7860
7861 } /* endif prog==NULL */
7862
7863
7864 return prog;
7865 }
7866
7867 /*
7868 * Match a regexp against a string.
7869 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7870 * Uses curbuf for line count and 'iskeyword'.
7871 *
7872 * Return TRUE if there is a match, FALSE if not.
7873 */
7874 int
7875 vim_regexec(rmp, line, col)
7876 regmatch_T *rmp;
7877 char_u *line; /* string to match against */
7878 colnr_T col; /* column to start looking for match */
7879 {
7880 return rmp->regprog->engine->regexec(rmp, line, col);
7881 }
7882
7883 #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \
7884 || defined(FIND_REPLACE_DIALOG) || defined(PROTO)
7885 /*
7886 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
7887 */
7888 int
7889 vim_regexec_nl(rmp, line, col)
7890 regmatch_T *rmp;
7891 char_u *line;
7892 colnr_T col;
7893 {
7894 return rmp->regprog->engine->regexec_nl(rmp, line, col);
7895 }
7896 #endif
7897
7898 /*
7899 * Match a regexp against multiple lines.
7900 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7901 * Uses curbuf for line count and 'iskeyword'.
7902 *
7903 * Return zero if there is no match. Return number of lines contained in the
7904 * match otherwise.
7905 */
7906 long
7907 vim_regexec_multi(rmp, win, buf, lnum, col, tm)
7908 regmmatch_T *rmp;
7909 win_T *win; /* window in which to search or NULL */
7910 buf_T *buf; /* buffer in which to search */
7911 linenr_T lnum; /* nr of line to start looking for match */
7912 colnr_T col; /* column to start looking for match */
7913 proftime_T *tm; /* timeout limit or NULL */
7914 {
7915 return rmp->regprog->engine->regexec_multi(rmp, win, buf, lnum, col, tm);
7916 }