Mercurial > vim
comparison src/regexp.c @ 4444:ccecb03e5e8b v7.3.970
updated for version 7.3.970
Problem: Syntax highlighting can be slow.
Solution: Include the NFA regexp engine. Add the 'regexpengine' option to
select which one is used. (various authors, including Ken Takata,
Andrei Aiordachioaie, Russ Cox, Xiaozhou Liua, Ian Young)
author | Bram Moolenaar <bram@vim.org> |
---|---|
date | Sun, 19 May 2013 19:40:29 +0200 |
parents | 7faeece39228 |
children | fe8a0a6a1c2a |
comparison
equal
deleted
inserted
replaced
4443:34f806b8147f | 4444:ccecb03e5e8b |
---|---|
36 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert | 36 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert |
37 * Webb, Ciaran McCreesh and Bram Moolenaar. | 37 * Webb, Ciaran McCreesh and Bram Moolenaar. |
38 * Named character class support added by Walter Briscoe (1998 Jul 01) | 38 * Named character class support added by Walter Briscoe (1998 Jul 01) |
39 */ | 39 */ |
40 | 40 |
41 /* Uncomment the first if you do not want to see debugging logs or files | |
42 * related to regular expressions, even when compiling with -DDEBUG. | |
43 * Uncomment the second to get the regexp debugging. */ | |
44 /* #undef DEBUG */ | |
45 /* #define DEBUG */ | |
46 | |
41 #include "vim.h" | 47 #include "vim.h" |
42 | 48 |
43 #undef DEBUG | 49 #ifdef DEBUG |
50 /* show/save debugging data when BT engine is used */ | |
51 # define BT_REGEXP_DUMP | |
52 /* save the debugging data to a file instead of displaying it */ | |
53 # define BT_REGEXP_LOG | |
54 #endif | |
44 | 55 |
45 /* | 56 /* |
46 * The "internal use only" fields in regexp.h are present to pass info from | 57 * The "internal use only" fields in regexp.h are present to pass info from |
47 * compile to execute that permits the execute phase to run lots faster on | 58 * compile to execute that permits the execute phase to run lots faster on |
48 * simple cases. They are: | 59 * simple cases. They are: |
324 #define UCHARAT(p) ((int)*(char_u *)(p)) | 335 #define UCHARAT(p) ((int)*(char_u *)(p)) |
325 | 336 |
326 /* Used for an error (down from) vim_regcomp(): give the error message, set | 337 /* Used for an error (down from) vim_regcomp(): give the error message, set |
327 * rc_did_emsg and return NULL */ | 338 * rc_did_emsg and return NULL */ |
328 #define EMSG_RET_NULL(m) return (EMSG(m), rc_did_emsg = TRUE, (void *)NULL) | 339 #define EMSG_RET_NULL(m) return (EMSG(m), rc_did_emsg = TRUE, (void *)NULL) |
329 #define EMSG_M_RET_NULL(m, c) return (EMSG2((m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL) | |
330 #define EMSG_RET_FAIL(m) return (EMSG(m), rc_did_emsg = TRUE, FAIL) | 340 #define EMSG_RET_FAIL(m) return (EMSG(m), rc_did_emsg = TRUE, FAIL) |
331 #define EMSG_ONE_RET_NULL EMSG_M_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL) | 341 #define EMSG2_RET_NULL(m, c) return (EMSG2((m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL) |
342 #define EMSG2_RET_FAIL(m, c) return (EMSG2((m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL) | |
343 #define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL) | |
332 | 344 |
333 #define MAX_LIMIT (32767L << 16L) | 345 #define MAX_LIMIT (32767L << 16L) |
334 | 346 |
335 static int re_multi_type __ARGS((int)); | 347 static int re_multi_type __ARGS((int)); |
336 static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n)); | 348 static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n)); |
337 static char_u *cstrchr __ARGS((char_u *, int)); | 349 static char_u *cstrchr __ARGS((char_u *, int)); |
338 | 350 |
351 #ifdef BT_REGEXP_DUMP | |
352 static void regdump __ARGS((char_u *, bt_regprog_T *)); | |
353 #endif | |
339 #ifdef DEBUG | 354 #ifdef DEBUG |
340 static void regdump __ARGS((char_u *, regprog_T *)); | |
341 static char_u *regprop __ARGS((char_u *)); | 355 static char_u *regprop __ARGS((char_u *)); |
342 #endif | 356 #endif |
357 | |
358 static char_u e_missingbracket[] = N_("E769: Missing ] after %s["); | |
359 static char_u e_unmatchedpp[] = N_("E53: Unmatched %s%%("); | |
360 static char_u e_unmatchedp[] = N_("E54: Unmatched %s("); | |
361 static char_u e_unmatchedpar[] = N_("E55: Unmatched %s)"); | |
343 | 362 |
344 #define NOT_MULTI 0 | 363 #define NOT_MULTI 0 |
345 #define MULTI_ONE 1 | 364 #define MULTI_ONE 1 |
346 #define MULTI_MULT 2 | 365 #define MULTI_MULT 2 |
347 /* | 366 /* |
628 /* p s u v w x z { | ~ */ | 647 /* p s u v w x z { | ~ */ |
629 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1 | 648 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1 |
630 }; | 649 }; |
631 #endif | 650 #endif |
632 | 651 |
633 static int curchr; | 652 static int curchr; /* currently parsed character */ |
653 /* Previous character. Note: prevchr is sometimes -1 when we are not at the | |
654 * start, eg in /[ ^I]^ the pattern was never found even if it existed, | |
655 * because ^ was taken to be magic -- webb */ | |
656 static int prevchr; | |
657 static int prevprevchr; /* previous-previous character */ | |
658 static int nextchr; /* used for ungetchr() */ | |
634 | 659 |
635 /* arguments for reg() */ | 660 /* arguments for reg() */ |
636 #define REG_NOPAREN 0 /* toplevel reg() */ | 661 #define REG_NOPAREN 0 /* toplevel reg() */ |
637 #define REG_PAREN 1 /* \(\) */ | 662 #define REG_PAREN 1 /* \(\) */ |
638 #define REG_ZPAREN 2 /* \z(\) */ | 663 #define REG_ZPAREN 2 /* \z(\) */ |
678 static char_u *re_put_long __ARGS((char_u *pr, long_u val)); | 703 static char_u *re_put_long __ARGS((char_u *pr, long_u val)); |
679 static int read_limits __ARGS((long *, long *)); | 704 static int read_limits __ARGS((long *, long *)); |
680 static void regtail __ARGS((char_u *, char_u *)); | 705 static void regtail __ARGS((char_u *, char_u *)); |
681 static void regoptail __ARGS((char_u *, char_u *)); | 706 static void regoptail __ARGS((char_u *, char_u *)); |
682 | 707 |
708 static regengine_T bt_regengine; | |
709 static regengine_T nfa_regengine; | |
710 | |
683 /* | 711 /* |
684 * Return TRUE if compiled regular expression "prog" can match a line break. | 712 * Return TRUE if compiled regular expression "prog" can match a line break. |
685 */ | 713 */ |
686 int | 714 int |
687 re_multiline(prog) | 715 re_multiline(prog) |
760 #endif | 788 #endif |
761 | 789 |
762 /* | 790 /* |
763 * Produce the bytes for equivalence class "c". | 791 * Produce the bytes for equivalence class "c". |
764 * Currently only handles latin1, latin9 and utf-8. | 792 * Currently only handles latin1, latin9 and utf-8. |
793 * NOTE: When changing this function, also change nfa_emit_equi_class() | |
765 */ | 794 */ |
766 static void | 795 static void |
767 reg_equi_class(c) | 796 reg_equi_class(c) |
768 int c; | 797 int c; |
769 { | 798 { |
1237 } | 1266 } |
1238 } | 1267 } |
1239 return p; | 1268 return p; |
1240 } | 1269 } |
1241 | 1270 |
1242 /* | 1271 static regprog_T *bt_regcomp __ARGS((char_u *expr, int re_flags)); |
1243 * vim_regcomp() - compile a regular expression into internal code | 1272 |
1273 /* | |
1274 * bt_regcomp() - compile a regular expression into internal code for the | |
1275 * traditional back track matcher. | |
1244 * Returns the program in allocated space. Returns NULL for an error. | 1276 * Returns the program in allocated space. Returns NULL for an error. |
1245 * | 1277 * |
1246 * We can't allocate space until we know how big the compiled form will be, | 1278 * We can't allocate space until we know how big the compiled form will be, |
1247 * but we can't compile it (and thus know how big it is) until we've got a | 1279 * but we can't compile it (and thus know how big it is) until we've got a |
1248 * place to put the code. So we cheat: we compile it twice, once with code | 1280 * place to put the code. So we cheat: we compile it twice, once with code |
1257 * | 1289 * |
1258 * Beware that the optimization-preparation code in here knows about some | 1290 * Beware that the optimization-preparation code in here knows about some |
1259 * of the structure of the compiled regexp. | 1291 * of the structure of the compiled regexp. |
1260 * "re_flags": RE_MAGIC and/or RE_STRING. | 1292 * "re_flags": RE_MAGIC and/or RE_STRING. |
1261 */ | 1293 */ |
1262 regprog_T * | 1294 static regprog_T * |
1263 vim_regcomp(expr, re_flags) | 1295 bt_regcomp(expr, re_flags) |
1264 char_u *expr; | 1296 char_u *expr; |
1265 int re_flags; | 1297 int re_flags; |
1266 { | 1298 { |
1267 regprog_T *r; | 1299 bt_regprog_T *r; |
1268 char_u *scan; | 1300 char_u *scan; |
1269 char_u *longest; | 1301 char_u *longest; |
1270 int len; | 1302 int len; |
1271 int flags; | 1303 int flags; |
1272 | 1304 |
1289 if (regsize >= 65536L - 256L) | 1321 if (regsize >= 65536L - 256L) |
1290 EMSG_RET_NULL(_("E339: Pattern too long")); | 1322 EMSG_RET_NULL(_("E339: Pattern too long")); |
1291 #endif | 1323 #endif |
1292 | 1324 |
1293 /* Allocate space. */ | 1325 /* Allocate space. */ |
1294 r = (regprog_T *)lalloc(sizeof(regprog_T) + regsize, TRUE); | 1326 r = (bt_regprog_T *)lalloc(sizeof(bt_regprog_T) + regsize, TRUE); |
1295 if (r == NULL) | 1327 if (r == NULL) |
1296 return NULL; | 1328 return NULL; |
1297 | 1329 |
1298 /* | 1330 /* |
1299 * Second pass: emit code. | 1331 * Second pass: emit code. |
1384 } | 1416 } |
1385 r->regmust = longest; | 1417 r->regmust = longest; |
1386 r->regmlen = len; | 1418 r->regmlen = len; |
1387 } | 1419 } |
1388 } | 1420 } |
1389 #ifdef DEBUG | 1421 #ifdef BT_REGEXP_DUMP |
1390 regdump(expr, r); | 1422 regdump(expr, r); |
1391 #endif | 1423 #endif |
1392 return r; | 1424 r->engine = &bt_regengine; |
1425 return (regprog_T *)r; | |
1393 } | 1426 } |
1394 | 1427 |
1395 /* | 1428 /* |
1396 * Setup to parse the regexp. Used once to get the length and once to do it. | 1429 * Setup to parse the regexp. Used once to get the length and once to do it. |
1397 */ | 1430 */ |
1434 return had_eol; | 1467 return had_eol; |
1435 } | 1468 } |
1436 #endif | 1469 #endif |
1437 | 1470 |
1438 /* | 1471 /* |
1439 * reg - regular expression, i.e. main body or parenthesized thing | 1472 * Parse regular expression, i.e. main body or parenthesized thing. |
1440 * | 1473 * |
1441 * Caller must absorb opening parenthesis. | 1474 * Caller must absorb opening parenthesis. |
1442 * | 1475 * |
1443 * Combining parenthesis handling with the base level of regular expression | 1476 * Combining parenthesis handling with the base level of regular expression |
1444 * is a trifle forced, but the need to tie the tails of the branches to what | 1477 * is a trifle forced, but the need to tie the tails of the branches to what |
1471 #endif | 1504 #endif |
1472 if (paren == REG_PAREN) | 1505 if (paren == REG_PAREN) |
1473 { | 1506 { |
1474 /* Make a MOPEN node. */ | 1507 /* Make a MOPEN node. */ |
1475 if (regnpar >= NSUBEXP) | 1508 if (regnpar >= NSUBEXP) |
1476 EMSG_M_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL); | 1509 EMSG2_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL); |
1477 parno = regnpar; | 1510 parno = regnpar; |
1478 ++regnpar; | 1511 ++regnpar; |
1479 ret = regnode(MOPEN + parno); | 1512 ret = regnode(MOPEN + parno); |
1480 } | 1513 } |
1481 else if (paren == REG_NPAREN) | 1514 else if (paren == REG_NPAREN) |
1532 if (paren == REG_ZPAREN) | 1565 if (paren == REG_ZPAREN) |
1533 EMSG_RET_NULL(_("E52: Unmatched \\z(")); | 1566 EMSG_RET_NULL(_("E52: Unmatched \\z(")); |
1534 else | 1567 else |
1535 #endif | 1568 #endif |
1536 if (paren == REG_NPAREN) | 1569 if (paren == REG_NPAREN) |
1537 EMSG_M_RET_NULL(_("E53: Unmatched %s%%("), reg_magic == MAGIC_ALL); | 1570 EMSG2_RET_NULL(_(e_unmatchedpp), reg_magic == MAGIC_ALL); |
1538 else | 1571 else |
1539 EMSG_M_RET_NULL(_("E54: Unmatched %s("), reg_magic == MAGIC_ALL); | 1572 EMSG2_RET_NULL(_(e_unmatchedp), reg_magic == MAGIC_ALL); |
1540 } | 1573 } |
1541 else if (paren == REG_NOPAREN && peekchr() != NUL) | 1574 else if (paren == REG_NOPAREN && peekchr() != NUL) |
1542 { | 1575 { |
1543 if (curchr == Magic(')')) | 1576 if (curchr == Magic(')')) |
1544 EMSG_M_RET_NULL(_("E55: Unmatched %s)"), reg_magic == MAGIC_ALL); | 1577 EMSG2_RET_NULL(_(e_unmatchedpar), reg_magic == MAGIC_ALL); |
1545 else | 1578 else |
1546 EMSG_RET_NULL(_(e_trailing)); /* "Can't happen". */ | 1579 EMSG_RET_NULL(_(e_trailing)); /* "Can't happen". */ |
1547 /* NOTREACHED */ | 1580 /* NOTREACHED */ |
1548 } | 1581 } |
1549 /* | 1582 /* |
1554 had_endbrace[parno] = TRUE; /* have seen the close paren */ | 1587 had_endbrace[parno] = TRUE; /* have seen the close paren */ |
1555 return ret; | 1588 return ret; |
1556 } | 1589 } |
1557 | 1590 |
1558 /* | 1591 /* |
1559 * Handle one alternative of an | operator. | 1592 * Parse one alternative of an | operator. |
1560 * Implements the & operator. | 1593 * Implements the & operator. |
1561 */ | 1594 */ |
1562 static char_u * | 1595 static char_u * |
1563 regbranch(flagp) | 1596 regbranch(flagp) |
1564 int *flagp; | 1597 int *flagp; |
1597 | 1630 |
1598 return ret; | 1631 return ret; |
1599 } | 1632 } |
1600 | 1633 |
1601 /* | 1634 /* |
1602 * Handle one alternative of an | or & operator. | 1635 * Parse one alternative of an | or & operator. |
1603 * Implements the concatenation operator. | 1636 * Implements the concatenation operator. |
1604 */ | 1637 */ |
1605 static char_u * | 1638 static char_u * |
1606 regconcat(flagp) | 1639 regconcat(flagp) |
1607 int *flagp; | 1640 int *flagp; |
1677 first = regnode(NOTHING); | 1710 first = regnode(NOTHING); |
1678 return first; | 1711 return first; |
1679 } | 1712 } |
1680 | 1713 |
1681 /* | 1714 /* |
1682 * regpiece - something followed by possible [*+=] | 1715 * Parse something followed by possible [*+=]. |
1683 * | 1716 * |
1684 * Note that the branching code sequences used for = and the general cases | 1717 * Note that the branching code sequences used for = and the general cases |
1685 * of * and + are somewhat optimized: they use the same NOTHING node as | 1718 * of * and + are somewhat optimized: they use the same NOTHING node as |
1686 * both the endmarker for their branch list and the body of the last branch. | 1719 * both the endmarker for their branch list and the body of the last branch. |
1687 * It might seem that this node could be dispensed with entirely, but the | 1720 * It might seem that this node could be dispensed with entirely, but the |
1757 case '=': lop = BEHIND; break; /* \@<= */ | 1790 case '=': lop = BEHIND; break; /* \@<= */ |
1758 case '!': lop = NOBEHIND; break; /* \@<! */ | 1791 case '!': lop = NOBEHIND; break; /* \@<! */ |
1759 } | 1792 } |
1760 } | 1793 } |
1761 if (lop == END) | 1794 if (lop == END) |
1762 EMSG_M_RET_NULL(_("E59: invalid character after %s@"), | 1795 EMSG2_RET_NULL(_("E59: invalid character after %s@"), |
1763 reg_magic == MAGIC_ALL); | 1796 reg_magic == MAGIC_ALL); |
1764 /* Look behind must match with behind_pos. */ | 1797 /* Look behind must match with behind_pos. */ |
1765 if (lop == BEHIND || lop == NOBEHIND) | 1798 if (lop == BEHIND || lop == NOBEHIND) |
1766 { | 1799 { |
1767 regtail(ret, regnode(BHPOS)); | 1800 regtail(ret, regnode(BHPOS)); |
1791 reginsert_limits(BRACE_LIMITS, minval, maxval, ret); | 1824 reginsert_limits(BRACE_LIMITS, minval, maxval, ret); |
1792 } | 1825 } |
1793 else | 1826 else |
1794 { | 1827 { |
1795 if (num_complex_braces >= 10) | 1828 if (num_complex_braces >= 10) |
1796 EMSG_M_RET_NULL(_("E60: Too many complex %s{...}s"), | 1829 EMSG2_RET_NULL(_("E60: Too many complex %s{...}s"), |
1797 reg_magic == MAGIC_ALL); | 1830 reg_magic == MAGIC_ALL); |
1798 reginsert(BRACE_COMPLEX + num_complex_braces, ret); | 1831 reginsert(BRACE_COMPLEX + num_complex_braces, ret); |
1799 regoptail(ret, regnode(BACK)); | 1832 regoptail(ret, regnode(BACK)); |
1800 regoptail(ret, ret); | 1833 regoptail(ret, ret); |
1801 reginsert_limits(BRACE_LIMITS, minval, maxval, ret); | 1834 reginsert_limits(BRACE_LIMITS, minval, maxval, ret); |
1818 } | 1851 } |
1819 | 1852 |
1820 return ret; | 1853 return ret; |
1821 } | 1854 } |
1822 | 1855 |
1823 /* | 1856 /* When making changes to classchars also change nfa_classcodes. */ |
1824 * regatom - the lowest level | 1857 static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU"; |
1858 static int classcodes[] = { | |
1859 ANY, IDENT, SIDENT, KWORD, SKWORD, | |
1860 FNAME, SFNAME, PRINT, SPRINT, | |
1861 WHITE, NWHITE, DIGIT, NDIGIT, | |
1862 HEX, NHEX, OCTAL, NOCTAL, | |
1863 WORD, NWORD, HEAD, NHEAD, | |
1864 ALPHA, NALPHA, LOWER, NLOWER, | |
1865 UPPER, NUPPER | |
1866 }; | |
1867 | |
1868 /* | |
1869 * Parse the lowest level. | |
1825 * | 1870 * |
1826 * Optimization: gobbles an entire sequence of ordinary characters so that | 1871 * Optimization: gobbles an entire sequence of ordinary characters so that |
1827 * it can turn them into a single node, which is smaller to store and | 1872 * it can turn them into a single node, which is smaller to store and |
1828 * faster to run. Don't do this when one_exactly is set. | 1873 * faster to run. Don't do this when one_exactly is set. |
1829 */ | 1874 */ |
1834 char_u *ret; | 1879 char_u *ret; |
1835 int flags; | 1880 int flags; |
1836 int cpo_lit; /* 'cpoptions' contains 'l' flag */ | 1881 int cpo_lit; /* 'cpoptions' contains 'l' flag */ |
1837 int cpo_bsl; /* 'cpoptions' contains '\' flag */ | 1882 int cpo_bsl; /* 'cpoptions' contains '\' flag */ |
1838 int c; | 1883 int c; |
1839 static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU"; | |
1840 static int classcodes[] = {ANY, IDENT, SIDENT, KWORD, SKWORD, | |
1841 FNAME, SFNAME, PRINT, SPRINT, | |
1842 WHITE, NWHITE, DIGIT, NDIGIT, | |
1843 HEX, NHEX, OCTAL, NOCTAL, | |
1844 WORD, NWORD, HEAD, NHEAD, | |
1845 ALPHA, NALPHA, LOWER, NLOWER, | |
1846 UPPER, NUPPER | |
1847 }; | |
1848 char_u *p; | 1884 char_u *p; |
1849 int extra = 0; | 1885 int extra = 0; |
1850 | 1886 |
1851 *flagp = WORST; /* Tentatively. */ | 1887 *flagp = WORST; /* Tentatively. */ |
1852 cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL; | 1888 cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL; |
2138 | 2174 |
2139 ret = NULL; | 2175 ret = NULL; |
2140 while ((c = getchr()) != ']') | 2176 while ((c = getchr()) != ']') |
2141 { | 2177 { |
2142 if (c == NUL) | 2178 if (c == NUL) |
2143 EMSG_M_RET_NULL(_("E69: Missing ] after %s%%["), | 2179 EMSG2_RET_NULL(_("E69: Missing ] after %s%%["), |
2144 reg_magic == MAGIC_ALL); | 2180 reg_magic == MAGIC_ALL); |
2145 br = regnode(BRANCH); | 2181 br = regnode(BRANCH); |
2146 if (ret == NULL) | 2182 if (ret == NULL) |
2147 ret = br; | 2183 ret = br; |
2148 else | 2184 else |
2154 one_exactly = FALSE; | 2190 one_exactly = FALSE; |
2155 if (lastnode == NULL) | 2191 if (lastnode == NULL) |
2156 return NULL; | 2192 return NULL; |
2157 } | 2193 } |
2158 if (ret == NULL) | 2194 if (ret == NULL) |
2159 EMSG_M_RET_NULL(_("E70: Empty %s%%[]"), | 2195 EMSG2_RET_NULL(_("E70: Empty %s%%[]"), |
2160 reg_magic == MAGIC_ALL); | 2196 reg_magic == MAGIC_ALL); |
2161 lastbranch = regnode(BRANCH); | 2197 lastbranch = regnode(BRANCH); |
2162 br = regnode(NOTHING); | 2198 br = regnode(NOTHING); |
2163 if (ret != JUST_CALC_SIZE) | 2199 if (ret != JUST_CALC_SIZE) |
2164 { | 2200 { |
2198 case 'U': i = gethexchrs(8); break; | 2234 case 'U': i = gethexchrs(8); break; |
2199 default: i = -1; break; | 2235 default: i = -1; break; |
2200 } | 2236 } |
2201 | 2237 |
2202 if (i < 0) | 2238 if (i < 0) |
2203 EMSG_M_RET_NULL( | 2239 EMSG2_RET_NULL( |
2204 _("E678: Invalid character after %s%%[dxouU]"), | 2240 _("E678: Invalid character after %s%%[dxouU]"), |
2205 reg_magic == MAGIC_ALL); | 2241 reg_magic == MAGIC_ALL); |
2206 #ifdef FEAT_MBYTE | 2242 #ifdef FEAT_MBYTE |
2207 if (use_multibytecode(i)) | 2243 if (use_multibytecode(i)) |
2208 ret = regnode(MULTIBYTECODE); | 2244 ret = regnode(MULTIBYTECODE); |
2270 } | 2306 } |
2271 break; | 2307 break; |
2272 } | 2308 } |
2273 } | 2309 } |
2274 | 2310 |
2275 EMSG_M_RET_NULL(_("E71: Invalid character after %s%%"), | 2311 EMSG2_RET_NULL(_("E71: Invalid character after %s%%"), |
2276 reg_magic == MAGIC_ALL); | 2312 reg_magic == MAGIC_ALL); |
2277 } | 2313 } |
2278 } | 2314 } |
2279 break; | 2315 break; |
2280 | 2316 |
2565 skipchr(); /* let's be friends with the lexer again */ | 2601 skipchr(); /* let's be friends with the lexer again */ |
2566 *flagp |= HASWIDTH | SIMPLE; | 2602 *flagp |= HASWIDTH | SIMPLE; |
2567 break; | 2603 break; |
2568 } | 2604 } |
2569 else if (reg_strict) | 2605 else if (reg_strict) |
2570 EMSG_M_RET_NULL(_("E769: Missing ] after %s["), | 2606 EMSG2_RET_NULL(_(e_missingbracket), reg_magic > MAGIC_OFF); |
2571 reg_magic > MAGIC_OFF); | |
2572 } | 2607 } |
2573 /* FALLTHROUGH */ | 2608 /* FALLTHROUGH */ |
2574 | 2609 |
2575 default: | 2610 default: |
2576 { | 2611 { |
2657 || (enc_utf8 && utf_iscomposing(c))); | 2692 || (enc_utf8 && utf_iscomposing(c))); |
2658 } | 2693 } |
2659 #endif | 2694 #endif |
2660 | 2695 |
2661 /* | 2696 /* |
2662 * emit a node | 2697 * Emit a node. |
2663 * Return pointer to generated code. | 2698 * Return pointer to generated code. |
2664 */ | 2699 */ |
2665 static char_u * | 2700 static char_u * |
2666 regnode(op) | 2701 regnode(op) |
2667 int op; | 2702 int op; |
2709 regcode += (*mb_char2bytes)(c, regcode); | 2744 regcode += (*mb_char2bytes)(c, regcode); |
2710 } | 2745 } |
2711 #endif | 2746 #endif |
2712 | 2747 |
2713 /* | 2748 /* |
2714 * reginsert - insert an operator in front of already-emitted operand | 2749 * Insert an operator in front of already-emitted operand |
2715 * | 2750 * |
2716 * Means relocating the operand. | 2751 * Means relocating the operand. |
2717 */ | 2752 */ |
2718 static void | 2753 static void |
2719 reginsert(op, opnd) | 2754 reginsert(op, opnd) |
2740 *place++ = NUL; | 2775 *place++ = NUL; |
2741 *place = NUL; | 2776 *place = NUL; |
2742 } | 2777 } |
2743 | 2778 |
2744 /* | 2779 /* |
2745 * reginsert_limits - insert an operator in front of already-emitted operand. | 2780 * Insert an operator in front of already-emitted operand. |
2746 * The operator has the given limit values as operands. Also set next pointer. | 2781 * The operator has the given limit values as operands. Also set next pointer. |
2747 * | 2782 * |
2748 * Means relocating the operand. | 2783 * Means relocating the operand. |
2749 */ | 2784 */ |
2750 static void | 2785 static void |
2792 *p++ = (char_u) (val & 0377); | 2827 *p++ = (char_u) (val & 0377); |
2793 return p; | 2828 return p; |
2794 } | 2829 } |
2795 | 2830 |
2796 /* | 2831 /* |
2797 * regtail - set the next-pointer at the end of a node chain | 2832 * Set the next-pointer at the end of a node chain. |
2798 */ | 2833 */ |
2799 static void | 2834 static void |
2800 regtail(p, val) | 2835 regtail(p, val) |
2801 char_u *p; | 2836 char_u *p; |
2802 char_u *val; | 2837 char_u *val; |
2833 *(scan + 2) = (char_u) (offset & 0377); | 2868 *(scan + 2) = (char_u) (offset & 0377); |
2834 } | 2869 } |
2835 } | 2870 } |
2836 | 2871 |
2837 /* | 2872 /* |
2838 * regoptail - regtail on item after a BRANCH; nop if none | 2873 * Like regtail, on item after a BRANCH; nop if none. |
2839 */ | 2874 */ |
2840 static void | 2875 static void |
2841 regoptail(p, val) | 2876 regoptail(p, val) |
2842 char_u *p; | 2877 char_u *p; |
2843 char_u *val; | 2878 char_u *val; |
2849 return; | 2884 return; |
2850 regtail(OPERAND(p), val); | 2885 regtail(OPERAND(p), val); |
2851 } | 2886 } |
2852 | 2887 |
2853 /* | 2888 /* |
2854 * getchr() - get the next character from the pattern. We know about | 2889 * Functions for getting characters from the regexp input. |
2855 * magic and such, so therefore we need a lexical analyzer. | 2890 */ |
2856 */ | 2891 |
2857 | |
2858 /* static int curchr; */ | |
2859 static int prevprevchr; | |
2860 static int prevchr; | |
2861 static int nextchr; /* used for ungetchr() */ | |
2862 /* | |
2863 * Note: prevchr is sometimes -1 when we are not at the start, | |
2864 * eg in /[ ^I]^ the pattern was never found even if it existed, because ^ was | |
2865 * taken to be magic -- webb | |
2866 */ | |
2867 static int at_start; /* True when on the first character */ | 2892 static int at_start; /* True when on the first character */ |
2868 static int prev_at_start; /* True when on the second character */ | 2893 static int prev_at_start; /* True when on the second character */ |
2869 | 2894 |
2895 /* | |
2896 * Start parsing at "str". | |
2897 */ | |
2870 static void | 2898 static void |
2871 initchr(str) | 2899 initchr(str) |
2872 char_u *str; | 2900 char_u *str; |
2873 { | 2901 { |
2874 regparse = str; | 2902 regparse = str; |
2876 curchr = prevprevchr = prevchr = nextchr = -1; | 2904 curchr = prevprevchr = prevchr = nextchr = -1; |
2877 at_start = TRUE; | 2905 at_start = TRUE; |
2878 prev_at_start = FALSE; | 2906 prev_at_start = FALSE; |
2879 } | 2907 } |
2880 | 2908 |
2909 /* | |
2910 * Get the next character without advancing. | |
2911 */ | |
2881 static int | 2912 static int |
2882 peekchr() | 2913 peekchr() |
2883 { | 2914 { |
2884 static int after_slash = FALSE; | 2915 static int after_slash = FALSE; |
2885 | 2916 |
3084 at_start = as; | 3115 at_start = as; |
3085 prevchr = pr; | 3116 prevchr = pr; |
3086 prevprevchr = prpr; | 3117 prevprevchr = prpr; |
3087 } | 3118 } |
3088 | 3119 |
3120 /* | |
3121 * Get the next character from the pattern. We know about magic and such, so | |
3122 * therefore we need a lexical analyzer. | |
3123 */ | |
3089 static int | 3124 static int |
3090 getchr() | 3125 getchr() |
3091 { | 3126 { |
3092 int chr = peekchr(); | 3127 int chr = peekchr(); |
3093 | 3128 |
3338 save_se_T save_start[NSUBEXP]; | 3373 save_se_T save_start[NSUBEXP]; |
3339 save_se_T save_end[NSUBEXP]; | 3374 save_se_T save_end[NSUBEXP]; |
3340 } regbehind_T; | 3375 } regbehind_T; |
3341 | 3376 |
3342 static char_u *reg_getline __ARGS((linenr_T lnum)); | 3377 static char_u *reg_getline __ARGS((linenr_T lnum)); |
3343 static long vim_regexec_both __ARGS((char_u *line, colnr_T col, proftime_T *tm)); | 3378 static long bt_regexec_both __ARGS((char_u *line, colnr_T col, proftime_T *tm)); |
3344 static long regtry __ARGS((regprog_T *prog, colnr_T col)); | 3379 static long regtry __ARGS((bt_regprog_T *prog, colnr_T col)); |
3345 static void cleanup_subexpr __ARGS((void)); | 3380 static void cleanup_subexpr __ARGS((void)); |
3346 #ifdef FEAT_SYN_HL | 3381 #ifdef FEAT_SYN_HL |
3347 static void cleanup_zsubexpr __ARGS((void)); | 3382 static void cleanup_zsubexpr __ARGS((void)); |
3348 #endif | 3383 #endif |
3349 static void save_subexpr __ARGS((regbehind_T *bp)); | 3384 static void save_subexpr __ARGS((regbehind_T *bp)); |
3396 static colnr_T ireg_maxcol; | 3431 static colnr_T ireg_maxcol; |
3397 | 3432 |
3398 /* | 3433 /* |
3399 * Sometimes need to save a copy of a line. Since alloc()/free() is very | 3434 * Sometimes need to save a copy of a line. Since alloc()/free() is very |
3400 * slow, we keep one allocated piece of memory and only re-allocate it when | 3435 * slow, we keep one allocated piece of memory and only re-allocate it when |
3401 * it's too small. It's freed in vim_regexec_both() when finished. | 3436 * it's too small. It's freed in bt_regexec_both() when finished. |
3402 */ | 3437 */ |
3403 static char_u *reg_tofree = NULL; | 3438 static char_u *reg_tofree = NULL; |
3404 static unsigned reg_tofreelen; | 3439 static unsigned reg_tofreelen; |
3405 | 3440 |
3406 /* | 3441 /* |
3554 #endif | 3589 #endif |
3555 | 3590 |
3556 /* TRUE if using multi-line regexp. */ | 3591 /* TRUE if using multi-line regexp. */ |
3557 #define REG_MULTI (reg_match == NULL) | 3592 #define REG_MULTI (reg_match == NULL) |
3558 | 3593 |
3594 static int bt_regexec __ARGS((regmatch_T *rmp, char_u *line, colnr_T col)); | |
3595 | |
3559 /* | 3596 /* |
3560 * Match a regexp against a string. | 3597 * Match a regexp against a string. |
3561 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). | 3598 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). |
3562 * Uses curbuf for line count and 'iskeyword'. | 3599 * Uses curbuf for line count and 'iskeyword'. |
3563 * | 3600 * |
3564 * Return TRUE if there is a match, FALSE if not. | 3601 * Return TRUE if there is a match, FALSE if not. |
3565 */ | 3602 */ |
3566 int | 3603 static int |
3567 vim_regexec(rmp, line, col) | 3604 bt_regexec(rmp, line, col) |
3568 regmatch_T *rmp; | 3605 regmatch_T *rmp; |
3569 char_u *line; /* string to match against */ | 3606 char_u *line; /* string to match against */ |
3570 colnr_T col; /* column to start looking for match */ | 3607 colnr_T col; /* column to start looking for match */ |
3571 { | 3608 { |
3572 reg_match = rmp; | 3609 reg_match = rmp; |
3578 ireg_ic = rmp->rm_ic; | 3615 ireg_ic = rmp->rm_ic; |
3579 #ifdef FEAT_MBYTE | 3616 #ifdef FEAT_MBYTE |
3580 ireg_icombine = FALSE; | 3617 ireg_icombine = FALSE; |
3581 #endif | 3618 #endif |
3582 ireg_maxcol = 0; | 3619 ireg_maxcol = 0; |
3583 return (vim_regexec_both(line, col, NULL) != 0); | 3620 return (bt_regexec_both(line, col, NULL) != 0); |
3584 } | 3621 } |
3585 | 3622 |
3586 #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \ | 3623 #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \ |
3587 || defined(FIND_REPLACE_DIALOG) || defined(PROTO) | 3624 || defined(FIND_REPLACE_DIALOG) || defined(PROTO) |
3625 | |
3626 static int bt_regexec_nl __ARGS((regmatch_T *rmp, char_u *line, colnr_T col)); | |
3627 | |
3588 /* | 3628 /* |
3589 * Like vim_regexec(), but consider a "\n" in "line" to be a line break. | 3629 * Like vim_regexec(), but consider a "\n" in "line" to be a line break. |
3590 */ | 3630 */ |
3591 int | 3631 static int |
3592 vim_regexec_nl(rmp, line, col) | 3632 bt_regexec_nl(rmp, line, col) |
3593 regmatch_T *rmp; | 3633 regmatch_T *rmp; |
3594 char_u *line; /* string to match against */ | 3634 char_u *line; /* string to match against */ |
3595 colnr_T col; /* column to start looking for match */ | 3635 colnr_T col; /* column to start looking for match */ |
3596 { | 3636 { |
3597 reg_match = rmp; | 3637 reg_match = rmp; |
3603 ireg_ic = rmp->rm_ic; | 3643 ireg_ic = rmp->rm_ic; |
3604 #ifdef FEAT_MBYTE | 3644 #ifdef FEAT_MBYTE |
3605 ireg_icombine = FALSE; | 3645 ireg_icombine = FALSE; |
3606 #endif | 3646 #endif |
3607 ireg_maxcol = 0; | 3647 ireg_maxcol = 0; |
3608 return (vim_regexec_both(line, col, NULL) != 0); | 3648 return (bt_regexec_both(line, col, NULL) != 0); |
3609 } | 3649 } |
3610 #endif | 3650 #endif |
3651 | |
3652 static long bt_regexec_multi __ARGS((regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum, colnr_T col, proftime_T *tm)); | |
3611 | 3653 |
3612 /* | 3654 /* |
3613 * Match a regexp against multiple lines. | 3655 * Match a regexp against multiple lines. |
3614 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). | 3656 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). |
3615 * Uses curbuf for line count and 'iskeyword'. | 3657 * Uses curbuf for line count and 'iskeyword'. |
3616 * | 3658 * |
3617 * Return zero if there is no match. Return number of lines contained in the | 3659 * Return zero if there is no match. Return number of lines contained in the |
3618 * match otherwise. | 3660 * match otherwise. |
3619 */ | 3661 */ |
3620 long | 3662 static long |
3621 vim_regexec_multi(rmp, win, buf, lnum, col, tm) | 3663 bt_regexec_multi(rmp, win, buf, lnum, col, tm) |
3622 regmmatch_T *rmp; | 3664 regmmatch_T *rmp; |
3623 win_T *win; /* window in which to search or NULL */ | 3665 win_T *win; /* window in which to search or NULL */ |
3624 buf_T *buf; /* buffer in which to search */ | 3666 buf_T *buf; /* buffer in which to search */ |
3625 linenr_T lnum; /* nr of line to start looking for match */ | 3667 linenr_T lnum; /* nr of line to start looking for match */ |
3626 colnr_T col; /* column to start looking for match */ | 3668 colnr_T col; /* column to start looking for match */ |
3639 #ifdef FEAT_MBYTE | 3681 #ifdef FEAT_MBYTE |
3640 ireg_icombine = FALSE; | 3682 ireg_icombine = FALSE; |
3641 #endif | 3683 #endif |
3642 ireg_maxcol = rmp->rmm_maxcol; | 3684 ireg_maxcol = rmp->rmm_maxcol; |
3643 | 3685 |
3644 r = vim_regexec_both(NULL, col, tm); | 3686 r = bt_regexec_both(NULL, col, tm); |
3645 | 3687 |
3646 return r; | 3688 return r; |
3647 } | 3689 } |
3648 | 3690 |
3649 /* | 3691 /* |
3650 * Match a regexp against a string ("line" points to the string) or multiple | 3692 * Match a regexp against a string ("line" points to the string) or multiple |
3651 * lines ("line" is NULL, use reg_getline()). | 3693 * lines ("line" is NULL, use reg_getline()). |
3652 */ | 3694 */ |
3653 static long | 3695 static long |
3654 vim_regexec_both(line, col, tm) | 3696 bt_regexec_both(line, col, tm) |
3655 char_u *line; | 3697 char_u *line; |
3656 colnr_T col; /* column to start looking for match */ | 3698 colnr_T col; /* column to start looking for match */ |
3657 proftime_T *tm UNUSED; /* timeout limit or NULL */ | 3699 proftime_T *tm UNUSED; /* timeout limit or NULL */ |
3658 { | 3700 { |
3659 regprog_T *prog; | 3701 bt_regprog_T *prog; |
3660 char_u *s; | 3702 char_u *s; |
3661 long retval = 0L; | 3703 long retval = 0L; |
3662 | 3704 |
3663 /* Create "regstack" and "backpos" if they are not allocated yet. | 3705 /* Create "regstack" and "backpos" if they are not allocated yet. |
3664 * We allocate *_INITIAL amount of bytes first and then set the grow size | 3706 * We allocate *_INITIAL amount of bytes first and then set the grow size |
3680 backpos.ga_growsize = BACKPOS_INITIAL * 8; | 3722 backpos.ga_growsize = BACKPOS_INITIAL * 8; |
3681 } | 3723 } |
3682 | 3724 |
3683 if (REG_MULTI) | 3725 if (REG_MULTI) |
3684 { | 3726 { |
3685 prog = reg_mmatch->regprog; | 3727 prog = (bt_regprog_T *)reg_mmatch->regprog; |
3686 line = reg_getline((linenr_T)0); | 3728 line = reg_getline((linenr_T)0); |
3687 reg_startpos = reg_mmatch->startpos; | 3729 reg_startpos = reg_mmatch->startpos; |
3688 reg_endpos = reg_mmatch->endpos; | 3730 reg_endpos = reg_mmatch->endpos; |
3689 } | 3731 } |
3690 else | 3732 else |
3691 { | 3733 { |
3692 prog = reg_match->regprog; | 3734 prog = (bt_regprog_T *)reg_match->regprog; |
3693 reg_startp = reg_match->startp; | 3735 reg_startp = reg_match->startp; |
3694 reg_endp = reg_match->endp; | 3736 reg_endp = reg_match->endp; |
3695 } | 3737 } |
3696 | 3738 |
3697 /* Be paranoid... */ | 3739 /* Be paranoid... */ |
3929 * regtry - try match of "prog" with at regline["col"]. | 3971 * regtry - try match of "prog" with at regline["col"]. |
3930 * Returns 0 for failure, number of lines contained in the match otherwise. | 3972 * Returns 0 for failure, number of lines contained in the match otherwise. |
3931 */ | 3973 */ |
3932 static long | 3974 static long |
3933 regtry(prog, col) | 3975 regtry(prog, col) |
3934 regprog_T *prog; | 3976 bt_regprog_T *prog; |
3935 colnr_T col; | 3977 colnr_T col; |
3936 { | 3978 { |
3937 reginput = regline + col; | 3979 reginput = regline + col; |
3938 need_clear_subexpr = TRUE; | 3980 need_clear_subexpr = TRUE; |
3939 #ifdef FEAT_SYN_HL | 3981 #ifdef FEAT_SYN_HL |
4061 #define RA_BREAK 3 /* break inner loop */ | 4103 #define RA_BREAK 3 /* break inner loop */ |
4062 #define RA_MATCH 4 /* successful match */ | 4104 #define RA_MATCH 4 /* successful match */ |
4063 #define RA_NOMATCH 5 /* didn't match */ | 4105 #define RA_NOMATCH 5 /* didn't match */ |
4064 | 4106 |
4065 /* Make "regstack" and "backpos" empty. They are allocated and freed in | 4107 /* Make "regstack" and "backpos" empty. They are allocated and freed in |
4066 * vim_regexec_both() to reduce malloc()/free() calls. */ | 4108 * bt_regexec_both() to reduce malloc()/free() calls. */ |
4067 regstack.ga_len = 0; | 4109 regstack.ga_len = 0; |
4068 backpos.ga_len = 0; | 4110 backpos.ga_len = 0; |
4069 | 4111 |
4070 /* | 4112 /* |
4071 * Repeat until "regstack" is empty. | 4113 * Repeat until "regstack" is empty. |
4072 */ | 4114 */ |
4073 for (;;) | 4115 for (;;) |
4074 { | 4116 { |
4075 /* Some patterns my cause a long time to match, even though they are not | 4117 /* Some patterns may cause a long time to match, even though they are not |
4076 * illegal. E.g., "\([a-z]\+\)\+Q". Allow breaking them with CTRL-C. */ | 4118 * illegal. E.g., "\([a-z]\+\)\+Q". Allow breaking them with CTRL-C. */ |
4077 fast_breakcheck(); | 4119 fast_breakcheck(); |
4078 | 4120 |
4079 #ifdef DEBUG | 4121 #ifdef DEBUG |
4080 if (scan != NULL && regnarrate) | 4122 if (scan != NULL && regnarrate) |
4081 { | 4123 { |
4082 mch_errmsg(regprop(scan)); | 4124 mch_errmsg((char *)regprop(scan)); |
4083 mch_errmsg("(\n"); | 4125 mch_errmsg("(\n"); |
4084 } | 4126 } |
4085 #endif | 4127 #endif |
4086 | 4128 |
4087 /* | 4129 /* |
4098 status = RA_CONT; | 4140 status = RA_CONT; |
4099 | 4141 |
4100 #ifdef DEBUG | 4142 #ifdef DEBUG |
4101 if (regnarrate) | 4143 if (regnarrate) |
4102 { | 4144 { |
4103 mch_errmsg(regprop(scan)); | 4145 mch_errmsg((char *)regprop(scan)); |
4104 mch_errmsg("...\n"); | 4146 mch_errmsg("...\n"); |
4105 # ifdef FEAT_SYN_HL | 4147 # ifdef FEAT_SYN_HL |
4106 if (re_extmatch_in != NULL) | 4148 if (re_extmatch_in != NULL) |
4107 { | 4149 { |
4108 int i; | 4150 int i; |
4110 mch_errmsg(_("External submatches:\n")); | 4152 mch_errmsg(_("External submatches:\n")); |
4111 for (i = 0; i < NSUBEXP; i++) | 4153 for (i = 0; i < NSUBEXP; i++) |
4112 { | 4154 { |
4113 mch_errmsg(" \""); | 4155 mch_errmsg(" \""); |
4114 if (re_extmatch_in->matches[i] != NULL) | 4156 if (re_extmatch_in->matches[i] != NULL) |
4115 mch_errmsg(re_extmatch_in->matches[i]); | 4157 mch_errmsg((char *)re_extmatch_in->matches[i]); |
4116 mch_errmsg("\"\n"); | 4158 mch_errmsg("\"\n"); |
4117 } | 4159 } |
4118 } | 4160 } |
4119 # endif | 4161 # endif |
4120 } | 4162 } |
6089 * Return TRUE if it's wrong. | 6131 * Return TRUE if it's wrong. |
6090 */ | 6132 */ |
6091 static int | 6133 static int |
6092 prog_magic_wrong() | 6134 prog_magic_wrong() |
6093 { | 6135 { |
6094 if (UCHARAT(REG_MULTI | 6136 regprog_T *prog; |
6095 ? reg_mmatch->regprog->program | 6137 |
6096 : reg_match->regprog->program) != REGMAGIC) | 6138 prog = REG_MULTI ? reg_mmatch->regprog : reg_match->regprog; |
6139 if (prog->engine == &nfa_regengine) | |
6140 /* For NFA matcher we don't check the magic */ | |
6141 return FALSE; | |
6142 | |
6143 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC) | |
6097 { | 6144 { |
6098 EMSG(_(e_re_corr)); | 6145 EMSG(_(e_re_corr)); |
6099 return TRUE; | 6146 return TRUE; |
6100 } | 6147 } |
6101 return FALSE; | 6148 return FALSE; |
6316 return val < n; | 6363 return val < n; |
6317 return val == n; | 6364 return val == n; |
6318 } | 6365 } |
6319 | 6366 |
6320 | 6367 |
6321 #ifdef DEBUG | 6368 #ifdef BT_REGEXP_DUMP |
6322 | 6369 |
6323 /* | 6370 /* |
6324 * regdump - dump a regexp onto stdout in vaguely comprehensible form | 6371 * regdump - dump a regexp onto stdout in vaguely comprehensible form |
6325 */ | 6372 */ |
6326 static void | 6373 static void |
6327 regdump(pattern, r) | 6374 regdump(pattern, r) |
6328 char_u *pattern; | 6375 char_u *pattern; |
6329 regprog_T *r; | 6376 bt_regprog_T *r; |
6330 { | 6377 { |
6331 char_u *s; | 6378 char_u *s; |
6332 int op = EXACTLY; /* Arbitrary non-END op. */ | 6379 int op = EXACTLY; /* Arbitrary non-END op. */ |
6333 char_u *next; | 6380 char_u *next; |
6334 char_u *end = NULL; | 6381 char_u *end = NULL; |
6335 | 6382 FILE *f; |
6336 printf("\r\nregcomp(%s):\r\n", pattern); | 6383 |
6384 #ifdef BT_REGEXP_LOG | |
6385 f = fopen("bt_regexp_log.log", "a"); | |
6386 #else | |
6387 f = stdout; | |
6388 #endif | |
6389 if (f == NULL) | |
6390 return; | |
6391 fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n", pattern); | |
6337 | 6392 |
6338 s = r->program + 1; | 6393 s = r->program + 1; |
6339 /* | 6394 /* |
6340 * Loop until we find the END that isn't before a referred next (an END | 6395 * Loop until we find the END that isn't before a referred next (an END |
6341 * can also appear in a NOMATCH operand). | 6396 * can also appear in a NOMATCH operand). |
6342 */ | 6397 */ |
6343 while (op != END || s <= end) | 6398 while (op != END || s <= end) |
6344 { | 6399 { |
6345 op = OP(s); | 6400 op = OP(s); |
6346 printf("%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */ | 6401 fprintf(f, "%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */ |
6347 next = regnext(s); | 6402 next = regnext(s); |
6348 if (next == NULL) /* Next ptr. */ | 6403 if (next == NULL) /* Next ptr. */ |
6349 printf("(0)"); | 6404 fprintf(f, "(0)"); |
6350 else | 6405 else |
6351 printf("(%d)", (int)((s - r->program) + (next - s))); | 6406 fprintf(f, "(%d)", (int)((s - r->program) + (next - s))); |
6352 if (end < next) | 6407 if (end < next) |
6353 end = next; | 6408 end = next; |
6354 if (op == BRACE_LIMITS) | 6409 if (op == BRACE_LIMITS) |
6355 { | 6410 { |
6356 /* Two short ints */ | 6411 /* Two short ints */ |
6357 printf(" minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s)); | 6412 fprintf(f, " minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s)); |
6358 s += 8; | 6413 s += 8; |
6359 } | 6414 } |
6360 s += 3; | 6415 s += 3; |
6361 if (op == ANYOF || op == ANYOF + ADD_NL | 6416 if (op == ANYOF || op == ANYOF + ADD_NL |
6362 || op == ANYBUT || op == ANYBUT + ADD_NL | 6417 || op == ANYBUT || op == ANYBUT + ADD_NL |
6363 || op == EXACTLY) | 6418 || op == EXACTLY) |
6364 { | 6419 { |
6365 /* Literal string, where present. */ | 6420 /* Literal string, where present. */ |
6421 fprintf(f, "\nxxxxxxxxx\n"); | |
6366 while (*s != NUL) | 6422 while (*s != NUL) |
6367 printf("%c", *s++); | 6423 fprintf(f, "%c", *s++); |
6424 fprintf(f, "\nxxxxxxxxx\n"); | |
6368 s++; | 6425 s++; |
6369 } | 6426 } |
6370 printf("\r\n"); | 6427 fprintf(f, "\r\n"); |
6371 } | 6428 } |
6372 | 6429 |
6373 /* Header fields of interest. */ | 6430 /* Header fields of interest. */ |
6374 if (r->regstart != NUL) | 6431 if (r->regstart != NUL) |
6375 printf("start `%s' 0x%x; ", r->regstart < 256 | 6432 fprintf(f, "start `%s' 0x%x; ", r->regstart < 256 |
6376 ? (char *)transchar(r->regstart) | 6433 ? (char *)transchar(r->regstart) |
6377 : "multibyte", r->regstart); | 6434 : "multibyte", r->regstart); |
6378 if (r->reganch) | 6435 if (r->reganch) |
6379 printf("anchored; "); | 6436 fprintf(f, "anchored; "); |
6380 if (r->regmust != NULL) | 6437 if (r->regmust != NULL) |
6381 printf("must have \"%s\"", r->regmust); | 6438 fprintf(f, "must have \"%s\"", r->regmust); |
6382 printf("\r\n"); | 6439 fprintf(f, "\r\n"); |
6383 } | 6440 |
6384 | 6441 #ifdef BT_REGEXP_LOG |
6442 fclose(f); | |
6443 #endif | |
6444 } | |
6445 #endif /* BT_REGEXP_DUMP */ | |
6446 | |
6447 #ifdef DEBUG | |
6385 /* | 6448 /* |
6386 * regprop - printable representation of opcode | 6449 * regprop - printable representation of opcode |
6387 */ | 6450 */ |
6388 static char_u * | 6451 static char_u * |
6389 regprop(op) | 6452 regprop(op) |
6390 char_u *op; | 6453 char_u *op; |
6391 { | 6454 { |
6392 char_u *p; | 6455 char *p; |
6393 static char_u buf[50]; | 6456 static char buf[50]; |
6394 | 6457 |
6395 (void) strcpy(buf, ":"); | 6458 STRCPY(buf, ":"); |
6396 | 6459 |
6397 switch (OP(op)) | 6460 switch ((int) OP(op)) |
6398 { | 6461 { |
6399 case BOL: | 6462 case BOL: |
6400 p = "BOL"; | 6463 p = "BOL"; |
6401 break; | 6464 break; |
6402 case EOL: | 6465 case EOL: |
6759 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op)); | 6822 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op)); |
6760 p = NULL; | 6823 p = NULL; |
6761 break; | 6824 break; |
6762 } | 6825 } |
6763 if (p != NULL) | 6826 if (p != NULL) |
6764 (void) strcat(buf, p); | 6827 STRCAT(buf, p); |
6765 return buf; | 6828 return (char_u *)buf; |
6766 } | 6829 } |
6767 #endif | 6830 #endif /* DEBUG */ |
6768 | 6831 |
6769 #ifdef FEAT_MBYTE | 6832 #ifdef FEAT_MBYTE |
6770 static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3)); | 6833 static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3)); |
6771 | 6834 |
6772 typedef struct | 6835 typedef struct |
7665 } | 7728 } |
7666 | 7729 |
7667 return retval; | 7730 return retval; |
7668 } | 7731 } |
7669 #endif | 7732 #endif |
7733 | |
7734 static regengine_T bt_regengine = | |
7735 { | |
7736 bt_regcomp, | |
7737 bt_regexec, | |
7738 #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \ | |
7739 || defined(FIND_REPLACE_DIALOG) || defined(PROTO) | |
7740 bt_regexec_nl, | |
7741 #endif | |
7742 bt_regexec_multi | |
7743 #ifdef DEBUG | |
7744 ,(char_u *)"" | |
7745 #endif | |
7746 }; | |
7747 | |
7748 | |
7749 #include "regexp_nfa.c" | |
7750 | |
7751 static regengine_T nfa_regengine = | |
7752 { | |
7753 nfa_regcomp, | |
7754 nfa_regexec, | |
7755 #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \ | |
7756 || defined(FIND_REPLACE_DIALOG) || defined(PROTO) | |
7757 nfa_regexec_nl, | |
7758 #endif | |
7759 nfa_regexec_multi | |
7760 #ifdef DEBUG | |
7761 ,(char_u *)"" | |
7762 #endif | |
7763 }; | |
7764 | |
7765 /* Which regexp engine to use? Needed for vim_regcomp(). | |
7766 * Must match with 'regexpengine'. */ | |
7767 static int regexp_engine = 0; | |
7768 #define AUTOMATIC_ENGINE 0 | |
7769 #define BACKTRACKING_ENGINE 1 | |
7770 #define NFA_ENGINE 2 | |
7771 #ifdef DEBUG | |
7772 static char_u regname[][30] = { | |
7773 "AUTOMATIC Regexp Engine", | |
7774 "BACKTACKING Regexp Engine", | |
7775 "NFA Regexp Engine" | |
7776 }; | |
7777 #endif | |
7778 | |
7779 /* | |
7780 * Compile a regular expression into internal code. | |
7781 * Returns the program in allocated memory. Returns NULL for an error. | |
7782 */ | |
7783 regprog_T * | |
7784 vim_regcomp(expr_arg, re_flags) | |
7785 char_u *expr_arg; | |
7786 int re_flags; | |
7787 { | |
7788 regprog_T *prog = NULL; | |
7789 char_u *expr = expr_arg; | |
7790 | |
7791 syntax_error = FALSE; | |
7792 regexp_engine = p_re; | |
7793 | |
7794 /* Check for prefix "\%#=", that sets the regexp engine */ | |
7795 if (STRNCMP(expr, "\\%#=", 4) == 0) | |
7796 { | |
7797 int newengine = expr[4] - '0'; | |
7798 | |
7799 if (newengine == AUTOMATIC_ENGINE | |
7800 || newengine == BACKTRACKING_ENGINE | |
7801 || newengine == NFA_ENGINE) | |
7802 { | |
7803 regexp_engine = expr[4] - '0'; | |
7804 expr += 5; | |
7805 #ifdef DEBUG | |
7806 EMSG3("New regexp mode selected (%d): %s", regexp_engine, | |
7807 regname[newengine]); | |
7808 #endif | |
7809 } | |
7810 else | |
7811 { | |
7812 EMSG(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used ")); | |
7813 regexp_engine = AUTOMATIC_ENGINE; | |
7814 } | |
7815 } | |
7816 #ifdef DEBUG | |
7817 bt_regengine.expr = expr; | |
7818 nfa_regengine.expr = expr; | |
7819 #endif | |
7820 | |
7821 /* | |
7822 * First try the NFA engine, unless backtracking was requested. | |
7823 */ | |
7824 if (regexp_engine != BACKTRACKING_ENGINE) | |
7825 prog = nfa_regengine.regcomp(expr, re_flags); | |
7826 else | |
7827 prog = bt_regengine.regcomp(expr, re_flags); | |
7828 | |
7829 if (prog == NULL) /* error compiling regexp with initial engine */ | |
7830 { | |
7831 #ifdef DEBUG | |
7832 if (regexp_engine != BACKTRACKING_ENGINE) /* debugging log for NFA */ | |
7833 { | |
7834 FILE *f; | |
7835 f = fopen("debug.log", "a"); | |
7836 if (f) | |
7837 { | |
7838 if (!syntax_error) | |
7839 fprintf(f, "NFA engine could not handle \"%s\"\n", expr); | |
7840 else | |
7841 fprintf(f, "Syntax error in \"%s\"\n", expr); | |
7842 fclose(f); | |
7843 } | |
7844 else | |
7845 EMSG("(NFA) Could not open \"debug.log\" to write !!!"); | |
7846 /* | |
7847 if (syntax_error) | |
7848 EMSG("NFA Regexp: Syntax Error !"); | |
7849 */ | |
7850 } | |
7851 #endif | |
7852 /* | |
7853 * If NFA engine failed, then revert to the backtracking engine. | |
7854 * Except when there was a syntax error, which was properly handled by | |
7855 * NFA engine. | |
7856 */ | |
7857 if (regexp_engine == AUTOMATIC_ENGINE) | |
7858 if (!syntax_error) | |
7859 prog = bt_regengine.regcomp(expr, re_flags); | |
7860 | |
7861 } /* endif prog==NULL */ | |
7862 | |
7863 | |
7864 return prog; | |
7865 } | |
7866 | |
7867 /* | |
7868 * Match a regexp against a string. | |
7869 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). | |
7870 * Uses curbuf for line count and 'iskeyword'. | |
7871 * | |
7872 * Return TRUE if there is a match, FALSE if not. | |
7873 */ | |
7874 int | |
7875 vim_regexec(rmp, line, col) | |
7876 regmatch_T *rmp; | |
7877 char_u *line; /* string to match against */ | |
7878 colnr_T col; /* column to start looking for match */ | |
7879 { | |
7880 return rmp->regprog->engine->regexec(rmp, line, col); | |
7881 } | |
7882 | |
7883 #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \ | |
7884 || defined(FIND_REPLACE_DIALOG) || defined(PROTO) | |
7885 /* | |
7886 * Like vim_regexec(), but consider a "\n" in "line" to be a line break. | |
7887 */ | |
7888 int | |
7889 vim_regexec_nl(rmp, line, col) | |
7890 regmatch_T *rmp; | |
7891 char_u *line; | |
7892 colnr_T col; | |
7893 { | |
7894 return rmp->regprog->engine->regexec_nl(rmp, line, col); | |
7895 } | |
7896 #endif | |
7897 | |
7898 /* | |
7899 * Match a regexp against multiple lines. | |
7900 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). | |
7901 * Uses curbuf for line count and 'iskeyword'. | |
7902 * | |
7903 * Return zero if there is no match. Return number of lines contained in the | |
7904 * match otherwise. | |
7905 */ | |
7906 long | |
7907 vim_regexec_multi(rmp, win, buf, lnum, col, tm) | |
7908 regmmatch_T *rmp; | |
7909 win_T *win; /* window in which to search or NULL */ | |
7910 buf_T *buf; /* buffer in which to search */ | |
7911 linenr_T lnum; /* nr of line to start looking for match */ | |
7912 colnr_T col; /* column to start looking for match */ | |
7913 proftime_T *tm; /* timeout limit or NULL */ | |
7914 { | |
7915 return rmp->regprog->engine->regexec_multi(rmp, win, buf, lnum, col, tm); | |
7916 } |