Mercurial > vim
comparison src/regexp_nfa.c @ 4682:2f51ee8825db v7.3.1088
updated for version 7.3.1088
Problem: New regexp engine: \@<= and \@<! are not implemented.
Solution: Implement look-behind matching. Fix off-by-one error in old
regexp engine.
author | Bram Moolenaar <bram@vim.org> |
---|---|
date | Sat, 01 Jun 2013 19:54:43 +0200 |
parents | 4d92b873acef |
children | 8db697ae406a |
comparison
equal
deleted
inserted
replaced
4681:2eb30f341e8d | 4682:2f51ee8825db |
---|---|
54 NFA_ZSTART, /* Used for \zs */ | 54 NFA_ZSTART, /* Used for \zs */ |
55 NFA_ZEND, /* Used for \ze */ | 55 NFA_ZEND, /* Used for \ze */ |
56 NFA_NOPEN, /* Start of subexpression marked with \%( */ | 56 NFA_NOPEN, /* Start of subexpression marked with \%( */ |
57 NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */ | 57 NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */ |
58 NFA_START_INVISIBLE, | 58 NFA_START_INVISIBLE, |
59 NFA_START_INVISIBLE_BEFORE, | |
59 NFA_END_INVISIBLE, | 60 NFA_END_INVISIBLE, |
60 NFA_COMPOSING, /* Next nodes in NFA are part of the | 61 NFA_COMPOSING, /* Next nodes in NFA are part of the |
61 composing multibyte char */ | 62 composing multibyte char */ |
62 NFA_END_COMPOSING, /* End of a composing char in the NFA */ | 63 NFA_END_COMPOSING, /* End of a composing char in the NFA */ |
63 | 64 |
1367 EMIT(NFA_CONCAT); | 1368 EMIT(NFA_CONCAT); |
1368 skipchr(); /* skip the \+ */ | 1369 skipchr(); /* skip the \+ */ |
1369 break; | 1370 break; |
1370 | 1371 |
1371 case Magic('@'): | 1372 case Magic('@'): |
1373 c2 = getdecchrs(); | |
1372 op = no_Magic(getchr()); | 1374 op = no_Magic(getchr()); |
1375 i = 0; | |
1373 switch(op) | 1376 switch(op) |
1374 { | 1377 { |
1375 case '=': | 1378 case '=': |
1376 EMIT(NFA_PREV_ATOM_NO_WIDTH); | 1379 /* \@= */ |
1380 i = NFA_PREV_ATOM_NO_WIDTH; | |
1377 break; | 1381 break; |
1378 case '!': | 1382 case '!': |
1379 EMIT(NFA_PREV_ATOM_NO_WIDTH_NEG); | 1383 /* \@! */ |
1384 i = NFA_PREV_ATOM_NO_WIDTH_NEG; | |
1380 break; | 1385 break; |
1381 case '0': | |
1382 case '1': | |
1383 case '2': | |
1384 case '3': | |
1385 case '4': | |
1386 case '5': | |
1387 case '6': | |
1388 case '7': | |
1389 case '8': | |
1390 case '9': | |
1391 case '<': | 1386 case '<': |
1387 op = no_Magic(getchr()); | |
1388 if (op == '=') | |
1389 /* \@<= */ | |
1390 i = NFA_PREV_ATOM_JUST_BEFORE; | |
1391 else if (op == '!') | |
1392 /* \@<! */ | |
1393 i = NFA_PREV_ATOM_JUST_BEFORE_NEG; | |
1394 break; | |
1392 case '>': | 1395 case '>': |
1393 /* Not supported yet */ | 1396 /* \@> Not supported yet */ |
1397 /* i = NFA_PREV_ATOM_LIKE_PATTERN; */ | |
1394 return FAIL; | 1398 return FAIL; |
1395 default: | 1399 } |
1396 syntax_error = TRUE; | 1400 if (i == 0) |
1397 EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op); | 1401 { |
1398 return FAIL; | 1402 syntax_error = TRUE; |
1399 } | 1403 EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op); |
1404 return FAIL; | |
1405 } | |
1406 EMIT(i); | |
1407 if (i == NFA_PREV_ATOM_JUST_BEFORE | |
1408 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG) | |
1409 EMIT(c2); | |
1400 break; | 1410 break; |
1401 | 1411 |
1402 case Magic('?'): | 1412 case Magic('?'): |
1403 case Magic('='): | 1413 case Magic('='): |
1404 EMIT(NFA_QUEST); | 1414 EMIT(NFA_QUEST); |
1732 | 1742 |
1733 case NFA_PREV_ATOM_NO_WIDTH: | 1743 case NFA_PREV_ATOM_NO_WIDTH: |
1734 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break; | 1744 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break; |
1735 case NFA_PREV_ATOM_NO_WIDTH_NEG: | 1745 case NFA_PREV_ATOM_NO_WIDTH_NEG: |
1736 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break; | 1746 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break; |
1747 case NFA_PREV_ATOM_JUST_BEFORE: | |
1748 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break; | |
1749 case NFA_PREV_ATOM_JUST_BEFORE_NEG: | |
1750 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break; | |
1737 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break; | 1751 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break; |
1738 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break; | 1752 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break; |
1739 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break; | 1753 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break; |
1754 case NFA_START_INVISIBLE_BEFORE: | |
1755 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break; | |
1740 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break; | 1756 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break; |
1741 | 1757 |
1742 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break; | 1758 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break; |
1743 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break; | 1759 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break; |
1744 | 1760 |
2235 } | 2251 } |
2236 | 2252 |
2237 if (nfa_calc_size == FALSE) | 2253 if (nfa_calc_size == FALSE) |
2238 { | 2254 { |
2239 /* Allocate space for the stack. Max states on the stack : nstate */ | 2255 /* Allocate space for the stack. Max states on the stack : nstate */ |
2240 stack = (Frag_T *) lalloc((nstate + 1) * sizeof(Frag_T), TRUE); | 2256 stack = (Frag_T *)lalloc((nstate + 1) * sizeof(Frag_T), TRUE); |
2241 stackp = stack; | 2257 stackp = stack; |
2242 stack_end = stack + (nstate + 1); | 2258 stack_end = stack + (nstate + 1); |
2243 } | 2259 } |
2244 | 2260 |
2245 for (p = postfix; p < end; ++p) | 2261 for (p = postfix; p < end; ++p) |
2368 PUSH(frag(s, list1(&s->out))); | 2384 PUSH(frag(s, list1(&s->out))); |
2369 break; | 2385 break; |
2370 | 2386 |
2371 case NFA_PREV_ATOM_NO_WIDTH: | 2387 case NFA_PREV_ATOM_NO_WIDTH: |
2372 case NFA_PREV_ATOM_NO_WIDTH_NEG: | 2388 case NFA_PREV_ATOM_NO_WIDTH_NEG: |
2389 case NFA_PREV_ATOM_JUST_BEFORE: | |
2390 case NFA_PREV_ATOM_JUST_BEFORE_NEG: | |
2373 /* The \@= operator: match the preceding atom with zero width. | 2391 /* The \@= operator: match the preceding atom with zero width. |
2374 * The \@! operator: no match for the preceding atom. | 2392 * The \@! operator: no match for the preceding atom. |
2393 * The \@<= operator: match for the preceding atom. | |
2394 * The \@<! operator: no match for the preceding atom. | |
2375 * Surrounds the preceding atom with START_INVISIBLE and | 2395 * Surrounds the preceding atom with START_INVISIBLE and |
2376 * END_INVISIBLE, similarly to MOPEN. */ | 2396 * END_INVISIBLE, similarly to MOPEN. */ |
2377 | 2397 |
2378 if (nfa_calc_size == TRUE) | 2398 if (nfa_calc_size == TRUE) |
2379 { | 2399 { |
2387 patch(e.out, s1); | 2407 patch(e.out, s1); |
2388 | 2408 |
2389 s = new_state(NFA_START_INVISIBLE, e.start, s1); | 2409 s = new_state(NFA_START_INVISIBLE, e.start, s1); |
2390 if (s == NULL) | 2410 if (s == NULL) |
2391 goto theend; | 2411 goto theend; |
2392 if (*p == NFA_PREV_ATOM_NO_WIDTH_NEG) | 2412 if (*p == NFA_PREV_ATOM_NO_WIDTH_NEG |
2413 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG) | |
2393 { | 2414 { |
2394 s->negated = TRUE; | 2415 s->negated = TRUE; |
2395 s1->negated = TRUE; | 2416 s1->negated = TRUE; |
2417 } | |
2418 if (*p == NFA_PREV_ATOM_JUST_BEFORE | |
2419 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG) | |
2420 { | |
2421 s->val = *++p; /* get the count */ | |
2422 ++s->c; /* NFA_START_INVISIBLE -> NFA_START_INVISIBLE_BEFORE */ | |
2396 } | 2423 } |
2397 | 2424 |
2398 PUSH(frag(s, list1(&s1->out))); | 2425 PUSH(frag(s, list1(&s1->out))); |
2399 break; | 2426 break; |
2400 | 2427 |
3305 if (op == 1) return pos > val; | 3332 if (op == 1) return pos > val; |
3306 if (op == 2) return pos < val; | 3333 if (op == 2) return pos < val; |
3307 return val == pos; | 3334 return val == pos; |
3308 } | 3335 } |
3309 | 3336 |
3310 static int nfa_regmatch __ARGS((nfa_state_T *start, regsub_T *submatch, regsub_T *m)); | 3337 static int nfa_regmatch __ARGS((nfa_state_T *start, regsub_T *submatch, regsub_T *m, save_se_T *endp)); |
3311 | 3338 |
3312 /* | 3339 /* |
3313 * Main matching routine. | 3340 * Main matching routine. |
3314 * | 3341 * |
3315 * Run NFA to determine whether it matches reginput. | 3342 * Run NFA to determine whether it matches reginput. |
3316 * | 3343 * |
3344 * When "endp" is not NULL it is a required end-of-match position. | |
3345 * | |
3317 * Return TRUE if there is a match, FALSE otherwise. | 3346 * Return TRUE if there is a match, FALSE otherwise. |
3318 * Note: Caller must ensure that: start != NULL. | 3347 * Note: Caller must ensure that: start != NULL. |
3319 */ | 3348 */ |
3320 static int | 3349 static int |
3321 nfa_regmatch(start, submatch, m) | 3350 nfa_regmatch(start, submatch, m, endp) |
3322 nfa_state_T *start; | 3351 nfa_state_T *start; |
3323 regsub_T *submatch; | 3352 regsub_T *submatch; |
3324 regsub_T *m; | 3353 regsub_T *m; |
3354 save_se_T *endp; | |
3325 { | 3355 { |
3326 int result; | 3356 int result; |
3327 int size = 0; | 3357 int size = 0; |
3328 int flag = 0; | 3358 int flag = 0; |
3329 int go_to_nextline = FALSE; | 3359 int go_to_nextline = FALSE; |
3530 clen = 0; | 3560 clen = 0; |
3531 goto nextchar; | 3561 goto nextchar; |
3532 } | 3562 } |
3533 | 3563 |
3534 case NFA_END_INVISIBLE: | 3564 case NFA_END_INVISIBLE: |
3535 /* This is only encountered after a NFA_START_INVISIBLE node. | 3565 /* This is only encountered after a NFA_START_INVISIBLE or |
3536 * They surround a zero-width group, used with "\@=" and "\&". | 3566 * NFA_START_INVISIBLE_BEFORE node. |
3567 * They surround a zero-width group, used with "\@=", "\&", | |
3568 * "\@!", "\@<=" and "\@<!". | |
3537 * If we got here, it means that the current "invisible" group | 3569 * If we got here, it means that the current "invisible" group |
3538 * finished successfully, so return control to the parent | 3570 * finished successfully, so return control to the parent |
3539 * nfa_regmatch(). Submatches are stored in *m, and used in | 3571 * nfa_regmatch(). Submatches are stored in *m, and used in |
3540 * the parent call. */ | 3572 * the parent call. */ |
3541 if (start->c == NFA_MOPEN + 0) | 3573 if (start->c == NFA_MOPEN + 0) |
3574 /* TODO: do we ever get here? */ | |
3542 addstate_here(thislist, t->state->out, &t->sub, &listidx); | 3575 addstate_here(thislist, t->state->out, &t->sub, &listidx); |
3543 else | 3576 else |
3544 { | 3577 { |
3578 #ifdef ENABLE_LOG | |
3579 if (endp != NULL) | |
3580 { | |
3581 if (REG_MULTI) | |
3582 fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n", | |
3583 (int)reglnum, | |
3584 (int)endp->se_u.pos.lnum, | |
3585 (int)(reginput - regline), | |
3586 endp->se_u.pos.col); | |
3587 else | |
3588 fprintf(log_fd, "Current col: %d, endp col: %d\n", | |
3589 (int)(reginput - regline), | |
3590 (int)(endp->se_u.ptr - reginput)); | |
3591 } | |
3592 #endif | |
3593 /* It's only a match if it ends at "endp" */ | |
3594 if (endp != NULL && (REG_MULTI | |
3595 ? (reglnum != endp->se_u.pos.lnum | |
3596 || (int)(reginput - regline) | |
3597 != endp->se_u.pos.col) | |
3598 : reginput != endp->se_u.ptr)) | |
3599 break; | |
3600 | |
3545 /* do not set submatches for \@! */ | 3601 /* do not set submatches for \@! */ |
3546 if (!t->state->negated) | 3602 if (!t->state->negated) |
3547 /* TODO: only copy positions in use. */ | 3603 /* TODO: only copy positions in use. */ |
3548 *m = t->sub; | 3604 *m = t->sub; |
3549 nfa_match = TRUE; | 3605 nfa_match = TRUE; |
3550 } | 3606 } |
3551 break; | 3607 break; |
3552 | 3608 |
3553 case NFA_START_INVISIBLE: | 3609 case NFA_START_INVISIBLE: |
3610 case NFA_START_INVISIBLE_BEFORE: | |
3554 { | 3611 { |
3555 char_u *save_reginput = reginput; | 3612 char_u *save_reginput = reginput; |
3556 char_u *save_regline = regline; | 3613 char_u *save_regline = regline; |
3557 int save_reglnum = reglnum; | 3614 int save_reglnum = reglnum; |
3558 int save_nfa_match = nfa_match; | 3615 int save_nfa_match = nfa_match; |
3616 save_se_T endpos; | |
3617 save_se_T *endposp = NULL; | |
3618 | |
3619 if (t->state->c == NFA_START_INVISIBLE_BEFORE) | |
3620 { | |
3621 /* The recursive match must end at the current position. */ | |
3622 endposp = &endpos; | |
3623 if (REG_MULTI) | |
3624 { | |
3625 endpos.se_u.pos.col = (int)(reginput - regline); | |
3626 endpos.se_u.pos.lnum = reglnum; | |
3627 } | |
3628 else | |
3629 endpos.se_u.ptr = reginput; | |
3630 | |
3631 /* Go back the specified number of bytes, or as far as the | |
3632 * start of the previous line, to try matching "\@<=" or | |
3633 * not matching "\@<!". */ | |
3634 if (t->state->val <= 0) | |
3635 { | |
3636 if (REG_MULTI) | |
3637 { | |
3638 regline = reg_getline(--reglnum); | |
3639 if (regline == NULL) | |
3640 /* can't go before the first line */ | |
3641 regline = reg_getline(++reglnum); | |
3642 } | |
3643 reginput = regline; | |
3644 } | |
3645 else | |
3646 { | |
3647 if (REG_MULTI | |
3648 && (int)(reginput - regline) < t->state->val) | |
3649 { | |
3650 /* Not enough bytes in this line, go to end of | |
3651 * previous line. */ | |
3652 regline = reg_getline(--reglnum); | |
3653 if (regline == NULL) | |
3654 { | |
3655 /* can't go before the first line */ | |
3656 regline = reg_getline(++reglnum); | |
3657 reginput = regline; | |
3658 } | |
3659 else | |
3660 reginput = regline + STRLEN(regline); | |
3661 } | |
3662 if ((int)(reginput - regline) >= t->state->val) | |
3663 { | |
3664 reginput -= t->state->val; | |
3665 #ifdef FEAT_MBYTE | |
3666 if (has_mbyte) | |
3667 reginput -= mb_head_off(regline, reginput); | |
3668 #endif | |
3669 } | |
3670 else | |
3671 reginput = regline; | |
3672 } | |
3673 } | |
3559 | 3674 |
3560 /* Call nfa_regmatch() to check if the current concat matches | 3675 /* Call nfa_regmatch() to check if the current concat matches |
3561 * at this position. The concat ends with the node | 3676 * at this position. The concat ends with the node |
3562 * NFA_END_INVISIBLE */ | 3677 * NFA_END_INVISIBLE */ |
3563 if (listids == NULL) | 3678 if (listids == NULL) |
3577 /* Have to clear the listid field of the NFA nodes, so that | 3692 /* Have to clear the listid field of the NFA nodes, so that |
3578 * nfa_regmatch() and addstate() can run properly after | 3693 * nfa_regmatch() and addstate() can run properly after |
3579 * recursion. */ | 3694 * recursion. */ |
3580 nfa_save_listids(start, listids); | 3695 nfa_save_listids(start, listids); |
3581 nfa_set_null_listids(start); | 3696 nfa_set_null_listids(start); |
3582 result = nfa_regmatch(t->state->out, submatch, m); | 3697 result = nfa_regmatch(t->state->out, submatch, m, endposp); |
3583 nfa_set_neg_listids(start); | 3698 nfa_set_neg_listids(start); |
3584 nfa_restore_listids(start, listids); | 3699 nfa_restore_listids(start, listids); |
3585 | 3700 |
3586 /* restore position in input text */ | 3701 /* restore position in input text */ |
3587 reginput = save_reginput; | 3702 reginput = save_reginput; |
4118 * start state to the list of states. | 4233 * start state to the list of states. |
4119 * The first found match is the leftmost one, thus the order of states | 4234 * The first found match is the leftmost one, thus the order of states |
4120 * matters! | 4235 * matters! |
4121 * Do not add the start state in recursive calls of nfa_regmatch(), | 4236 * Do not add the start state in recursive calls of nfa_regmatch(), |
4122 * because recursive calls should only start in the first position. | 4237 * because recursive calls should only start in the first position. |
4238 * Unless "endp" is not NULL, then we match the end position. | |
4123 * Also don't start a match past the first line. */ | 4239 * Also don't start a match past the first line. */ |
4124 if (nfa_match == FALSE && start->c == NFA_MOPEN + 0 | 4240 if (nfa_match == FALSE |
4125 && reglnum == 0 && clen != 0 | 4241 && ((start->c == NFA_MOPEN + 0 |
4126 && (ireg_maxcol == 0 | 4242 && reglnum == 0 |
4127 || (colnr_T)(reginput - regline) < ireg_maxcol)) | 4243 && clen != 0 |
4244 && (ireg_maxcol == 0 | |
4245 || (colnr_T)(reginput - regline) < ireg_maxcol)) | |
4246 || (endp != NULL | |
4247 && (REG_MULTI | |
4248 ? (reglnum < endp->se_u.pos.lnum | |
4249 || (reglnum == endp->se_u.pos.lnum | |
4250 && (int)(reginput - regline) | |
4251 < endp->se_u.pos.col)) | |
4252 : reginput < endp->se_u.ptr)))) | |
4128 { | 4253 { |
4129 #ifdef ENABLE_LOG | 4254 #ifdef ENABLE_LOG |
4130 fprintf(log_fd, "(---) STARTSTATE\n"); | 4255 fprintf(log_fd, "(---) STARTSTATE\n"); |
4131 #endif | 4256 #endif |
4132 addstate(nextlist, start, m, clen); | 4257 addstate(nextlist, start, m, clen); |
4146 nextchar: | 4271 nextchar: |
4147 /* Advance to the next character, or advance to the next line, or | 4272 /* Advance to the next character, or advance to the next line, or |
4148 * finish. */ | 4273 * finish. */ |
4149 if (clen != 0) | 4274 if (clen != 0) |
4150 reginput += clen; | 4275 reginput += clen; |
4151 else if (go_to_nextline) | 4276 else if (go_to_nextline || (endp != NULL && REG_MULTI |
4277 && reglnum < endp->se_u.pos.lnum)) | |
4152 reg_nextline(); | 4278 reg_nextline(); |
4153 else | 4279 else |
4154 break; | 4280 break; |
4155 } | 4281 } |
4156 | 4282 |
4223 vim_memset(m.list.line, 0, sizeof(struct linepos) * nfa_nsubexpr); | 4349 vim_memset(m.list.line, 0, sizeof(struct linepos) * nfa_nsubexpr); |
4224 } | 4350 } |
4225 sub.in_use = 0; | 4351 sub.in_use = 0; |
4226 m.in_use = 0; | 4352 m.in_use = 0; |
4227 | 4353 |
4228 if (nfa_regmatch(start, &sub, &m) == FALSE) | 4354 if (nfa_regmatch(start, &sub, &m, NULL) == FALSE) |
4229 return 0; | 4355 return 0; |
4230 | 4356 |
4231 cleanup_subexpr(); | 4357 cleanup_subexpr(); |
4232 if (REG_MULTI) | 4358 if (REG_MULTI) |
4233 { | 4359 { |