comparison src/regexp_nfa.c @ 4682:2f51ee8825db v7.3.1088

updated for version 7.3.1088 Problem: New regexp engine: \@<= and \@<! are not implemented. Solution: Implement look-behind matching. Fix off-by-one error in old regexp engine.
author Bram Moolenaar <bram@vim.org>
date Sat, 01 Jun 2013 19:54:43 +0200
parents 4d92b873acef
children 8db697ae406a
comparison
equal deleted inserted replaced
4681:2eb30f341e8d 4682:2f51ee8825db
54 NFA_ZSTART, /* Used for \zs */ 54 NFA_ZSTART, /* Used for \zs */
55 NFA_ZEND, /* Used for \ze */ 55 NFA_ZEND, /* Used for \ze */
56 NFA_NOPEN, /* Start of subexpression marked with \%( */ 56 NFA_NOPEN, /* Start of subexpression marked with \%( */
57 NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */ 57 NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */
58 NFA_START_INVISIBLE, 58 NFA_START_INVISIBLE,
59 NFA_START_INVISIBLE_BEFORE,
59 NFA_END_INVISIBLE, 60 NFA_END_INVISIBLE,
60 NFA_COMPOSING, /* Next nodes in NFA are part of the 61 NFA_COMPOSING, /* Next nodes in NFA are part of the
61 composing multibyte char */ 62 composing multibyte char */
62 NFA_END_COMPOSING, /* End of a composing char in the NFA */ 63 NFA_END_COMPOSING, /* End of a composing char in the NFA */
63 64
1367 EMIT(NFA_CONCAT); 1368 EMIT(NFA_CONCAT);
1368 skipchr(); /* skip the \+ */ 1369 skipchr(); /* skip the \+ */
1369 break; 1370 break;
1370 1371
1371 case Magic('@'): 1372 case Magic('@'):
1373 c2 = getdecchrs();
1372 op = no_Magic(getchr()); 1374 op = no_Magic(getchr());
1375 i = 0;
1373 switch(op) 1376 switch(op)
1374 { 1377 {
1375 case '=': 1378 case '=':
1376 EMIT(NFA_PREV_ATOM_NO_WIDTH); 1379 /* \@= */
1380 i = NFA_PREV_ATOM_NO_WIDTH;
1377 break; 1381 break;
1378 case '!': 1382 case '!':
1379 EMIT(NFA_PREV_ATOM_NO_WIDTH_NEG); 1383 /* \@! */
1384 i = NFA_PREV_ATOM_NO_WIDTH_NEG;
1380 break; 1385 break;
1381 case '0':
1382 case '1':
1383 case '2':
1384 case '3':
1385 case '4':
1386 case '5':
1387 case '6':
1388 case '7':
1389 case '8':
1390 case '9':
1391 case '<': 1386 case '<':
1387 op = no_Magic(getchr());
1388 if (op == '=')
1389 /* \@<= */
1390 i = NFA_PREV_ATOM_JUST_BEFORE;
1391 else if (op == '!')
1392 /* \@<! */
1393 i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
1394 break;
1392 case '>': 1395 case '>':
1393 /* Not supported yet */ 1396 /* \@> Not supported yet */
1397 /* i = NFA_PREV_ATOM_LIKE_PATTERN; */
1394 return FAIL; 1398 return FAIL;
1395 default: 1399 }
1396 syntax_error = TRUE; 1400 if (i == 0)
1397 EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op); 1401 {
1398 return FAIL; 1402 syntax_error = TRUE;
1399 } 1403 EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op);
1404 return FAIL;
1405 }
1406 EMIT(i);
1407 if (i == NFA_PREV_ATOM_JUST_BEFORE
1408 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
1409 EMIT(c2);
1400 break; 1410 break;
1401 1411
1402 case Magic('?'): 1412 case Magic('?'):
1403 case Magic('='): 1413 case Magic('='):
1404 EMIT(NFA_QUEST); 1414 EMIT(NFA_QUEST);
1732 1742
1733 case NFA_PREV_ATOM_NO_WIDTH: 1743 case NFA_PREV_ATOM_NO_WIDTH:
1734 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break; 1744 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
1735 case NFA_PREV_ATOM_NO_WIDTH_NEG: 1745 case NFA_PREV_ATOM_NO_WIDTH_NEG:
1736 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break; 1746 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
1747 case NFA_PREV_ATOM_JUST_BEFORE:
1748 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
1749 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
1750 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
1737 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break; 1751 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
1738 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break; 1752 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
1739 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break; 1753 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
1754 case NFA_START_INVISIBLE_BEFORE:
1755 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
1740 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break; 1756 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
1741 1757
1742 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break; 1758 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
1743 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break; 1759 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
1744 1760
2235 } 2251 }
2236 2252
2237 if (nfa_calc_size == FALSE) 2253 if (nfa_calc_size == FALSE)
2238 { 2254 {
2239 /* Allocate space for the stack. Max states on the stack : nstate */ 2255 /* Allocate space for the stack. Max states on the stack : nstate */
2240 stack = (Frag_T *) lalloc((nstate + 1) * sizeof(Frag_T), TRUE); 2256 stack = (Frag_T *)lalloc((nstate + 1) * sizeof(Frag_T), TRUE);
2241 stackp = stack; 2257 stackp = stack;
2242 stack_end = stack + (nstate + 1); 2258 stack_end = stack + (nstate + 1);
2243 } 2259 }
2244 2260
2245 for (p = postfix; p < end; ++p) 2261 for (p = postfix; p < end; ++p)
2368 PUSH(frag(s, list1(&s->out))); 2384 PUSH(frag(s, list1(&s->out)));
2369 break; 2385 break;
2370 2386
2371 case NFA_PREV_ATOM_NO_WIDTH: 2387 case NFA_PREV_ATOM_NO_WIDTH:
2372 case NFA_PREV_ATOM_NO_WIDTH_NEG: 2388 case NFA_PREV_ATOM_NO_WIDTH_NEG:
2389 case NFA_PREV_ATOM_JUST_BEFORE:
2390 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2373 /* The \@= operator: match the preceding atom with zero width. 2391 /* The \@= operator: match the preceding atom with zero width.
2374 * The \@! operator: no match for the preceding atom. 2392 * The \@! operator: no match for the preceding atom.
2393 * The \@<= operator: match for the preceding atom.
2394 * The \@<! operator: no match for the preceding atom.
2375 * Surrounds the preceding atom with START_INVISIBLE and 2395 * Surrounds the preceding atom with START_INVISIBLE and
2376 * END_INVISIBLE, similarly to MOPEN. */ 2396 * END_INVISIBLE, similarly to MOPEN. */
2377 2397
2378 if (nfa_calc_size == TRUE) 2398 if (nfa_calc_size == TRUE)
2379 { 2399 {
2387 patch(e.out, s1); 2407 patch(e.out, s1);
2388 2408
2389 s = new_state(NFA_START_INVISIBLE, e.start, s1); 2409 s = new_state(NFA_START_INVISIBLE, e.start, s1);
2390 if (s == NULL) 2410 if (s == NULL)
2391 goto theend; 2411 goto theend;
2392 if (*p == NFA_PREV_ATOM_NO_WIDTH_NEG) 2412 if (*p == NFA_PREV_ATOM_NO_WIDTH_NEG
2413 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG)
2393 { 2414 {
2394 s->negated = TRUE; 2415 s->negated = TRUE;
2395 s1->negated = TRUE; 2416 s1->negated = TRUE;
2417 }
2418 if (*p == NFA_PREV_ATOM_JUST_BEFORE
2419 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG)
2420 {
2421 s->val = *++p; /* get the count */
2422 ++s->c; /* NFA_START_INVISIBLE -> NFA_START_INVISIBLE_BEFORE */
2396 } 2423 }
2397 2424
2398 PUSH(frag(s, list1(&s1->out))); 2425 PUSH(frag(s, list1(&s1->out)));
2399 break; 2426 break;
2400 2427
3305 if (op == 1) return pos > val; 3332 if (op == 1) return pos > val;
3306 if (op == 2) return pos < val; 3333 if (op == 2) return pos < val;
3307 return val == pos; 3334 return val == pos;
3308 } 3335 }
3309 3336
3310 static int nfa_regmatch __ARGS((nfa_state_T *start, regsub_T *submatch, regsub_T *m)); 3337 static int nfa_regmatch __ARGS((nfa_state_T *start, regsub_T *submatch, regsub_T *m, save_se_T *endp));
3311 3338
3312 /* 3339 /*
3313 * Main matching routine. 3340 * Main matching routine.
3314 * 3341 *
3315 * Run NFA to determine whether it matches reginput. 3342 * Run NFA to determine whether it matches reginput.
3316 * 3343 *
3344 * When "endp" is not NULL it is a required end-of-match position.
3345 *
3317 * Return TRUE if there is a match, FALSE otherwise. 3346 * Return TRUE if there is a match, FALSE otherwise.
3318 * Note: Caller must ensure that: start != NULL. 3347 * Note: Caller must ensure that: start != NULL.
3319 */ 3348 */
3320 static int 3349 static int
3321 nfa_regmatch(start, submatch, m) 3350 nfa_regmatch(start, submatch, m, endp)
3322 nfa_state_T *start; 3351 nfa_state_T *start;
3323 regsub_T *submatch; 3352 regsub_T *submatch;
3324 regsub_T *m; 3353 regsub_T *m;
3354 save_se_T *endp;
3325 { 3355 {
3326 int result; 3356 int result;
3327 int size = 0; 3357 int size = 0;
3328 int flag = 0; 3358 int flag = 0;
3329 int go_to_nextline = FALSE; 3359 int go_to_nextline = FALSE;
3530 clen = 0; 3560 clen = 0;
3531 goto nextchar; 3561 goto nextchar;
3532 } 3562 }
3533 3563
3534 case NFA_END_INVISIBLE: 3564 case NFA_END_INVISIBLE:
3535 /* This is only encountered after a NFA_START_INVISIBLE node. 3565 /* This is only encountered after a NFA_START_INVISIBLE or
3536 * They surround a zero-width group, used with "\@=" and "\&". 3566 * NFA_START_INVISIBLE_BEFORE node.
3567 * They surround a zero-width group, used with "\@=", "\&",
3568 * "\@!", "\@<=" and "\@<!".
3537 * If we got here, it means that the current "invisible" group 3569 * If we got here, it means that the current "invisible" group
3538 * finished successfully, so return control to the parent 3570 * finished successfully, so return control to the parent
3539 * nfa_regmatch(). Submatches are stored in *m, and used in 3571 * nfa_regmatch(). Submatches are stored in *m, and used in
3540 * the parent call. */ 3572 * the parent call. */
3541 if (start->c == NFA_MOPEN + 0) 3573 if (start->c == NFA_MOPEN + 0)
3574 /* TODO: do we ever get here? */
3542 addstate_here(thislist, t->state->out, &t->sub, &listidx); 3575 addstate_here(thislist, t->state->out, &t->sub, &listidx);
3543 else 3576 else
3544 { 3577 {
3578 #ifdef ENABLE_LOG
3579 if (endp != NULL)
3580 {
3581 if (REG_MULTI)
3582 fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
3583 (int)reglnum,
3584 (int)endp->se_u.pos.lnum,
3585 (int)(reginput - regline),
3586 endp->se_u.pos.col);
3587 else
3588 fprintf(log_fd, "Current col: %d, endp col: %d\n",
3589 (int)(reginput - regline),
3590 (int)(endp->se_u.ptr - reginput));
3591 }
3592 #endif
3593 /* It's only a match if it ends at "endp" */
3594 if (endp != NULL && (REG_MULTI
3595 ? (reglnum != endp->se_u.pos.lnum
3596 || (int)(reginput - regline)
3597 != endp->se_u.pos.col)
3598 : reginput != endp->se_u.ptr))
3599 break;
3600
3545 /* do not set submatches for \@! */ 3601 /* do not set submatches for \@! */
3546 if (!t->state->negated) 3602 if (!t->state->negated)
3547 /* TODO: only copy positions in use. */ 3603 /* TODO: only copy positions in use. */
3548 *m = t->sub; 3604 *m = t->sub;
3549 nfa_match = TRUE; 3605 nfa_match = TRUE;
3550 } 3606 }
3551 break; 3607 break;
3552 3608
3553 case NFA_START_INVISIBLE: 3609 case NFA_START_INVISIBLE:
3610 case NFA_START_INVISIBLE_BEFORE:
3554 { 3611 {
3555 char_u *save_reginput = reginput; 3612 char_u *save_reginput = reginput;
3556 char_u *save_regline = regline; 3613 char_u *save_regline = regline;
3557 int save_reglnum = reglnum; 3614 int save_reglnum = reglnum;
3558 int save_nfa_match = nfa_match; 3615 int save_nfa_match = nfa_match;
3616 save_se_T endpos;
3617 save_se_T *endposp = NULL;
3618
3619 if (t->state->c == NFA_START_INVISIBLE_BEFORE)
3620 {
3621 /* The recursive match must end at the current position. */
3622 endposp = &endpos;
3623 if (REG_MULTI)
3624 {
3625 endpos.se_u.pos.col = (int)(reginput - regline);
3626 endpos.se_u.pos.lnum = reglnum;
3627 }
3628 else
3629 endpos.se_u.ptr = reginput;
3630
3631 /* Go back the specified number of bytes, or as far as the
3632 * start of the previous line, to try matching "\@<=" or
3633 * not matching "\@<!". */
3634 if (t->state->val <= 0)
3635 {
3636 if (REG_MULTI)
3637 {
3638 regline = reg_getline(--reglnum);
3639 if (regline == NULL)
3640 /* can't go before the first line */
3641 regline = reg_getline(++reglnum);
3642 }
3643 reginput = regline;
3644 }
3645 else
3646 {
3647 if (REG_MULTI
3648 && (int)(reginput - regline) < t->state->val)
3649 {
3650 /* Not enough bytes in this line, go to end of
3651 * previous line. */
3652 regline = reg_getline(--reglnum);
3653 if (regline == NULL)
3654 {
3655 /* can't go before the first line */
3656 regline = reg_getline(++reglnum);
3657 reginput = regline;
3658 }
3659 else
3660 reginput = regline + STRLEN(regline);
3661 }
3662 if ((int)(reginput - regline) >= t->state->val)
3663 {
3664 reginput -= t->state->val;
3665 #ifdef FEAT_MBYTE
3666 if (has_mbyte)
3667 reginput -= mb_head_off(regline, reginput);
3668 #endif
3669 }
3670 else
3671 reginput = regline;
3672 }
3673 }
3559 3674
3560 /* Call nfa_regmatch() to check if the current concat matches 3675 /* Call nfa_regmatch() to check if the current concat matches
3561 * at this position. The concat ends with the node 3676 * at this position. The concat ends with the node
3562 * NFA_END_INVISIBLE */ 3677 * NFA_END_INVISIBLE */
3563 if (listids == NULL) 3678 if (listids == NULL)
3577 /* Have to clear the listid field of the NFA nodes, so that 3692 /* Have to clear the listid field of the NFA nodes, so that
3578 * nfa_regmatch() and addstate() can run properly after 3693 * nfa_regmatch() and addstate() can run properly after
3579 * recursion. */ 3694 * recursion. */
3580 nfa_save_listids(start, listids); 3695 nfa_save_listids(start, listids);
3581 nfa_set_null_listids(start); 3696 nfa_set_null_listids(start);
3582 result = nfa_regmatch(t->state->out, submatch, m); 3697 result = nfa_regmatch(t->state->out, submatch, m, endposp);
3583 nfa_set_neg_listids(start); 3698 nfa_set_neg_listids(start);
3584 nfa_restore_listids(start, listids); 3699 nfa_restore_listids(start, listids);
3585 3700
3586 /* restore position in input text */ 3701 /* restore position in input text */
3587 reginput = save_reginput; 3702 reginput = save_reginput;
4118 * start state to the list of states. 4233 * start state to the list of states.
4119 * The first found match is the leftmost one, thus the order of states 4234 * The first found match is the leftmost one, thus the order of states
4120 * matters! 4235 * matters!
4121 * Do not add the start state in recursive calls of nfa_regmatch(), 4236 * Do not add the start state in recursive calls of nfa_regmatch(),
4122 * because recursive calls should only start in the first position. 4237 * because recursive calls should only start in the first position.
4238 * Unless "endp" is not NULL, then we match the end position.
4123 * Also don't start a match past the first line. */ 4239 * Also don't start a match past the first line. */
4124 if (nfa_match == FALSE && start->c == NFA_MOPEN + 0 4240 if (nfa_match == FALSE
4125 && reglnum == 0 && clen != 0 4241 && ((start->c == NFA_MOPEN + 0
4126 && (ireg_maxcol == 0 4242 && reglnum == 0
4127 || (colnr_T)(reginput - regline) < ireg_maxcol)) 4243 && clen != 0
4244 && (ireg_maxcol == 0
4245 || (colnr_T)(reginput - regline) < ireg_maxcol))
4246 || (endp != NULL
4247 && (REG_MULTI
4248 ? (reglnum < endp->se_u.pos.lnum
4249 || (reglnum == endp->se_u.pos.lnum
4250 && (int)(reginput - regline)
4251 < endp->se_u.pos.col))
4252 : reginput < endp->se_u.ptr))))
4128 { 4253 {
4129 #ifdef ENABLE_LOG 4254 #ifdef ENABLE_LOG
4130 fprintf(log_fd, "(---) STARTSTATE\n"); 4255 fprintf(log_fd, "(---) STARTSTATE\n");
4131 #endif 4256 #endif
4132 addstate(nextlist, start, m, clen); 4257 addstate(nextlist, start, m, clen);
4146 nextchar: 4271 nextchar:
4147 /* Advance to the next character, or advance to the next line, or 4272 /* Advance to the next character, or advance to the next line, or
4148 * finish. */ 4273 * finish. */
4149 if (clen != 0) 4274 if (clen != 0)
4150 reginput += clen; 4275 reginput += clen;
4151 else if (go_to_nextline) 4276 else if (go_to_nextline || (endp != NULL && REG_MULTI
4277 && reglnum < endp->se_u.pos.lnum))
4152 reg_nextline(); 4278 reg_nextline();
4153 else 4279 else
4154 break; 4280 break;
4155 } 4281 }
4156 4282
4223 vim_memset(m.list.line, 0, sizeof(struct linepos) * nfa_nsubexpr); 4349 vim_memset(m.list.line, 0, sizeof(struct linepos) * nfa_nsubexpr);
4224 } 4350 }
4225 sub.in_use = 0; 4351 sub.in_use = 0;
4226 m.in_use = 0; 4352 m.in_use = 0;
4227 4353
4228 if (nfa_regmatch(start, &sub, &m) == FALSE) 4354 if (nfa_regmatch(start, &sub, &m, NULL) == FALSE)
4229 return 0; 4355 return 0;
4230 4356
4231 cleanup_subexpr(); 4357 cleanup_subexpr();
4232 if (REG_MULTI) 4358 if (REG_MULTI)
4233 { 4359 {