comparison src/regexp_nfa.c @ 4750:7793e737ec87 v7.3.1122

updated for version 7.3.1122 Problem: New regexp engine: \%> not supported. Solution: Implement \%>.
author Bram Moolenaar <bram@vim.org>
date Wed, 05 Jun 2013 18:52:40 +0200
parents 4b9503f0c7d3
children 96f3348f9f11
comparison
equal deleted inserted replaced
4749:31754f244d6e 4750:7793e737ec87
55 NFA_ZEND, /* Used for \ze */ 55 NFA_ZEND, /* Used for \ze */
56 NFA_NOPEN, /* Start of subexpression marked with \%( */ 56 NFA_NOPEN, /* Start of subexpression marked with \%( */
57 NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */ 57 NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */
58 NFA_START_INVISIBLE, 58 NFA_START_INVISIBLE,
59 NFA_START_INVISIBLE_BEFORE, 59 NFA_START_INVISIBLE_BEFORE,
60 NFA_START_PATTERN,
60 NFA_END_INVISIBLE, 61 NFA_END_INVISIBLE,
62 NFA_END_PATTERN,
61 NFA_COMPOSING, /* Next nodes in NFA are part of the 63 NFA_COMPOSING, /* Next nodes in NFA are part of the
62 composing multibyte char */ 64 composing multibyte char */
63 NFA_END_COMPOSING, /* End of a composing char in the NFA */ 65 NFA_END_COMPOSING, /* End of a composing char in the NFA */
64 NFA_OPT_CHARS, /* \%[abc] */ 66 NFA_OPT_CHARS, /* \%[abc] */
65 67
1503 else if (op == '!') 1505 else if (op == '!')
1504 /* \@<! */ 1506 /* \@<! */
1505 i = NFA_PREV_ATOM_JUST_BEFORE_NEG; 1507 i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
1506 break; 1508 break;
1507 case '>': 1509 case '>':
1508 /* \@> Not supported yet */ 1510 /* \@> */
1509 /* i = NFA_PREV_ATOM_LIKE_PATTERN; */ 1511 i = NFA_PREV_ATOM_LIKE_PATTERN;
1510 return FAIL; 1512 break;
1511 } 1513 }
1512 if (i == 0) 1514 if (i == 0)
1513 { 1515 {
1514 syntax_error = TRUE; 1516 syntax_error = TRUE;
1515 EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op); 1517 EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op);
1883 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break; 1885 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
1884 case NFA_PREV_ATOM_JUST_BEFORE: 1886 case NFA_PREV_ATOM_JUST_BEFORE:
1885 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break; 1887 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
1886 case NFA_PREV_ATOM_JUST_BEFORE_NEG: 1888 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
1887 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break; 1889 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
1890 case NFA_PREV_ATOM_LIKE_PATTERN:
1891 STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
1892
1888 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break; 1893 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
1889 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break; 1894 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
1890 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break; 1895 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
1891 case NFA_START_INVISIBLE_BEFORE: 1896 case NFA_START_INVISIBLE_BEFORE:
1892 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break; 1897 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
1898 case NFA_START_PATTERN: STRCPY(code, "NFA_START_PATTERN"); break;
1893 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break; 1899 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
1900 case NFA_END_PATTERN: STRCPY(code, "NFA_END_PATTERN"); break;
1894 1901
1895 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break; 1902 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
1896 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break; 1903 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
1897 case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break; 1904 case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break;
1898 1905
2599 2606
2600 case NFA_PREV_ATOM_NO_WIDTH: 2607 case NFA_PREV_ATOM_NO_WIDTH:
2601 case NFA_PREV_ATOM_NO_WIDTH_NEG: 2608 case NFA_PREV_ATOM_NO_WIDTH_NEG:
2602 case NFA_PREV_ATOM_JUST_BEFORE: 2609 case NFA_PREV_ATOM_JUST_BEFORE:
2603 case NFA_PREV_ATOM_JUST_BEFORE_NEG: 2610 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2611 case NFA_PREV_ATOM_LIKE_PATTERN:
2604 { 2612 {
2605 int neg = (*p == NFA_PREV_ATOM_NO_WIDTH_NEG 2613 int neg = (*p == NFA_PREV_ATOM_NO_WIDTH_NEG
2606 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG); 2614 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
2607 int before = (*p == NFA_PREV_ATOM_JUST_BEFORE 2615 int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
2608 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG); 2616 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
2609 int n; 2617 int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
2618 int start_state = NFA_START_INVISIBLE;
2619 int end_state = NFA_END_INVISIBLE;
2620 int n = 0;
2621 nfa_state_T *zend;
2622 nfa_state_T *skip;
2623
2624 if (before)
2625 start_state = NFA_START_INVISIBLE_BEFORE;
2626 else if (pattern)
2627 {
2628 start_state = NFA_START_PATTERN;
2629 end_state = NFA_END_PATTERN;
2630 }
2610 2631
2611 if (before) 2632 if (before)
2612 n = *++p; /* get the count */ 2633 n = *++p; /* get the count */
2613 2634
2614 /* The \@= operator: match the preceding atom with zero width. 2635 /* The \@= operator: match the preceding atom with zero width.
2618 * Surrounds the preceding atom with START_INVISIBLE and 2639 * Surrounds the preceding atom with START_INVISIBLE and
2619 * END_INVISIBLE, similarly to MOPEN. */ 2640 * END_INVISIBLE, similarly to MOPEN. */
2620 2641
2621 if (nfa_calc_size == TRUE) 2642 if (nfa_calc_size == TRUE)
2622 { 2643 {
2623 nstate += 2; 2644 nstate += pattern ? 4 : 2;
2624 break; 2645 break;
2625 } 2646 }
2626 e = POP(); 2647 e = POP();
2627 s1 = alloc_state(NFA_END_INVISIBLE, NULL, NULL); 2648 s1 = alloc_state(end_state, NULL, NULL);
2628 if (s1 == NULL) 2649 if (s1 == NULL)
2629 goto theend; 2650 goto theend;
2630 patch(e.out, s1); 2651
2631 2652 s = alloc_state(start_state, e.start, s1);
2632 s = alloc_state(NFA_START_INVISIBLE, e.start, s1);
2633 if (s == NULL) 2653 if (s == NULL)
2634 goto theend; 2654 goto theend;
2635 if (neg) 2655 if (neg)
2636 { 2656 {
2637 s->negated = TRUE; 2657 s->negated = TRUE;
2638 s1->negated = TRUE; 2658 s1->negated = TRUE;
2639 } 2659 }
2640 if (before) 2660 if (before)
2641 {
2642 s->val = n; /* store the count */ 2661 s->val = n; /* store the count */
2643 ++s->c; /* NFA_START_INVISIBLE -> NFA_START_INVISIBLE_BEFORE */ 2662 if (pattern)
2644 } 2663 {
2645 2664 /* NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows. */
2646 PUSH(frag(s, list1(&s1->out))); 2665 skip = alloc_state(NFA_SKIP, NULL, NULL);
2666 zend = alloc_state(NFA_ZEND, s1, NULL);
2667 s1->out= skip;
2668 patch(e.out, zend);
2669 PUSH(frag(s, list1(&skip->out)));
2670 }
2671 else
2672 {
2673 patch(e.out, s1);
2674 PUSH(frag(s, list1(&s1->out)));
2675 }
2647 break; 2676 break;
2648 } 2677 }
2649 2678
2650 #ifdef FEAT_MBYTE 2679 #ifdef FEAT_MBYTE
2651 case NFA_COMPOSING: /* char with composing char */ 2680 case NFA_COMPOSING: /* char with composing char */
2951 { 2980 {
2952 int j; 2981 int j;
2953 2982
2954 for (j = 0; j < sub->in_use; j++) 2983 for (j = 0; j < sub->in_use; j++)
2955 if (REG_MULTI) 2984 if (REG_MULTI)
2956 fprintf(log_fd, "\n *** group %d, start: c=%d, l=%d, end: c=%d, l=%d", 2985 fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
2957 j, 2986 j,
2958 sub->list.multi[j].start.col, 2987 sub->list.multi[j].start.col,
2959 (int)sub->list.multi[j].start.lnum, 2988 (int)sub->list.multi[j].start.lnum,
2960 sub->list.multi[j].end.col, 2989 sub->list.multi[j].end.col,
2961 (int)sub->list.multi[j].end.lnum); 2990 (int)sub->list.multi[j].end.lnum);
2962 else 2991 else
2963 { 2992 {
2964 char *s = (char *)sub->list.line[j].start; 2993 char *s = (char *)sub->list.line[j].start;
2965 char *e = (char *)sub->list.line[j].end; 2994 char *e = (char *)sub->list.line[j].end;
2966 2995
2967 fprintf(log_fd, "\n *** group %d, start: \"%s\", end: \"%s\"", 2996 fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
2968 j, 2997 j,
2969 s == NULL ? "NULL" : s, 2998 s == NULL ? "NULL" : s,
2970 e == NULL ? "NULL" : e); 2999 e == NULL ? "NULL" : e);
2971 } 3000 }
2972 fprintf(log_fd, "\n");
2973 } 3001 }
2974 #endif 3002 #endif
2975 3003
2976 /* Used during execution: whether a match has been found. */ 3004 /* Used during execution: whether a match has been found. */
2977 static int nfa_match; 3005 static int nfa_match;
4315 clen = 0; 4343 clen = 0;
4316 goto nextchar; 4344 goto nextchar;
4317 } 4345 }
4318 4346
4319 case NFA_END_INVISIBLE: 4347 case NFA_END_INVISIBLE:
4348 case NFA_END_PATTERN:
4320 /* 4349 /*
4321 * This is only encountered after a NFA_START_INVISIBLE or 4350 * This is only encountered after a NFA_START_INVISIBLE or
4322 * NFA_START_INVISIBLE_BEFORE node. 4351 * NFA_START_INVISIBLE_BEFORE node.
4323 * They surround a zero-width group, used with "\@=", "\&", 4352 * They surround a zero-width group, used with "\@=", "\&",
4324 * "\@!", "\@<=" and "\@<!". 4353 * "\@!", "\@<=" and "\@<!".
4341 fprintf(log_fd, "Current col: %d, endp col: %d\n", 4370 fprintf(log_fd, "Current col: %d, endp col: %d\n",
4342 (int)(reginput - regline), 4371 (int)(reginput - regline),
4343 (int)(nfa_endp->se_u.ptr - reginput)); 4372 (int)(nfa_endp->se_u.ptr - reginput));
4344 } 4373 }
4345 #endif 4374 #endif
4346 /* It's only a match if it ends at "nfa_endp" */ 4375 /* If "nfa_endp" is set it's only a match if it ends at
4376 * "nfa_endp" */
4347 if (nfa_endp != NULL && (REG_MULTI 4377 if (nfa_endp != NULL && (REG_MULTI
4348 ? (reglnum != nfa_endp->se_u.pos.lnum 4378 ? (reglnum != nfa_endp->se_u.pos.lnum
4349 || (int)(reginput - regline) 4379 || (int)(reginput - regline)
4350 != nfa_endp->se_u.pos.col) 4380 != nfa_endp->se_u.pos.col)
4351 : reginput != nfa_endp->se_u.ptr)) 4381 : reginput != nfa_endp->se_u.ptr))
4358 #ifdef FEAT_SYN_HL 4388 #ifdef FEAT_SYN_HL
4359 if (nfa_has_zsubexpr) 4389 if (nfa_has_zsubexpr)
4360 copy_sub(&m->synt, &t->subs.synt); 4390 copy_sub(&m->synt, &t->subs.synt);
4361 #endif 4391 #endif
4362 } 4392 }
4393 #ifdef ENABLE_LOG
4394 fprintf(log_fd, "Match found:\n");
4395 log_subsexpr(m);
4396 #endif
4363 nfa_match = TRUE; 4397 nfa_match = TRUE;
4364 break; 4398 break;
4365 4399
4366 case NFA_START_INVISIBLE: 4400 case NFA_START_INVISIBLE:
4367 case NFA_START_INVISIBLE_BEFORE: 4401 case NFA_START_INVISIBLE_BEFORE:
4433 pim, &listidx); 4467 pim, &listidx);
4434 } 4468 }
4435 } 4469 }
4436 break; 4470 break;
4437 4471
4472 case NFA_START_PATTERN:
4473 /* First try matching the pattern. */
4474 result = recursive_regmatch(t->state, prog,
4475 submatch, m, &listids);
4476 if (result)
4477 {
4478 int bytelen;
4479
4480 #ifdef ENABLE_LOG
4481 fprintf(log_fd, "NFA_START_PATTERN matches:\n");
4482 log_subsexpr(m);
4483 #endif
4484 /* Copy submatch info from the recursive call */
4485 copy_sub_off(&t->subs.norm, &m->norm);
4486 #ifdef FEAT_SYN_HL
4487 copy_sub_off(&t->subs.synt, &m->synt);
4488 #endif
4489 /* Now we need to skip over the matched text and then
4490 * continue with what follows. */
4491 if (REG_MULTI)
4492 /* TODO: multi-line match */
4493 bytelen = m->norm.list.multi[0].end.col
4494 - (int)(reginput - regline);
4495 else
4496 bytelen = (int)(m->norm.list.line[0].end - reginput);
4497
4498 #ifdef ENABLE_LOG
4499 fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
4500 #endif
4501 if (bytelen == 0)
4502 {
4503 /* empty match, output of corresponding
4504 * NFA_END_PATTERN/NFA_SKIP to be used at current
4505 * position */
4506 addstate_here(thislist, t->state->out1->out->out,
4507 &t->subs, t->pim, &listidx);
4508 }
4509 else if (bytelen <= clen)
4510 {
4511 /* match current character, output of corresponding
4512 * NFA_END_PATTERN to be used at next position. */
4513 ll = nextlist;
4514 add_state = t->state->out1->out->out;
4515 add_off = clen;
4516 }
4517 else
4518 {
4519 /* skip over the matched characters, set character
4520 * count in NFA_SKIP */
4521 ll = nextlist;
4522 add_state = t->state->out1->out;
4523 add_off = bytelen;
4524 add_count = bytelen - clen;
4525 }
4526 }
4527 break;
4528
4438 case NFA_BOL: 4529 case NFA_BOL:
4439 if (reginput == regline) 4530 if (reginput == regline)
4440 addstate_here(thislist, t->state->out, &t->subs, 4531 addstate_here(thislist, t->state->out, &t->subs,
4441 t->pim, &listidx); 4532 t->pim, &listidx);
4442 break; 4533 break;
4844 /* match current character, jump ahead to out of 4935 /* match current character, jump ahead to out of
4845 * NFA_SKIP */ 4936 * NFA_SKIP */
4846 ll = nextlist; 4937 ll = nextlist;
4847 add_state = t->state->out->out; 4938 add_state = t->state->out->out;
4848 add_off = clen; 4939 add_off = clen;
4849 #ifdef ENABLE_LOG
4850 log_subsexpr(&nextlist->t[nextlist->n - 1].subs);
4851 #endif
4852 } 4940 }
4853 else 4941 else
4854 { 4942 {
4855 /* skip over the matched characters, set character 4943 /* skip over the matched characters, set character
4856 * count in NFA_SKIP */ 4944 * count in NFA_SKIP */
4857 ll = nextlist; 4945 ll = nextlist;
4858 add_state = t->state->out; 4946 add_state = t->state->out;
4859 add_off = bytelen; 4947 add_off = bytelen;
4860 add_count = bytelen - clen; 4948 add_count = bytelen - clen;
4861 #ifdef ENABLE_LOG
4862 log_subsexpr(&nextlist->t[nextlist->n - 1].subs);
4863 #endif
4864 } 4949 }
4865 } 4950 }
4866 break; 4951 break;
4867 } 4952 }
4868 case NFA_SKIP: 4953 case NFA_SKIP:
4871 { 4956 {
4872 /* end of match, go to what follows */ 4957 /* end of match, go to what follows */
4873 ll = nextlist; 4958 ll = nextlist;
4874 add_state = t->state->out; 4959 add_state = t->state->out;
4875 add_off = clen; 4960 add_off = clen;
4876 #ifdef ENABLE_LOG
4877 log_subsexpr(&nextlist->t[nextlist->n - 1].subs);
4878 #endif
4879 } 4961 }
4880 else 4962 else
4881 { 4963 {
4882 /* add state again with decremented count */ 4964 /* add state again with decremented count */
4883 ll = nextlist; 4965 ll = nextlist;
4884 add_state = t->state; 4966 add_state = t->state;
4885 add_off = 0; 4967 add_off = 0;
4886 add_count = t->count - clen; 4968 add_count = t->count - clen;
4887 #ifdef ENABLE_LOG
4888 log_subsexpr(&nextlist->t[nextlist->n - 1].subs);
4889 #endif
4890 } 4969 }
4891 break; 4970 break;
4892 4971
4893 case NFA_LNUM: 4972 case NFA_LNUM:
4894 case NFA_LNUM_GT: 4973 case NFA_LNUM_GT:
5156 5235
5157 #ifdef ENABLE_LOG 5236 #ifdef ENABLE_LOG
5158 f = fopen(NFA_REGEXP_RUN_LOG, "a"); 5237 f = fopen(NFA_REGEXP_RUN_LOG, "a");
5159 if (f != NULL) 5238 if (f != NULL)
5160 { 5239 {
5161 fprintf(f, "\n\n\n\n\n\n\t\t=======================================================\n"); 5240 fprintf(f, "\n\n\t=======================================================\n");
5162 fprintf(f, " =======================================================\n");
5163 #ifdef DEBUG 5241 #ifdef DEBUG
5164 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr); 5242 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
5165 #endif 5243 #endif
5166 fprintf(f, "\tInput text is \"%s\" \n", reginput); 5244 fprintf(f, "\tInput text is \"%s\" \n", reginput);
5167 fprintf(f, " =======================================================\n\n"); 5245 fprintf(f, "\t=======================================================\n\n");
5168 nfa_print_state(f, start); 5246 nfa_print_state(f, start);
5169 fprintf(f, "\n\n"); 5247 fprintf(f, "\n\n");
5170 fclose(f); 5248 fclose(f);
5171 } 5249 }
5172 else 5250 else