comparison src/regexp.c @ 7:3fc0f57ecb91 v7.0001

updated for version 7.0001
author vimboss
date Sun, 13 Jun 2004 20:20:40 +0000
parents
children 8ff7fd162d3c
comparison
equal deleted inserted replaced
6:c2daee826b8f 7:3fc0f57ecb91
1 /* vi:set ts=8 sts=4 sw=4:
2 *
3 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
4 *
5 * NOTICE:
6 *
7 * This is NOT the original regular expression code as written by Henry
8 * Spencer. This code has been modified specifically for use with the VIM
9 * editor, and should not be used separately from Vim. If you want a good
10 * regular expression library, get the original code. The copyright notice
11 * that follows is from the original.
12 *
13 * END NOTICE
14 *
15 * Copyright (c) 1986 by University of Toronto.
16 * Written by Henry Spencer. Not derived from licensed software.
17 *
18 * Permission is granted to anyone to use this software for any
19 * purpose on any computer system, and to redistribute it freely,
20 * subject to the following restrictions:
21 *
22 * 1. The author is not responsible for the consequences of use of
23 * this software, no matter how awful, even if they arise
24 * from defects in it.
25 *
26 * 2. The origin of this software must not be misrepresented, either
27 * by explicit claim or by omission.
28 *
29 * 3. Altered versions must be plainly marked as such, and must not
30 * be misrepresented as being the original software.
31 *
32 * Beware that some of this code is subtly aware of the way operator
33 * precedence is structured in regular expressions. Serious changes in
34 * regular-expression syntax might require a total rethink.
35 *
36 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert Webb
37 * and Bram Moolenaar.
38 * Named character class support added by Walter Briscoe (1998 Jul 01)
39 */
40
41 #include "vim.h"
42
43 #undef DEBUG
44
45 /*
46 * The "internal use only" fields in regexp.h are present to pass info from
47 * compile to execute that permits the execute phase to run lots faster on
48 * simple cases. They are:
49 *
50 * regstart char that must begin a match; NUL if none obvious; Can be a
51 * multi-byte character.
52 * reganch is the match anchored (at beginning-of-line only)?
53 * regmust string (pointer into program) that match must include, or NULL
54 * regmlen length of regmust string
55 * regflags RF_ values or'ed together
56 *
57 * Regstart and reganch permit very fast decisions on suitable starting points
58 * for a match, cutting down the work a lot. Regmust permits fast rejection
59 * of lines that cannot possibly match. The regmust tests are costly enough
60 * that vim_regcomp() supplies a regmust only if the r.e. contains something
61 * potentially expensive (at present, the only such thing detected is * or +
62 * at the start of the r.e., which can involve a lot of backup). Regmlen is
63 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
64 * computing it anyway.
65 */
66
/*
 * Structure for regexp "program".  This is essentially a linear encoding
 * of a nondeterministic finite-state machine (aka syntax charts or
 * "railroad normal form" in parsing technology).  Each node is an opcode
 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
 * all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
 * pointer with a BRANCH on both ends of it is connecting two alternatives.
 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
 * (as opposed to a collection of them) is never concatenated with anything
 * because of operator precedence).  The "next" pointer of a BRACE_COMPLEX
 * node points to the node after the stuff to be repeated.  The operand of some
 * types of node is a literal string; for others, it is a node leading into a
 * sub-FSM.  In particular, the operand of a BRANCH node is the first node of
 * the branch.  (NB this is *not* a tree structure: the tail of the branch
 * connects to the thing following the set of BRANCHes.)
 *
 * pattern       is coded like:
 *
 *                         +-----------------+
 *                         |                 V
 * <aa>\|<bb>    BRANCH <aa> BRANCH <bb> --> END
 *                    |      ^    |          ^
 *                    +------+    +----------+
 *
 *
 *                      +------------------+
 *                      V                  |
 * <aa>*         BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
 *                    |      |               ^                      ^
 *                    |      +---------------+                      |
 *                    +---------------------------------------------+
 *
 *
 *                                       +-------------------------+
 *                                       V                         |
 * <aa>\{}       BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
 *                    |                              |                ^
 *                    |                              +----------------+
 *                    +-----------------------------------------------+
 *
 *
 * <aa>\@!<bb>   BRANCH NOMATCH <aa> --> END <bb> --> END
 *                    |     |                ^         ^
 *                    |     +----------------+         |
 *                    +--------------------------------+
 *
 *                                                    +----------+
 *                                                    |          V
 * \z[abc]       BRANCH BRANCH  a  BRANCH  b  BRANCH  c  BRANCH  NOTHING --> END
 *                    |      |          |          |     ^                   ^
 *                    |      |          |          +-----+                   |
 *                    |      |          +----------------+                   |
 *                    |      +---------------------------+                   |
 *                    +------------------------------------------------------+
 *
 * They all start with a BRANCH for "\|" alternatives, even when there is only
 * one alternative.
 */
125
126 /*
127 * The opcodes are:
128 */
129
/*
 * Basic opcodes (0-19).  The "opnd?" column names the kind of operand that
 * follows the node, if any: "node" is a sub-program, "str" a literal string,
 * "nr" a number.
 */
/* definition           number     opnd?   meaning */
#define END             0       /*      End of program or NOMATCH operand. */
#define BOL             1       /*      Match "" at beginning of line. */
#define EOL             2       /*      Match "" at end of line. */
#define BRANCH          3       /* node Match this alternative, or the
                                 *      next... */
#define BACK            4       /*      Match "", "next" ptr points backward. */
#define EXACTLY         5       /* str  Match this string. */
#define NOTHING         6       /*      Match empty string. */
#define STAR            7       /* node Match this (simple) thing 0 or more
                                 *      times. */
#define PLUS            8       /* node Match this (simple) thing 1 or more
                                 *      times. */
#define MATCH           9       /* node match the operand zero-width */
#define NOMATCH         10      /* node check for no match with operand */
#define BEHIND          11      /* node look behind for a match with operand */
#define NOBEHIND        12      /* node look behind for no match with operand */
#define SUBPAT          13      /* node match the operand here */
#define BRACE_SIMPLE    14      /* node Match this (simple) thing between m and
                                 *      n times (\{m,n\}). */
#define BOW             15      /*      Match "" after [^a-zA-Z0-9_] */
#define EOW             16      /*      Match "" at    [^a-zA-Z0-9_] */
#define BRACE_LIMITS    17      /* nr nr  define the min & max for BRACE_SIMPLE
                                 *      and BRACE_COMPLEX. */
#define NEWL            18      /*      Match line-break */
#define BHPOS           19      /*      End position for BEHIND or NOBEHIND */
156
157
/*
 * Character classes: 20-48 normal, 50-78 include a line-break.
 * Adding ADD_NL to a normal class opcode gives the variant that also matches
 * a line-break.  FIRST_NL and LAST_NL are fully parenthesized so that they
 * behave as single values in any expression (e.g. "op - FIRST_NL").
 */
#define ADD_NL          30
#define FIRST_NL        (ANY + ADD_NL)
#define ANY             20      /*      Match any one character. */
#define ANYOF           21      /* str  Match any character in this string. */
#define ANYBUT          22      /* str  Match any character not in this
                                 *      string. */
#define IDENT           23      /*      Match identifier char */
#define SIDENT          24      /*      Match identifier char but no digit */
#define KWORD           25      /*      Match keyword char */
#define SKWORD          26      /*      Match word char but no digit */
#define FNAME           27      /*      Match file name char */
#define SFNAME          28      /*      Match file name char but no digit */
#define PRINT           29      /*      Match printable char */
#define SPRINT          30      /*      Match printable char but no digit */
#define WHITE           31      /*      Match whitespace char */
#define NWHITE          32      /*      Match non-whitespace char */
#define DIGIT           33      /*      Match digit char */
#define NDIGIT          34      /*      Match non-digit char */
#define HEX             35      /*      Match hex char */
#define NHEX            36      /*      Match non-hex char */
#define OCTAL           37      /*      Match octal char */
#define NOCTAL          38      /*      Match non-octal char */
#define WORD            39      /*      Match word char */
#define NWORD           40      /*      Match non-word char */
#define HEAD            41      /*      Match head char */
#define NHEAD           42      /*      Match non-head char */
#define ALPHA           43      /*      Match alpha char */
#define NALPHA          44      /*      Match non-alpha char */
#define LOWER           45      /*      Match lowercase char */
#define NLOWER          46      /*      Match non-lowercase char */
#define UPPER           47      /*      Match uppercase char */
#define NUPPER          48      /*      Match non-uppercase char */
#define LAST_NL         (NUPPER + ADD_NL)
/* TRUE when "op" is one of the class opcodes that also match a line-break. */
#define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)
193
/*
 * Numbered and special opcodes.  Where the comment shows a range ("-89"),
 * the define is the base value of that range.
 */
#define MOPEN           80  /* -89       Mark this point in input as start of
                             *           \( subexpr.  MOPEN + 0 marks start of
                             *           match. */
#define MCLOSE          90  /* -99       Analogous to MOPEN.  MCLOSE + 0 marks
                             *           end of match. */
#define BACKREF         100 /* -109 node Match same string again \1-\9 */

#ifdef FEAT_SYN_HL
# define ZOPEN          110 /* -119      Mark this point in input as start of
                             *           \z( subexpr. */
# define ZCLOSE         120 /* -129      Analogous to ZOPEN. */
# define ZREF           130 /* -139 node Match external submatch \z1-\z9 */
#endif

#define BRACE_COMPLEX   140 /* -149 node Match nodes between m & n times */

#define NOPEN           150 /*           Mark this point in input as start of
                                         \%( subexpr. */
#define NCLOSE          151 /*           Analogous to NOPEN. */

#define MULTIBYTECODE   200 /* mbc       Match one multi-byte character */
#define RE_BOF          201 /*           Match "" at beginning of file. */
#define RE_EOF          202 /*           Match "" at end of file. */
#define CURSOR          203 /*           Match location of cursor. */

#define RE_LNUM         204 /* nr cmp    Match line number */
#define RE_COL          205 /* nr cmp    Match column number */
#define RE_VCOL         206 /* nr cmp    Match virtual column number */
222
/*
 * Magic characters have a special meaning, they don't match literally.
 * Magic characters are negative.  This separates them from literal characters
 * (possibly multi-byte).  Only ASCII characters can be Magic.
 */
#define Magic(x)        ((int)(x) - 256)    /* literal char -> magic value */
#define un_Magic(x)     ((x) + 256)         /* magic value -> literal char */
#define is_Magic(x)     ((x) < 0)           /* non-zero for a magic value */

static int no_Magic __ARGS((int x));
static int toggle_Magic __ARGS((int x));
234
/*
 * Return the literal (non-magic) form of "x": a magic value is converted
 * back with un_Magic(), anything else is returned unchanged.
 */
    static int
no_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : x;
}
243
/*
 * Flip the magicness of "x": a magic value becomes its literal character,
 * a literal character becomes its magic value.
 */
    static int
toggle_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
252
/*
 * The first byte of the regexp internal "program" is actually this magic
 * number; the start node begins in the second byte.  It's used to catch the
 * most severe mutilation of the program by the caller.
 */

#define REGMAGIC        0234    /* note: octal constant */
260
261 /*
262 * Opcode notes:
263 *
264 * BRANCH The set of branches constituting a single choice are hooked
265 * together with their "next" pointers, since precedence prevents
266 * anything being concatenated to any individual branch. The
267 * "next" pointer of the last BRANCH in a choice points to the
268 * thing following the whole choice. This is also where the
269 * final "next" pointer of each individual branch points; each
270 * branch starts with the operand node of a BRANCH node.
271 *
272 * BACK Normal "next" pointers all implicitly point forward; BACK
273 * exists to make loop structures possible.
274 *
275 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
276 * BRANCH structures using BACK. Simple cases (one character
277 * per match) are implemented with STAR and PLUS for speed
278 * and to minimize recursive plunges.
279 *
280 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
281 * node, and defines the min and max limits to be used for that
282 * node.
283 *
284 * MOPEN,MCLOSE ...are numbered at compile time.
285 * ZOPEN,ZCLOSE ...ditto
286 */
287
/*
 * A node is one char of opcode followed by two chars of "next" pointer.
 * "Next" pointers are stored as two 8-bit bytes, high order first.  The
 * value is a positive offset from the opcode of the node containing it.
 * An operand, if any, simply follows the node.  (Note that much of the
 * code generation knows about this implicit relationship.)
 *
 * Using two bytes for the "next" pointer is vast overkill for most things,
 * but allows patterns to get big without disasters.
 */
/* Opcode of node "p" (byte 0). */
#define OP(p)           ((int)*(p))
/* 16-bit "next" offset of node "p" (bytes 1 and 2, high byte first). */
#define NEXT(p)         (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
/* Start of the operand of node "p" (byte 3). */
#define OPERAND(p)      ((p) + 3)
/* Obtain an operand that was stored as four bytes, MSB first. */
#define OPERAND_MIN(p)  (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
                        + ((long)(p)[5] << 8) + (long)(p)[6])
/* Obtain a second operand stored as four bytes. */
#define OPERAND_MAX(p)  OPERAND_MIN((p) + 4)
/* Obtain a second single-byte operand stored after a four bytes operand. */
#define OPERAND_CMP(p)  (p)[7]
308
309 /*
310 * Utility definitions.
311 */
312 #define UCHARAT(p) ((int)*(char_u *)(p))
313
314 /* Used for an error (down from) vim_regcomp(): give the error message, set
315 * rc_did_emsg and return NULL */
316 #define EMSG_RET_NULL(m) { EMSG(m); rc_did_emsg = TRUE; return NULL; }
317 #define EMSG_M_RET_NULL(m, c) { EMSG2(m, c ? "" : "\\"); rc_did_emsg = TRUE; return NULL; }
318 #define EMSG_RET_FAIL(m) { EMSG(m); rc_did_emsg = TRUE; return FAIL; }
319 #define EMSG_ONE_RET_NULL EMSG_M_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
320
321 #define MAX_LIMIT (32767L << 16L)
322
323 static int re_multi_type __ARGS((int));
324 static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n));
325 static char_u *cstrchr __ARGS((char_u *, int));
326
327 #ifdef DEBUG
328 static void regdump __ARGS((char_u *, regprog_T *));
329 static char_u *regprop __ARGS((char_u *));
330 #endif
331
332 #define NOT_MULTI 0
333 #define MULTI_ONE 1
334 #define MULTI_MULT 2
335 /*
336 * Return NOT_MULTI if c is not a "multi" operator.
337 * Return MULTI_ONE if c is a single "multi" operator.
338 * Return MULTI_MULT if c is a multi "multi" operator.
339 */
340 static int
341 re_multi_type(c)
342 int c;
343 {
344 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
345 return MULTI_ONE;
346 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
347 return MULTI_MULT;
348 return NOT_MULTI;
349 }
350
/*
 * Flags to be passed up and down.
 * These are single bits that are or'ed together in the "flagp" results of
 * the parse functions.
 */
#define HASWIDTH        0x1     /* Known never to match null string. */
#define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
#define SPSTART         0x4     /* Starts with * or +. */
#define HASNL           0x8     /* Contains some \n. */
#define HASLOOKBH       0x10    /* Contains "\@<=" or "\@<!". */
#define WORST           0       /* Worst case. */

/*
 * When regcode is set to this value, code is not emitted and size is computed
 * instead.
 */
#define JUST_CALC_SIZE  ((char_u *) -1)

static char_u           *reg_prev_sub;
368
/*
 * REGEXP_INRANGE contains all characters which are always special in a []
 * range after '\'.
 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
 * These are:
 *  \n  - New line (NL).
 *  \r  - Carriage Return (CR).
 *  \t  - Tab (TAB).
 *  \e  - Escape (ESC).
 *  \b  - Backspace (Ctrl_H).
 */
static char_u REGEXP_INRANGE[] = "]^-n\\";
static char_u REGEXP_ABBR[] = "nrteb";

/* Helpers for parsing [] collections and named character classes. */
static int      backslash_trans __ARGS((int c));
static int      skip_class_name __ARGS((char_u **pp));
static char_u   *skip_anyof __ARGS((char_u *p));
static void     init_class_tab __ARGS((void));
387
388 /*
389 * Translate '\x' to its control character, except "\n", which is Magic.
390 */
391 static int
392 backslash_trans(c)
393 int c;
394 {
395 switch (c)
396 {
397 case 'r': return CAR;
398 case 't': return TAB;
399 case 'e': return ESC;
400 case 'b': return BS;
401 }
402 return c;
403 }
404
405 /*
406 * Check for a character class name. "pp" points to the '['.
407 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
408 * recognized. Otherwise "pp" is advanced to after the item.
409 */
410 static int
411 skip_class_name(pp)
412 char_u **pp;
413 {
414 static const char *(class_names[]) =
415 {
416 "alnum:]",
417 #define CLASS_ALNUM 0
418 "alpha:]",
419 #define CLASS_ALPHA 1
420 "blank:]",
421 #define CLASS_BLANK 2
422 "cntrl:]",
423 #define CLASS_CNTRL 3
424 "digit:]",
425 #define CLASS_DIGIT 4
426 "graph:]",
427 #define CLASS_GRAPH 5
428 "lower:]",
429 #define CLASS_LOWER 6
430 "print:]",
431 #define CLASS_PRINT 7
432 "punct:]",
433 #define CLASS_PUNCT 8
434 "space:]",
435 #define CLASS_SPACE 9
436 "upper:]",
437 #define CLASS_UPPER 10
438 "xdigit:]",
439 #define CLASS_XDIGIT 11
440 "tab:]",
441 #define CLASS_TAB 12
442 "return:]",
443 #define CLASS_RETURN 13
444 "backspace:]",
445 #define CLASS_BACKSPACE 14
446 "escape:]",
447 #define CLASS_ESCAPE 15
448 };
449 #define CLASS_NONE 99
450 int i;
451
452 if ((*pp)[1] == ':')
453 {
454 for (i = 0; i < sizeof(class_names) / sizeof(*class_names); ++i)
455 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
456 {
457 *pp += STRLEN(class_names[i]) + 2;
458 return i;
459 }
460 }
461 return CLASS_NONE;
462 }
463
464 /*
465 * Skip over a "[]" range.
466 * "p" must point to the character after the '['.
467 * The returned pointer is on the matching ']', or the terminating NUL.
468 */
469 static char_u *
470 skip_anyof(p)
471 char_u *p;
472 {
473 int cpo_lit; /* 'cpoptions' contains 'l' flag */
474 #ifdef FEAT_MBYTE
475 int l;
476 #endif
477
478 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
479
480 if (*p == '^') /* Complement of range. */
481 ++p;
482 if (*p == ']' || *p == '-')
483 ++p;
484 while (*p != NUL && *p != ']')
485 {
486 #ifdef FEAT_MBYTE
487 if (has_mbyte && (l = (*mb_ptr2len_check)(p)) > 1)
488 p += l;
489 else
490 #endif
491 if (*p == '-')
492 {
493 ++p;
494 if (*p != ']' && *p != NUL)
495 {
496 #ifdef FEAT_MBYTE
497 if (has_mbyte)
498 p += (*mb_ptr2len_check)(p);
499 else
500 #endif
501 ++p;
502 }
503 }
504 else if (*p == '\\'
505 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
506 || (!cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
507 p += 2;
508 else if (*p == '[')
509 {
510 if (skip_class_name(&p) == CLASS_NONE)
511 ++p; /* It was not a class name */
512 }
513 else
514 ++p;
515 }
516
517 return p;
518 }
519
/*
 * Specific version of character class functions.
 * Using a table to keep this fast.
 * class_tab[] is indexed by character value (0-255) and holds the RI_ bits
 * for that character; it is filled by init_class_tab().
 */
static short    class_tab[256];

#define     RI_DIGIT    0x01
#define     RI_HEX      0x02
#define     RI_OCTAL    0x04
#define     RI_WORD     0x08
#define     RI_HEAD     0x10
#define     RI_ALPHA    0x20
#define     RI_LOWER    0x40
#define     RI_UPPER    0x80
#define     RI_WHITE    0x100
535
536 static void
537 init_class_tab()
538 {
539 int i;
540 static int done = FALSE;
541
542 if (done)
543 return;
544
545 for (i = 0; i < 256; ++i)
546 {
547 if (i >= '0' && i <= '7')
548 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
549 else if (i >= '8' && i <= '9')
550 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
551 else if (i >= 'a' && i <= 'f')
552 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
553 #ifdef EBCDIC
554 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
555 || (i >= 's' && i <= 'z'))
556 #else
557 else if (i >= 'g' && i <= 'z')
558 #endif
559 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
560 else if (i >= 'A' && i <= 'F')
561 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
562 #ifdef EBCDIC
563 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
564 || (i >= 'S' && i <= 'Z'))
565 #else
566 else if (i >= 'G' && i <= 'Z')
567 #endif
568 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
569 else if (i == '_')
570 class_tab[i] = RI_WORD + RI_HEAD;
571 else
572 class_tab[i] = 0;
573 }
574 class_tab[' '] |= RI_WHITE;
575 class_tab['\t'] |= RI_WHITE;
576 done = TRUE;
577 }
578
/*
 * Character class tests using class_tab[].  The result is non-zero when "c"
 * has the class.  With FEAT_MBYTE "c" may be a multi-byte character; values
 * of 0x100 and above never have any of the classes.
 * The argument is parenthesized so that an expression can be passed safely.
 * Note that "c" is evaluated twice in the FEAT_MBYTE versions: don't pass an
 * expression with side effects.
 */
#ifdef FEAT_MBYTE
# define ri_digit(c)    ((c) < 0x100 && (class_tab[(c)] & RI_DIGIT))
# define ri_hex(c)      ((c) < 0x100 && (class_tab[(c)] & RI_HEX))
# define ri_octal(c)    ((c) < 0x100 && (class_tab[(c)] & RI_OCTAL))
# define ri_word(c)     ((c) < 0x100 && (class_tab[(c)] & RI_WORD))
# define ri_head(c)     ((c) < 0x100 && (class_tab[(c)] & RI_HEAD))
# define ri_alpha(c)    ((c) < 0x100 && (class_tab[(c)] & RI_ALPHA))
# define ri_lower(c)    ((c) < 0x100 && (class_tab[(c)] & RI_LOWER))
# define ri_upper(c)    ((c) < 0x100 && (class_tab[(c)] & RI_UPPER))
# define ri_white(c)    ((c) < 0x100 && (class_tab[(c)] & RI_WHITE))
#else
# define ri_digit(c)    (class_tab[(c)] & RI_DIGIT)
# define ri_hex(c)      (class_tab[(c)] & RI_HEX)
# define ri_octal(c)    (class_tab[(c)] & RI_OCTAL)
# define ri_word(c)     (class_tab[(c)] & RI_WORD)
# define ri_head(c)     (class_tab[(c)] & RI_HEAD)
# define ri_alpha(c)    (class_tab[(c)] & RI_ALPHA)
# define ri_lower(c)    (class_tab[(c)] & RI_LOWER)
# define ri_upper(c)    (class_tab[(c)] & RI_UPPER)
# define ri_white(c)    (class_tab[(c)] & RI_WHITE)
#endif
600
/* flags for regflags; single bits that are or'ed together */
#define RF_ICASE    1   /* ignore case */
#define RF_NOICASE  2   /* don't ignore case */
#define RF_HASNL    4   /* can match a NL */
#define RF_ICOMBINE 8   /* ignore combining characters */
#define RF_LOOKBH   16  /* uses "\@<=" or "\@<!" */
607
/*
 * Global work variables for vim_regcomp().
 * Most of these are reset by regcomp_start() before each compile pass.
 */

static char_u   *regparse;  /* Input-scan pointer. */
static int      prevchr_len;    /* byte length of previous char */
static int      num_complex_braces; /* Complex \{...} count */
static int      regnpar;        /* () count. */
#ifdef FEAT_SYN_HL
static int      regnzpar;       /* \z() count. */
static int      re_has_z;       /* \z item detected */
#endif
static char_u   *regcode;       /* Code-emit pointer, or JUST_CALC_SIZE */
static long     regsize;        /* Code size. */
static char_u   had_endbrace[NSUBEXP];  /* flags, TRUE if end of () found */
static unsigned regflags;       /* RF_ flags for prog */
static long     brace_min[10];  /* Minimums for complex brace repeats */
static long     brace_max[10];  /* Maximums for complex brace repeats */
static int      brace_count[10]; /* Current counts for complex brace repeats */
#if defined(FEAT_SYN_HL) || defined(PROTO)
static int      had_eol;        /* TRUE when EOL found by vim_regcomp() */
#endif
static int      one_exactly = FALSE;    /* only do one char for EXACTLY */

static int      reg_magic;      /* magicness of the pattern: */
#define MAGIC_NONE      1       /* "\V" very unmagic */
#define MAGIC_OFF       2       /* "\M" or 'magic' off */
#define MAGIC_ON        3       /* "\m" or 'magic' */
#define MAGIC_ALL       4       /* "\v" very magic */

static int      reg_string;     /* matching with a string instead of a buffer
                                   line */
640
/*
 * META contains all characters that may be magic, except '^' and '$'.
 */

#ifdef EBCDIC
/* NOTE(review): unlike the table below, this string has no 'Z' -- confirm
 * whether that is intended. */
static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
#else
/* META[] is used often enough to justify turning it into a table. */
/* Indexed by ASCII value, 16 entries per row; 1 means the character may be
 * magic.  The comment above each row names the characters that are set. */
static char_u META_flags[] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*                 %  &     (  )  *  +        .    */
    0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
/*     1  2  3  4  5  6  7  8  9        <  =  >  ? */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
/*  @  A     C  D     F     H  I     K  L  M     O */
    1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
/*  P        S     U  V  W  X     Z  [           _ */
    1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
/*     a     c  d     f     h  i     k  l  m  n  o */
    0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
/*  p        s     u  v  w  x     z  {  |     ~    */
    1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
};
#endif
666
static int      curchr;

/* arguments for reg() */
#define REG_NOPAREN     0       /* toplevel reg() */
#define REG_PAREN       1       /* \(\) */
#define REG_ZPAREN      2       /* \z(\) */
#define REG_NPAREN      3       /* \%(\) */

/*
 * Forward declarations for vim_regcomp()'s friends.
 */
/* Reading characters from the pattern. */
static void     initchr __ARGS((char_u *));
static int      getchr __ARGS((void));
static void     skipchr_keepstart __ARGS((void));
static int      peekchr __ARGS((void));
static void     skipchr __ARGS((void));
static void     ungetchr __ARGS((void));
/* Parsing the pattern and emitting the program. */
static void     regcomp_start __ARGS((char_u *expr, int flags));
static char_u   *reg __ARGS((int, int *));
static char_u   *regbranch __ARGS((int *flagp));
static char_u   *regconcat __ARGS((int *flagp));
static char_u   *regpiece __ARGS((int *));
static char_u   *regatom __ARGS((int *));
static char_u   *regnode __ARGS((int));
static int      prog_magic_wrong __ARGS((void));
static char_u   *regnext __ARGS((char_u *));
static void     regc __ARGS((int b));
#ifdef FEAT_MBYTE
static void     regmbc __ARGS((int c));
#endif
static void     reginsert __ARGS((int, char_u *));
static void     reginsert_limits __ARGS((int, long, long, char_u *));
static char_u   *re_put_long __ARGS((char_u *pr, long_u val));
static int      read_limits __ARGS((long *, long *));
static void     regtail __ARGS((char_u *, char_u *));
static void     regoptail __ARGS((char_u *, char_u *));
703
704 /*
705 * Return TRUE if compiled regular expression "prog" can match a line break.
706 */
707 int
708 re_multiline(prog)
709 regprog_T *prog;
710 {
711 return (prog->regflags & RF_HASNL);
712 }
713
714 /*
715 * Return TRUE if compiled regular expression "prog" looks before the start
716 * position (pattern contains "\@<=" or "\@<!").
717 */
718 int
719 re_lookbehind(prog)
720 regprog_T *prog;
721 {
722 return (prog->regflags & RF_LOOKBH);
723 }
724
725 /*
726 * Skip past regular expression.
727 * Stop at end of 'p' of where 'dirc' is found ('/', '?', etc).
728 * Take care of characters with a backslash in front of it.
729 * Skip strings inside [ and ].
730 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
731 * expression and change "\?" to "?". If "*newp" is not NULL the expression
732 * is changed in-place.
733 */
734 char_u *
735 skip_regexp(startp, dirc, magic, newp)
736 char_u *startp;
737 int dirc;
738 int magic;
739 char_u **newp;
740 {
741 int mymagic;
742 char_u *p = startp;
743
744 if (magic)
745 mymagic = MAGIC_ON;
746 else
747 mymagic = MAGIC_OFF;
748
749 for (; p[0] != NUL; ++p)
750 {
751 if (p[0] == dirc) /* found end of regexp */
752 break;
753 if ((p[0] == '[' && mymagic >= MAGIC_ON)
754 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
755 {
756 p = skip_anyof(p + 1);
757 if (p[0] == NUL)
758 break;
759 }
760 else if (p[0] == '\\' && p[1] != NUL)
761 {
762 if (dirc == '?' && newp != NULL && p[1] == '?')
763 {
764 /* change "\?" to "?", make a copy first. */
765 if (*newp == NULL)
766 {
767 *newp = vim_strsave(startp);
768 if (*newp != NULL)
769 p = *newp + (p - startp);
770 }
771 if (*newp != NULL)
772 mch_memmove(p, p + 1, STRLEN(p));
773 else
774 ++p;
775 }
776 else
777 ++p; /* skip next character */
778 if (*p == 'v')
779 mymagic = MAGIC_ALL;
780 else if (*p == 'V')
781 mymagic = MAGIC_NONE;
782 }
783 #ifdef FEAT_MBYTE
784 else if (has_mbyte)
785 p += (*mb_ptr2len_check)(p) - 1;
786 #endif
787 }
788 return p;
789 }
790
791 /*
792 * vim_regcomp - compile a regular expression into internal code
793 *
794 * We can't allocate space until we know how big the compiled form will be,
795 * but we can't compile it (and thus know how big it is) until we've got a
796 * place to put the code. So we cheat: we compile it twice, once with code
797 * generation turned off and size counting turned on, and once "for real".
798 * This also means that we don't allocate space until we are sure that the
799 * thing really will compile successfully, and we never have to move the
800 * code and thus invalidate pointers into it. (Note that it has to be in
801 * one piece because vim_free() must be able to free it all.)
802 *
803 * Whether upper/lower case is to be ignored is decided when executing the
804 * program, it does not matter here.
805 *
806 * Beware that the optimization-preparation code in here knows about some
807 * of the structure of the compiled regexp.
808 * "re_flags": RE_MAGIC and/or RE_STRING.
809 */
810 regprog_T *
811 vim_regcomp(expr, re_flags)
812 char_u *expr;
813 int re_flags;
814 {
815 regprog_T *r;
816 char_u *scan;
817 char_u *longest;
818 int len;
819 int flags;
820
821 if (expr == NULL)
822 EMSG_RET_NULL(_(e_null));
823
824 init_class_tab();
825
826 /*
827 * First pass: determine size, legality.
828 */
829 regcomp_start(expr, re_flags);
830 regcode = JUST_CALC_SIZE;
831 regc(REGMAGIC);
832 if (reg(REG_NOPAREN, &flags) == NULL)
833 return NULL;
834
835 /* Small enough for pointer-storage convention? */
836 #ifdef SMALL_MALLOC /* 16 bit storage allocation */
837 if (regsize >= 65536L - 256L)
838 EMSG_RET_NULL(_("E339: Pattern too long"));
839 #endif
840
841 /* Allocate space. */
842 r = (regprog_T *)lalloc(sizeof(regprog_T) + regsize, TRUE);
843 if (r == NULL)
844 return NULL;
845
846 /*
847 * Second pass: emit code.
848 */
849 regcomp_start(expr, re_flags);
850 regcode = r->program;
851 regc(REGMAGIC);
852 if (reg(REG_NOPAREN, &flags) == NULL)
853 {
854 vim_free(r);
855 return NULL;
856 }
857
858 /* Dig out information for optimizations. */
859 r->regstart = NUL; /* Worst-case defaults. */
860 r->reganch = 0;
861 r->regmust = NULL;
862 r->regmlen = 0;
863 r->regflags = regflags;
864 if (flags & HASNL)
865 r->regflags |= RF_HASNL;
866 if (flags & HASLOOKBH)
867 r->regflags |= RF_LOOKBH;
868 #ifdef FEAT_SYN_HL
869 /* Remember whether this pattern has any \z specials in it. */
870 r->reghasz = re_has_z;
871 #endif
872 scan = r->program + 1; /* First BRANCH. */
873 if (OP(regnext(scan)) == END) /* Only one top-level choice. */
874 {
875 scan = OPERAND(scan);
876
877 /* Starting-point info. */
878 if (OP(scan) == BOL || OP(scan) == RE_BOF)
879 {
880 r->reganch++;
881 scan = regnext(scan);
882 }
883
884 if (OP(scan) == EXACTLY)
885 {
886 #ifdef FEAT_MBYTE
887 if (has_mbyte)
888 r->regstart = (*mb_ptr2char)(OPERAND(scan));
889 else
890 #endif
891 r->regstart = *OPERAND(scan);
892 }
893 else if ((OP(scan) == BOW
894 || OP(scan) == EOW
895 || OP(scan) == NOTHING
896 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
897 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
898 && OP(regnext(scan)) == EXACTLY)
899 {
900 #ifdef FEAT_MBYTE
901 if (has_mbyte)
902 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
903 else
904 #endif
905 r->regstart = *OPERAND(regnext(scan));
906 }
907
908 /*
909 * If there's something expensive in the r.e., find the longest
910 * literal string that must appear and make it the regmust. Resolve
911 * ties in favor of later strings, since the regstart check works
912 * with the beginning of the r.e. and avoiding duplication
913 * strengthens checking. Not a strong reason, but sufficient in the
914 * absence of others.
915 */
916 /*
917 * When the r.e. starts with BOW, it is faster to look for a regmust
918 * first. Used a lot for "#" and "*" commands. (Added by mool).
919 */
920 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
921 && !(flags & HASNL))
922 {
923 longest = NULL;
924 len = 0;
925 for (; scan != NULL; scan = regnext(scan))
926 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
927 {
928 longest = OPERAND(scan);
929 len = (int)STRLEN(OPERAND(scan));
930 }
931 r->regmust = longest;
932 r->regmlen = len;
933 }
934 }
935 #ifdef DEBUG
936 regdump(expr, r);
937 #endif
938 return r;
939 }
940
941 /*
942 * Setup to parse the regexp. Used once to get the length and once to do it.
943 */
944 static void
945 regcomp_start(expr, re_flags)
946 char_u *expr;
947 int re_flags; /* see vim_regcomp() */
948 {
949 initchr(expr);
950 if (re_flags & RE_MAGIC)
951 reg_magic = MAGIC_ON;
952 else
953 reg_magic = MAGIC_OFF;
954 reg_string = (re_flags & RE_STRING);
955
956 num_complex_braces = 0;
957 regnpar = 1;
958 vim_memset(had_endbrace, 0, sizeof(had_endbrace));
959 #ifdef FEAT_SYN_HL
960 regnzpar = 1;
961 re_has_z = 0;
962 #endif
963 regsize = 0L;
964 regflags = 0;
965 #if defined(FEAT_SYN_HL) || defined(PROTO)
966 had_eol = FALSE;
967 #endif
968 }
969
#if defined(FEAT_SYN_HL) || defined(PROTO)
/*
 * Check if during the previous call to vim_regcomp the EOL item "$" has been
 * found.  This is messy, but it works fine.
 * Returns the value of the file-local "had_eol" flag, which regcomp_start()
 * resets to FALSE at the start of each compile pass.
 */
    int
vim_regcomp_had_eol()
{
    return had_eol;
}
#endif
981
982 /*
983 * reg - regular expression, i.e. main body or parenthesized thing
984 *
985 * Caller must absorb opening parenthesis.
986 *
987 * Combining parenthesis handling with the base level of regular expression
988 * is a trifle forced, but the need to tie the tails of the branches to what
989 * follows makes it hard to avoid.
990 */
991 static char_u *
992 reg(paren, flagp)
993 int paren; /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
994 int *flagp;
995 {
996 char_u *ret;
997 char_u *br;
998 char_u *ender;
999 int parno = 0;
1000 int flags;
1001
1002 *flagp = HASWIDTH; /* Tentatively. */
1003
1004 #ifdef FEAT_SYN_HL
1005 if (paren == REG_ZPAREN)
1006 {
1007 /* Make a ZOPEN node. */
1008 if (regnzpar >= NSUBEXP)
1009 EMSG_RET_NULL(_("E50: Too many \\z("));
1010 parno = regnzpar;
1011 regnzpar++;
1012 ret = regnode(ZOPEN + parno);
1013 }
1014 else
1015 #endif
1016 if (paren == REG_PAREN)
1017 {
1018 /* Make a MOPEN node. */
1019 if (regnpar >= NSUBEXP)
1020 EMSG_M_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
1021 parno = regnpar;
1022 ++regnpar;
1023 ret = regnode(MOPEN + parno);
1024 }
1025 else if (paren == REG_NPAREN)
1026 {
1027 /* Make a NOPEN node. */
1028 ret = regnode(NOPEN);
1029 }
1030 else
1031 ret = NULL;
1032
1033 /* Pick up the branches, linking them together. */
1034 br = regbranch(&flags);
1035 if (br == NULL)
1036 return NULL;
1037 if (ret != NULL)
1038 regtail(ret, br); /* [MZ]OPEN -> first. */
1039 else
1040 ret = br;
1041 /* If one of the branches can be zero-width, the whole thing can.
1042 * If one of the branches has * at start or matches a line-break, the
1043 * whole thing can. */
1044 if (!(flags & HASWIDTH))
1045 *flagp &= ~HASWIDTH;
1046 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1047 while (peekchr() == Magic('|'))
1048 {
1049 skipchr();
1050 br = regbranch(&flags);
1051 if (br == NULL)
1052 return NULL;
1053 regtail(ret, br); /* BRANCH -> BRANCH. */
1054 if (!(flags & HASWIDTH))
1055 *flagp &= ~HASWIDTH;
1056 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1057 }
1058
1059 /* Make a closing node, and hook it on the end. */
1060 ender = regnode(
1061 #ifdef FEAT_SYN_HL
1062 paren == REG_ZPAREN ? ZCLOSE + parno :
1063 #endif
1064 paren == REG_PAREN ? MCLOSE + parno :
1065 paren == REG_NPAREN ? NCLOSE : END);
1066 regtail(ret, ender);
1067
1068 /* Hook the tails of the branches to the closing node. */
1069 for (br = ret; br != NULL; br = regnext(br))
1070 regoptail(br, ender);
1071
1072 /* Check for proper termination. */
1073 if (paren != REG_NOPAREN && getchr() != Magic(')'))
1074 {
1075 #ifdef FEAT_SYN_HL
1076 if (paren == REG_ZPAREN)
1077 EMSG_RET_NULL(_("E52: Unmatched \\z("))
1078 else
1079 #endif
1080 if (paren == REG_NPAREN)
1081 EMSG_M_RET_NULL(_("E53: Unmatched %s%%("), reg_magic == MAGIC_ALL)
1082 else
1083 EMSG_M_RET_NULL(_("E54: Unmatched %s("), reg_magic == MAGIC_ALL)
1084 }
1085 else if (paren == REG_NOPAREN && peekchr() != NUL)
1086 {
1087 if (curchr == Magic(')'))
1088 EMSG_M_RET_NULL(_("E55: Unmatched %s)"), reg_magic == MAGIC_ALL)
1089 else
1090 EMSG_RET_NULL(_(e_trailing)) /* "Can't happen". */
1091 /* NOTREACHED */
1092 }
1093 /*
1094 * Here we set the flag allowing back references to this set of
1095 * parentheses.
1096 */
1097 if (paren == REG_PAREN)
1098 had_endbrace[parno] = TRUE; /* have seen the close paren */
1099 return ret;
1100 }
1101
1102 /*
1103 * regbranch - one alternative of an | operator
1104 *
1105 * Implements the & operator.
1106 */
1107 static char_u *
1108 regbranch(flagp)
1109 int *flagp;
1110 {
1111 char_u *ret;
1112 char_u *chain = NULL;
1113 char_u *latest;
1114 int flags;
1115
1116 *flagp = WORST | HASNL; /* Tentatively. */
1117
1118 ret = regnode(BRANCH);
1119 for (;;)
1120 {
1121 latest = regconcat(&flags);
1122 if (latest == NULL)
1123 return NULL;
1124 /* If one of the branches has width, the whole thing has. If one of
1125 * the branches anchors at start-of-line, the whole thing does.
1126 * If one of the branches uses look-behind, the whole thing does. */
1127 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
1128 /* If one of the branches doesn't match a line-break, the whole thing
1129 * doesn't. */
1130 *flagp &= ~HASNL | (flags & HASNL);
1131 if (chain != NULL)
1132 regtail(chain, latest);
1133 if (peekchr() != Magic('&'))
1134 break;
1135 skipchr();
1136 regtail(latest, regnode(END)); /* operand ends */
1137 reginsert(MATCH, latest);
1138 chain = latest;
1139 }
1140
1141 return ret;
1142 }
1143
1144 /*
1145 * regbranch - one alternative of an | or & operator
1146 *
1147 * Implements the concatenation operator.
1148 */
1149 static char_u *
1150 regconcat(flagp)
1151 int *flagp;
1152 {
1153 char_u *first = NULL;
1154 char_u *chain = NULL;
1155 char_u *latest;
1156 int flags;
1157 int cont = TRUE;
1158
1159 *flagp = WORST; /* Tentatively. */
1160
1161 while (cont)
1162 {
1163 switch (peekchr())
1164 {
1165 case NUL:
1166 case Magic('|'):
1167 case Magic('&'):
1168 case Magic(')'):
1169 cont = FALSE;
1170 break;
1171 case Magic('Z'):
1172 #ifdef FEAT_MBYTE
1173 regflags |= RF_ICOMBINE;
1174 #endif
1175 skipchr_keepstart();
1176 break;
1177 case Magic('c'):
1178 regflags |= RF_ICASE;
1179 skipchr_keepstart();
1180 break;
1181 case Magic('C'):
1182 regflags |= RF_NOICASE;
1183 skipchr_keepstart();
1184 break;
1185 case Magic('v'):
1186 reg_magic = MAGIC_ALL;
1187 skipchr_keepstart();
1188 curchr = -1;
1189 break;
1190 case Magic('m'):
1191 reg_magic = MAGIC_ON;
1192 skipchr_keepstart();
1193 curchr = -1;
1194 break;
1195 case Magic('M'):
1196 reg_magic = MAGIC_OFF;
1197 skipchr_keepstart();
1198 curchr = -1;
1199 break;
1200 case Magic('V'):
1201 reg_magic = MAGIC_NONE;
1202 skipchr_keepstart();
1203 curchr = -1;
1204 break;
1205 default:
1206 latest = regpiece(&flags);
1207 if (latest == NULL)
1208 return NULL;
1209 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
1210 if (chain == NULL) /* First piece. */
1211 *flagp |= flags & SPSTART;
1212 else
1213 regtail(chain, latest);
1214 chain = latest;
1215 if (first == NULL)
1216 first = latest;
1217 break;
1218 }
1219 }
1220 if (first == NULL) /* Loop ran zero times. */
1221 first = regnode(NOTHING);
1222 return first;
1223 }
1224
1225 /*
1226 * regpiece - something followed by possible [*+=]
1227 *
1228 * Note that the branching code sequences used for = and the general cases
1229 * of * and + are somewhat optimized: they use the same NOTHING node as
1230 * both the endmarker for their branch list and the body of the last branch.
1231 * It might seem that this node could be dispensed with entirely, but the
1232 * endmarker role is not redundant.
1233 */
1234 static char_u *
1235 regpiece(flagp)
1236 int *flagp;
1237 {
1238 char_u *ret;
1239 int op;
1240 char_u *next;
1241 int flags;
1242 long minval;
1243 long maxval;
1244
1245 ret = regatom(&flags);
1246 if (ret == NULL)
1247 return NULL;
1248
1249 op = peekchr();
1250 if (re_multi_type(op) == NOT_MULTI)
1251 {
1252 *flagp = flags;
1253 return ret;
1254 }
1255 if (!(flags & HASWIDTH) && re_multi_type(op) == MULTI_MULT)
1256 {
1257 if (op == Magic('*'))
1258 EMSG_M_RET_NULL(_("E56: %s* operand could be empty"),
1259 reg_magic >= MAGIC_ON);
1260 if (op == Magic('+'))
1261 EMSG_M_RET_NULL(_("E57: %s+ operand could be empty"),
1262 reg_magic == MAGIC_ALL);
1263 /* "\{}" is checked below, it's allowed when there is an upper limit */
1264 }
1265 /* default flags */
1266 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
1267
1268 skipchr();
1269 switch (op)
1270 {
1271 case Magic('*'):
1272 if (flags & SIMPLE)
1273 reginsert(STAR, ret);
1274 else
1275 {
1276 /* Emit x* as (x&|), where & means "self". */
1277 reginsert(BRANCH, ret); /* Either x */
1278 regoptail(ret, regnode(BACK)); /* and loop */
1279 regoptail(ret, ret); /* back */
1280 regtail(ret, regnode(BRANCH)); /* or */
1281 regtail(ret, regnode(NOTHING)); /* null. */
1282 }
1283 break;
1284
1285 case Magic('+'):
1286 if (flags & SIMPLE)
1287 reginsert(PLUS, ret);
1288 else
1289 {
1290 /* Emit x+ as x(&|), where & means "self". */
1291 next = regnode(BRANCH); /* Either */
1292 regtail(ret, next);
1293 regtail(regnode(BACK), ret); /* loop back */
1294 regtail(next, regnode(BRANCH)); /* or */
1295 regtail(ret, regnode(NOTHING)); /* null. */
1296 }
1297 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1298 break;
1299
1300 case Magic('@'):
1301 {
1302 int lop = END;
1303
1304 switch (no_Magic(getchr()))
1305 {
1306 case '=': lop = MATCH; break; /* \@= */
1307 case '!': lop = NOMATCH; break; /* \@! */
1308 case '>': lop = SUBPAT; break; /* \@> */
1309 case '<': switch (no_Magic(getchr()))
1310 {
1311 case '=': lop = BEHIND; break; /* \@<= */
1312 case '!': lop = NOBEHIND; break; /* \@<! */
1313 }
1314 }
1315 if (lop == END)
1316 EMSG_M_RET_NULL(_("E59: invalid character after %s@"),
1317 reg_magic == MAGIC_ALL);
1318 /* Look behind must match with behind_pos. */
1319 if (lop == BEHIND || lop == NOBEHIND)
1320 {
1321 regtail(ret, regnode(BHPOS));
1322 *flagp |= HASLOOKBH;
1323 }
1324 regtail(ret, regnode(END)); /* operand ends */
1325 reginsert(lop, ret);
1326 break;
1327 }
1328
1329 case Magic('?'):
1330 case Magic('='):
1331 /* Emit x= as (x|) */
1332 reginsert(BRANCH, ret); /* Either x */
1333 regtail(ret, regnode(BRANCH)); /* or */
1334 next = regnode(NOTHING); /* null. */
1335 regtail(ret, next);
1336 regoptail(ret, next);
1337 break;
1338
1339 case Magic('{'):
1340 if (!read_limits(&minval, &maxval))
1341 return NULL;
1342 if (!(flags & HASWIDTH) && (maxval > minval
1343 ? maxval >= MAX_LIMIT : minval >= MAX_LIMIT))
1344 EMSG_M_RET_NULL(_("E58: %s{ operand could be empty"),
1345 reg_magic == MAGIC_ALL);
1346 if (flags & SIMPLE)
1347 {
1348 reginsert(BRACE_SIMPLE, ret);
1349 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1350 }
1351 else
1352 {
1353 if (num_complex_braces >= 10)
1354 EMSG_M_RET_NULL(_("E60: Too many complex %s{...}s"),
1355 reg_magic == MAGIC_ALL);
1356 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
1357 regoptail(ret, regnode(BACK));
1358 regoptail(ret, ret);
1359 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1360 ++num_complex_braces;
1361 }
1362 if (minval > 0 && maxval > 0)
1363 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1364 break;
1365 }
1366 if (re_multi_type(peekchr()) != NOT_MULTI)
1367 {
1368 /* Can't have a multi follow a multi. */
1369 if (peekchr() == Magic('*'))
1370 sprintf((char *)IObuff, _("E61: Nested %s*"),
1371 reg_magic >= MAGIC_ON ? "" : "\\");
1372 else
1373 sprintf((char *)IObuff, _("E62: Nested %s%c"),
1374 reg_magic == MAGIC_ALL ? "" : "\\", no_Magic(peekchr()));
1375 EMSG_RET_NULL(IObuff);
1376 }
1377
1378 return ret;
1379 }
1380
1381 /*
1382 * regatom - the lowest level: parse one item and generate the node(s) for it.
1383 * Returns the first node, or NULL on failure.  "flagp" gets the item's flags.
1384 * Optimization: gobbles an entire sequence of ordinary characters so that
1385 * it can turn them into a single node, which is smaller to store and
1386 * faster to run. Don't do this when one_exactly is set.
1387 */
1388 static char_u *
1389 regatom(flagp)
1390 int *flagp;
1391 {
1392 char_u *ret; /* first node generated for this atom */
1393 int flags; /* flags returned by a nested reg() call */
1394 int cpo_lit; /* 'cpoptions' contains 'l' flag */
1395 int c; /* character obtained from getchr() */
1396 static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU"; /* parallel to classcodes[] */
1397 static int classcodes[] = {ANY, IDENT, SIDENT, KWORD, SKWORD,
1398 FNAME, SFNAME, PRINT, SPRINT,
1399 WHITE, NWHITE, DIGIT, NDIGIT,
1400 HEX, NHEX, OCTAL, NOCTAL,
1401 WORD, NWORD, HEAD, NHEAD,
1402 ALPHA, NALPHA, LOWER, NLOWER,
1403 UPPER, NUPPER
1404 };
1405 char_u *p;
1406 int extra = 0; /* becomes ADD_NL after "\_" */
1407
1408 *flagp = WORST; /* Tentatively. */
1409 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
1410
1411 c = getchr();
1412 switch (c)
1413 {
1414 case Magic('^'):
1415 ret = regnode(BOL);
1416 break;
1417
1418 case Magic('$'):
1419 ret = regnode(EOL);
1420 #if defined(FEAT_SYN_HL) || defined(PROTO)
1421 had_eol = TRUE;
1422 #endif
1423 break;
1424
1425 case Magic('<'):
1426 ret = regnode(BOW);
1427 break;
1428
1429 case Magic('>'):
1430 ret = regnode(EOW);
1431 break;
1432
1433 case Magic('_'):
1434 c = no_Magic(getchr());
1435 if (c == '^') /* "\_^" is start-of-line */
1436 {
1437 ret = regnode(BOL);
1438 break;
1439 }
1440 if (c == '$') /* "\_$" is end-of-line */
1441 {
1442 ret = regnode(EOL);
1443 #if defined(FEAT_SYN_HL) || defined(PROTO)
1444 had_eol = TRUE;
1445 #endif
1446 break;
1447 }
1448
1449 extra = ADD_NL;
1450 *flagp |= HASNL;
1451
1452 /* "\_[" is character range plus newline */
1453 if (c == '[')
1454 goto collection;
1455
1456 /* "\_x" is character class plus newline */
1457 /*FALLTHROUGH*/
1458
1459 /*
1460 * Character classes.
1461 */
1462 case Magic('.'):
1463 case Magic('i'):
1464 case Magic('I'):
1465 case Magic('k'):
1466 case Magic('K'):
1467 case Magic('f'):
1468 case Magic('F'):
1469 case Magic('p'):
1470 case Magic('P'):
1471 case Magic('s'):
1472 case Magic('S'):
1473 case Magic('d'):
1474 case Magic('D'):
1475 case Magic('x'):
1476 case Magic('X'):
1477 case Magic('o'):
1478 case Magic('O'):
1479 case Magic('w'):
1480 case Magic('W'):
1481 case Magic('h'):
1482 case Magic('H'):
1483 case Magic('a'):
1484 case Magic('A'):
1485 case Magic('l'):
1486 case Magic('L'):
1487 case Magic('u'):
1488 case Magic('U'):
1489 p = vim_strchr(classchars, no_Magic(c));
1490 if (p == NULL) /* only possible when coming from "\_" above */
1491 EMSG_RET_NULL(_("E63: invalid use of \\_"));
1492 ret = regnode(classcodes[p - classchars] + extra); /* same index in both tables */
1493 *flagp |= HASWIDTH | SIMPLE;
1494 break;
1495
1496 case Magic('n'):
1497 if (reg_string)
1498 {
1499 /* In a string "\n" matches a newline character. */
1500 ret = regnode(EXACTLY);
1501 regc(NL);
1502 regc(NUL);
1503 *flagp |= HASWIDTH | SIMPLE;
1504 }
1505 else
1506 {
1507 /* In buffer text "\n" matches the end of a line. */
1508 ret = regnode(NEWL);
1509 *flagp |= HASWIDTH | HASNL;
1510 }
1511 break;
1512
1513 case Magic('('):
1514 if (one_exactly)
1515 EMSG_ONE_RET_NULL;
1516 ret = reg(REG_PAREN, &flags);
1517 if (ret == NULL)
1518 return NULL;
1519 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1520 break;
1521
1522 case NUL:
1523 case Magic('|'):
1524 case Magic('&'):
1525 case Magic(')'):
1526 EMSG_RET_NULL(_(e_internal)); /* Supposed to be caught earlier. */
1527 /* NOTREACHED */
1528
1529 case Magic('='):
1530 case Magic('?'):
1531 case Magic('+'):
1532 case Magic('@'):
1533 case Magic('{'):
1534 case Magic('*'):
1535 c = no_Magic(c);
1536 sprintf((char *)IObuff, _("E64: %s%c follows nothing"),
1537 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL)
1538 ? "" : "\\", c);
1539 EMSG_RET_NULL(IObuff);
1540 /* NOTREACHED */
1541
1542 case Magic('~'): /* previous substitute pattern */
1543 if (reg_prev_sub)
1544 {
1545 char_u *lp;
1546
1547 ret = regnode(EXACTLY);
1548 lp = reg_prev_sub;
1549 while (*lp != NUL)
1550 regc(*lp++);
1551 regc(NUL);
1552 if (*reg_prev_sub != NUL)
1553 {
1554 *flagp |= HASWIDTH;
1555 if ((lp - reg_prev_sub) == 1) /* a single byte was copied */
1556 *flagp |= SIMPLE;
1557 }
1558 }
1559 else
1560 EMSG_RET_NULL(_(e_nopresub));
1561 break;
1562
1563 case Magic('1'):
1564 case Magic('2'):
1565 case Magic('3'):
1566 case Magic('4'):
1567 case Magic('5'):
1568 case Magic('6'):
1569 case Magic('7'):
1570 case Magic('8'):
1571 case Magic('9'):
1572 {
1573 int refnum;
1574
1575 refnum = c - Magic('0');
1576 /*
1577 * Check if the back reference is legal. We must have seen the
1578 * close brace.
1579 * TODO: Should also check that we don't refer to something
1580 * that is repeated (+*=): what instance of the repetition
1581 * should we match?
1582 */
1583 if (!had_endbrace[refnum])
1584 {
1585 /* Trick: check if "@<=" or "@<!" follows, in which case
1586 * the \1 can appear before the referenced match. */
1587 for (p = regparse; *p != NUL; ++p)
1588 if (p[0] == '@' && p[1] == '<'
1589 && (p[2] == '!' || p[2] == '='))
1590 break;
1591 if (*p == NUL) /* scanned to the end without finding one */
1592 EMSG_RET_NULL(_("E65: Illegal back reference"));
1593 }
1594 ret = regnode(BACKREF + refnum);
1595 }
1596 break;
1597
1598 #ifdef FEAT_SYN_HL
1599 case Magic('z'):
1600 {
1601 c = no_Magic(getchr());
1602 switch (c)
1603 {
1604 case '(': if (reg_do_extmatch != REX_SET)
1605 EMSG_RET_NULL(_("E66: \\z( not allowed here"));
1606 if (one_exactly)
1607 EMSG_ONE_RET_NULL;
1608 ret = reg(REG_ZPAREN, &flags);
1609 if (ret == NULL)
1610 return NULL;
1611 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1612 re_has_z = REX_SET;
1613 break;
1614
1615 case '1':
1616 case '2':
1617 case '3':
1618 case '4':
1619 case '5':
1620 case '6':
1621 case '7':
1622 case '8':
1623 case '9': if (reg_do_extmatch != REX_USE)
1624 EMSG_RET_NULL(_("E67: \\z1 et al. not allowed here"));
1625 ret = regnode(ZREF + c - '0');
1626 re_has_z = REX_USE;
1627 break;
1628
1629 case 's': ret = regnode(MOPEN + 0); /* "\zs" */
1630 break;
1631
1632 case 'e': ret = regnode(MCLOSE + 0); /* "\ze" */
1633 break;
1634
1635 default: EMSG_RET_NULL(_("E68: Invalid character after \\z"));
1636 }
1637 }
1638 break;
1639 #endif
1640
1641 case Magic('%'):
1642 {
1643 c = no_Magic(getchr());
1644 switch (c)
1645 {
1646 /* () without a back reference */
1647 case '(':
1648 if (one_exactly)
1649 EMSG_ONE_RET_NULL;
1650 ret = reg(REG_NPAREN, &flags);
1651 if (ret == NULL)
1652 return NULL;
1653 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1654 break;
1655
1656 /* Catch \%^ and \%$ regardless of where they appear in the
1657 * pattern -- regardless of whether or not it makes sense. */
1658 case '^':
1659 ret = regnode(RE_BOF);
1660 break;
1661
1662 case '$':
1663 ret = regnode(RE_EOF);
1664 break;
1665
1666 case '#':
1667 ret = regnode(CURSOR);
1668 break;
1669
1670 /* \%[abc]: Emit as a list of branches, all ending at the last
1671 * branch which matches nothing. */
1672 case '[':
1673 if (one_exactly) /* doesn't nest */
1674 EMSG_ONE_RET_NULL;
1675 {
1676 char_u *lastbranch;
1677 char_u *lastnode = NULL;
1678 char_u *br;
1679
1680 ret = NULL;
1681 while ((c = getchr()) != ']')
1682 {
1683 if (c == NUL)
1684 EMSG_M_RET_NULL(_("E69: Missing ] after %s%%["),
1685 reg_magic == MAGIC_ALL);
1686 br = regnode(BRANCH);
1687 if (ret == NULL)
1688 ret = br;
1689 else
1690 regtail(lastnode, br);
1691
1692 ungetchr(); /* let the recursive call read the character again */
1693 one_exactly = TRUE; /* each item must be a single atom */
1694 lastnode = regatom(flagp);
1695 one_exactly = FALSE;
1696 if (lastnode == NULL)
1697 return NULL;
1698 }
1699 if (ret == NULL)
1700 EMSG_M_RET_NULL(_("E70: Empty %s%%[]"),
1701 reg_magic == MAGIC_ALL);
1702 lastbranch = regnode(BRANCH);
1703 br = regnode(NOTHING);
1704 if (ret != JUST_CALC_SIZE) /* no nodes exist to link while only sizing */
1705 {
1706 regtail(lastnode, br);
1707 regtail(lastbranch, br);
1708 /* connect all branches to the NOTHING
1709 * branch at the end */
1710 for (br = ret; br != lastnode; )
1711 {
1712 if (OP(br) == BRANCH)
1713 {
1714 regtail(br, lastbranch);
1715 br = OPERAND(br);
1716 }
1717 else
1718 br = regnext(br);
1719 }
1720 }
1721 *flagp &= ~HASWIDTH; /* the last branch matches nothing */
1722 break;
1723 }
1724
1725 default:
1726 if (VIM_ISDIGIT(c) || c == '<' || c == '>')
1727 {
1728 long_u n = 0;
1729 int cmp;
1730
1731 cmp = c; /* '<', '>' or the first digit */
1732 if (cmp == '<' || cmp == '>')
1733 c = getchr();
1734 while (VIM_ISDIGIT(c))
1735 {
1736 n = n * 10 + (c - '0');
1737 c = getchr();
1738 }
1739 if (c == 'l' || c == 'c' || c == 'v')
1740 {
1741 if (c == 'l')
1742 ret = regnode(RE_LNUM);
1743 else if (c == 'c')
1744 ret = regnode(RE_COL);
1745 else
1746 ret = regnode(RE_VCOL);
1747 if (ret == JUST_CALC_SIZE)
1748 regsize += 5; /* four bytes for the number, one for "cmp" */
1749 else
1750 {
1751 /* put the number and the optional
1752 * comparator after the opcode */
1753 regcode = re_put_long(regcode, n);
1754 *regcode++ = cmp;
1755 }
1756 break;
1757 }
1758 }
1759
1760 EMSG_M_RET_NULL(_("E71: Invalid character after %s%%"),
1761 reg_magic == MAGIC_ALL);
1762 }
1763 }
1764 break;
1765
1766 case Magic('['):
1767 collection:
1768 {
1769 char_u *lp;
1770
1771 /*
1772 * If there is no matching ']', we assume the '[' is a normal
1773 * character. This makes 'incsearch' and ":help [" work.
1774 */
1775 lp = skip_anyof(regparse);
1776 if (*lp == ']') /* there is a matching ']' */
1777 {
1778 int startc = -1; /* > 0 when next '-' is a range */
1779 int endc;
1780
1781 /*
1782 * In a character class, different parsing rules apply.
1783 * Not even \ is special anymore, nothing is.
1784 */
1785 if (*regparse == '^') /* Complement of range. */
1786 {
1787 ret = regnode(ANYBUT + extra);
1788 regparse++;
1789 }
1790 else
1791 ret = regnode(ANYOF + extra);
1792
1793 /* At the start ']' and '-' mean the literal character. */
1794 if (*regparse == ']' || *regparse == '-')
1795 regc(*regparse++);
1796
1797 while (*regparse != NUL && *regparse != ']')
1798 {
1799 if (*regparse == '-')
1800 {
1801 ++regparse;
1802 /* The '-' is not used for a range at the end and
1803 * after or before a '\n'. */
1804 if (*regparse == ']' || *regparse == NUL
1805 || startc == -1
1806 || (regparse[0] == '\\' && regparse[1] == 'n'))
1807 {
1808 regc('-');
1809 startc = '-'; /* [--x] is a range */
1810 }
1811 else
1812 {
1813 #ifdef FEAT_MBYTE
1814 if (has_mbyte)
1815 endc = mb_ptr2char_adv(&regparse);
1816 else
1817 #endif
1818 endc = *regparse++;
1819 if (startc > endc)
1820 EMSG_RET_NULL(_(e_invrange));
1821 #ifdef FEAT_MBYTE
1822 if (has_mbyte && ((*mb_char2len)(startc) > 1
1823 || (*mb_char2len)(endc) > 1))
1824 {
1825 /* Limit to a range of 256 chars */
1826 if (endc > startc + 256)
1827 EMSG_RET_NULL(_(e_invrange));
1828 while (++startc <= endc) /* startc itself was emitted before */
1829 regmbc(startc);
1830 }
1831 else
1832 #endif
1833 {
1834 #ifdef EBCDIC
1835 int alpha_only = FALSE;
1836
1837 /* for alphabetical range skip the gaps
1838 * 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'. */
1839 if (isalpha(startc) && isalpha(endc))
1840 alpha_only = TRUE;
1841 #endif
1842 while (++startc <= endc) /* startc itself was emitted before */
1843 #ifdef EBCDIC
1844 if (!alpha_only || isalpha(startc))
1845 #endif
1846 regc(startc);
1847 }
1848 startc = -1;
1849 }
1850 }
1851 /*
1852 * Only "\]", "\^", "\\" and the other REGEXP_INRANGE characters are
1853 * special in Vi.  Vim also accepts the REGEXP_ABBR items ("\t", "\e",
1854 * etc.), but only when the 'l' flag in 'cpoptions' is not included.
1855 */
1856 else if (*regparse == '\\'
1857 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1858 || (!cpo_lit
1859 && vim_strchr(REGEXP_ABBR,
1860 regparse[1]) != NULL)))
1861 {
1862 regparse++;
1863 if (*regparse == 'n')
1864 {
1865 /* '\n' in range: also match NL */
1866 if (ret != JUST_CALC_SIZE) /* no opcode to patch while only sizing */
1867 {
1868 if (*ret == ANYBUT)
1869 *ret = ANYBUT + ADD_NL;
1870 else if (*ret == ANYOF)
1871 *ret = ANYOF + ADD_NL;
1872 /* else: must have had a \n already */
1873 }
1874 *flagp |= HASNL;
1875 regparse++;
1876 startc = -1;
1877 }
1878 else
1879 {
1880 startc = backslash_trans(*regparse++);
1881 regc(startc);
1882 }
1883 }
1884 else if (*regparse == '[')
1885 {
1886 int c_class;
1887 int cu;
1888
1889 c_class = skip_class_name(&regparse);
1890 startc = -1;
1891 /* Characters assumed to be 8 bits! */
1892 switch (c_class)
1893 {
1894 case CLASS_NONE:
1895 /* literal '[', allow [[-x] as a range */
1896 startc = *regparse++;
1897 regc(startc);
1898 break;
1899 case CLASS_ALNUM:
1900 for (cu = 1; cu <= 255; cu++)
1901 if (isalnum(cu))
1902 regc(cu);
1903 break;
1904 case CLASS_ALPHA:
1905 for (cu = 1; cu <= 255; cu++)
1906 if (isalpha(cu))
1907 regc(cu);
1908 break;
1909 case CLASS_BLANK:
1910 regc(' ');
1911 regc('\t');
1912 break;
1913 case CLASS_CNTRL:
1914 for (cu = 1; cu <= 255; cu++)
1915 if (iscntrl(cu))
1916 regc(cu);
1917 break;
1918 case CLASS_DIGIT:
1919 for (cu = 1; cu <= 255; cu++)
1920 if (VIM_ISDIGIT(cu))
1921 regc(cu);
1922 break;
1923 case CLASS_GRAPH:
1924 for (cu = 1; cu <= 255; cu++)
1925 if (isgraph(cu))
1926 regc(cu);
1927 break;
1928 case CLASS_LOWER:
1929 for (cu = 1; cu <= 255; cu++)
1930 if (islower(cu))
1931 regc(cu);
1932 break;
1933 case CLASS_PRINT:
1934 for (cu = 1; cu <= 255; cu++)
1935 if (vim_isprintc(cu))
1936 regc(cu);
1937 break;
1938 case CLASS_PUNCT:
1939 for (cu = 1; cu <= 255; cu++)
1940 if (ispunct(cu))
1941 regc(cu);
1942 break;
1943 case CLASS_SPACE:
1944 for (cu = 9; cu <= 13; cu++)
1945 regc(cu);
1946 regc(' ');
1947 break;
1948 case CLASS_UPPER:
1949 for (cu = 1; cu <= 255; cu++)
1950 if (isupper(cu))
1951 regc(cu);
1952 break;
1953 case CLASS_XDIGIT:
1954 for (cu = 1; cu <= 255; cu++)
1955 if (vim_isxdigit(cu))
1956 regc(cu);
1957 break;
1958 case CLASS_TAB:
1959 regc('\t');
1960 break;
1961 case CLASS_RETURN:
1962 regc('\r');
1963 break;
1964 case CLASS_BACKSPACE:
1965 regc('\b');
1966 break;
1967 case CLASS_ESCAPE:
1968 regc('\033');
1969 break;
1970 }
1971 }
1972 else
1973 {
1974 #ifdef FEAT_MBYTE
1975 if (has_mbyte)
1976 {
1977 int len;
1978
1979 /* produce a multibyte character, including any
1980 * following composing characters */
1981 startc = mb_ptr2char(regparse);
1982 len = (*mb_ptr2len_check)(regparse);
1983 if (enc_utf8 && utf_char2len(startc) != len)
1984 startc = -1; /* composing chars */
1985 while (--len >= 0)
1986 regc(*regparse++);
1987 }
1988 else
1989 #endif
1990 {
1991 startc = *regparse++;
1992 regc(startc);
1993 }
1994 }
1995 }
1996 regc(NUL);
1997 prevchr_len = 1; /* last char was the ']' */
1998 if (*regparse != ']')
1999 EMSG_RET_NULL(_(e_toomsbra)); /* Cannot happen? */
2000 skipchr(); /* let's be friends with the lexer again */
2001 *flagp |= HASWIDTH | SIMPLE;
2002 break;
2003 }
2004 }
2005 /* FALLTHROUGH */
2006
2007 default:
2008 {
2009 int len;
2010
2011 #ifdef FEAT_MBYTE
2012 /* A multi-byte character is handled as a separate atom if it's
2013 * before a multi. */
2014 if (has_mbyte && (*mb_char2len)(c) > 1
2015 && re_multi_type(peekchr()) != NOT_MULTI)
2016 {
2017 ret = regnode(MULTIBYTECODE);
2018 regmbc(c);
2019 *flagp |= HASWIDTH | SIMPLE;
2020 break;
2021 }
2022 #endif
2023
2024 ret = regnode(EXACTLY);
2025
2026 /*
2027 * Append characters as long as:
2028 * - there is no following multi, we then need the character in
2029 * front of it as a single character operand
2030 * - not running into a Magic character
2031 * - "one_exactly" is not set
2032 * But always emit at least one character. Might be a Multi,
2033 * e.g., a "[" without matching "]".
2034 */
2035 for (len = 0; c != NUL && (len == 0
2036 || (re_multi_type(peekchr()) == NOT_MULTI
2037 && !one_exactly
2038 && !is_Magic(c))); ++len)
2039 {
2040 c = no_Magic(c);
2041 #ifdef FEAT_MBYTE
2042 if (has_mbyte)
2043 {
2044 regmbc(c);
2045 if (enc_utf8)
2046 {
2047 int off;
2048 int l;
2049
2050 /* Need to get composing character too, directly
2051 * access regparse for that, because skipchr() skips
2052 * over composing chars. */
2053 ungetchr();
2054 if (*regparse == '\\' && regparse[1] != NUL)
2055 off = 1;
2056 else
2057 off = 0;
2058 for (;;)
2059 {
2060 l = utf_ptr2len_check(regparse + off);
2061 if (!UTF_COMPOSINGLIKE(regparse + off,
2062 regparse + off + l))
2063 break;
2064 off += l;
2065 regmbc(utf_ptr2char(regparse + off));
2066 }
2067 skipchr();
2068 }
2069 }
2070 else
2071 #endif
2072 regc(c);
2073 c = getchr();
2074 }
2075 ungetchr(); /* the character that ended the loop is not part of this atom */
2076
2077 regc(NUL);
2078 *flagp |= HASWIDTH;
2079 if (len == 1)
2080 *flagp |= SIMPLE;
2081 }
2082 break;
2083 }
2084
2085 return ret;
2086 }
2087
2088 /*
2089 * emit a node
2090 * Return pointer to generated code.
2091 */
2092 static char_u *
2093 regnode(op)
2094 int op;
2095 {
2096 char_u *ret;
2097
2098 ret = regcode;
2099 if (ret == JUST_CALC_SIZE)
2100 regsize += 3;
2101 else
2102 {
2103 *regcode++ = op;
2104 *regcode++ = NUL; /* Null "next" pointer. */
2105 *regcode++ = NUL;
2106 }
2107 return ret;
2108 }
2109
2110 /*
2111 * Emit (if appropriate) a byte of code
2112 */
2113 static void
2114 regc(b)
2115 int b;
2116 {
2117 if (regcode == JUST_CALC_SIZE)
2118 regsize++;
2119 else
2120 *regcode++ = b;
2121 }
2122
#ifdef FEAT_MBYTE
/*
 * Emit the bytes of multi-byte character "c", or just count them when only
 * calculating the size.
 */
    static void
regmbc(c)
    int         c;
{
    int         len;

    if (regcode == JUST_CALC_SIZE)
    {
        regsize += (*mb_char2len)(c);
        return;
    }
    len = (*mb_char2bytes)(c, regcode);
    regcode += len;
}
#endif
2137
2138 /*
2139 * reginsert - insert an operator in front of already-emitted operand
2140 *
2141 * Means relocating the operand.
2142 */
2143 static void
2144 reginsert(op, opnd)
2145 int op;
2146 char_u *opnd;
2147 {
2148 char_u *src;
2149 char_u *dst;
2150 char_u *place;
2151
2152 if (regcode == JUST_CALC_SIZE)
2153 {
2154 regsize += 3;
2155 return;
2156 }
2157 src = regcode;
2158 regcode += 3;
2159 dst = regcode;
2160 while (src > opnd)
2161 *--dst = *--src;
2162
2163 place = opnd; /* Op node, where operand used to be. */
2164 *place++ = op;
2165 *place++ = NUL;
2166 *place = NUL;
2167 }
2168
2169 /*
2170 * reginsert_limits - insert an operator in front of already-emitted operand.
2171 * The operator has the given limit values as operands. Also set next pointer.
2172 *
2173 * Means relocating the operand.
2174 */
2175 static void
2176 reginsert_limits(op, minval, maxval, opnd)
2177 int op;
2178 long minval;
2179 long maxval;
2180 char_u *opnd;
2181 {
2182 char_u *src;
2183 char_u *dst;
2184 char_u *place;
2185
2186 if (regcode == JUST_CALC_SIZE)
2187 {
2188 regsize += 11;
2189 return;
2190 }
2191 src = regcode;
2192 regcode += 11;
2193 dst = regcode;
2194 while (src > opnd)
2195 *--dst = *--src;
2196
2197 place = opnd; /* Op node, where operand used to be. */
2198 *place++ = op;
2199 *place++ = NUL;
2200 *place++ = NUL;
2201 place = re_put_long(place, (long_u)minval);
2202 place = re_put_long(place, (long_u)maxval);
2203 regtail(opnd, place);
2204 }
2205
2206 /*
2207 * Write a long as four bytes at "p" and return pointer to the next char.
2208 */
2209 static char_u *
2210 re_put_long(p, val)
2211 char_u *p;
2212 long_u val;
2213 {
2214 *p++ = (char_u) ((val >> 24) & 0377);
2215 *p++ = (char_u) ((val >> 16) & 0377);
2216 *p++ = (char_u) ((val >> 8) & 0377);
2217 *p++ = (char_u) (val & 0377);
2218 return p;
2219 }
2220
2221 /*
2222 * regtail - set the next-pointer at the end of a node chain
2223 */
2224 static void
2225 regtail(p, val)
2226 char_u *p;
2227 char_u *val;
2228 {
2229 char_u *scan;
2230 char_u *temp;
2231 int offset;
2232
2233 if (p == JUST_CALC_SIZE)
2234 return;
2235
2236 /* Find last node. */
2237 scan = p;
2238 for (;;)
2239 {
2240 temp = regnext(scan);
2241 if (temp == NULL)
2242 break;
2243 scan = temp;
2244 }
2245
2246 if (OP(scan) == BACK)
2247 offset = (int)(scan - val);
2248 else
2249 offset = (int)(val - scan);
2250 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
2251 *(scan + 2) = (char_u) (offset & 0377);
2252 }
2253
2254 /*
2255 * regoptail - regtail on item after a BRANCH; nop if none
2256 */
2257 static void
2258 regoptail(p, val)
2259 char_u *p;
2260 char_u *val;
2261 {
2262 /* When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless" */
2263 if (p == NULL || p == JUST_CALC_SIZE
2264 || (OP(p) != BRANCH
2265 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
2266 return;
2267 regtail(OPERAND(p), val);
2268 }
2269
2270 /*
2271 * getchr() - get the next character from the pattern. We know about
2272 * magic and such, so therefore we need a lexical analyzer.
2273 */
2274
2275 /* static int curchr; */
2276 static int prevprevchr;
2277 static int prevchr;
2278 static int nextchr; /* used for ungetchr() */
2279 /*
2280 * Note: prevchr is sometimes -1 when we are not at the start,
2281 * eg in /[ ^I]^ the pattern was never found even if it existed, because ^ was
2282 * taken to be magic -- webb
2283 */
2284 static int at_start; /* True when on the first character */
2285 static int prev_at_start; /* True when on the second character */
2286
2287 static void
2288 initchr(str)
2289 char_u *str;
2290 {
2291 regparse = str;
2292 prevchr_len = 0;
2293 curchr = prevprevchr = prevchr = nextchr = -1;
2294 at_start = TRUE;
2295 prev_at_start = FALSE;
2296 }
2297
2298 static int
2299 peekchr()
2300 {
2301 if (curchr == -1)
2302 {
2303 switch (curchr = regparse[0])
2304 {
2305 case '.':
2306 case '[':
2307 case '~':
2308 /* magic when 'magic' is on */
2309 if (reg_magic >= MAGIC_ON)
2310 curchr = Magic(curchr);
2311 break;
2312 case '(':
2313 case ')':
2314 case '{':
2315 case '%':
2316 case '+':
2317 case '=':
2318 case '?':
2319 case '@':
2320 case '!':
2321 case '&':
2322 case '|':
2323 case '<':
2324 case '>':
2325 case '#': /* future ext. */
2326 case '"': /* future ext. */
2327 case '\'': /* future ext. */
2328 case ',': /* future ext. */
2329 case '-': /* future ext. */
2330 case ':': /* future ext. */
2331 case ';': /* future ext. */
2332 case '`': /* future ext. */
2333 case '/': /* Can't be used in / command */
2334 /* magic only after "\v" */
2335 if (reg_magic == MAGIC_ALL)
2336 curchr = Magic(curchr);
2337 break;
2338 case '*':
2339 /* * is not magic as the very first character, eg "?*ptr" and when
2340 * after '^', eg "/^*ptr" */
2341 if (reg_magic >= MAGIC_ON && !at_start
2342 && !(prev_at_start && prevchr == Magic('^')))
2343 curchr = Magic('*');
2344 break;
2345 case '^':
2346 /* '^' is only magic as the very first character and if it's after
2347 * "\(", "\|", "\&' or "\n" */
2348 if (reg_magic >= MAGIC_OFF
2349 && (at_start
2350 || reg_magic == MAGIC_ALL
2351 || prevchr == Magic('(')
2352 || prevchr == Magic('|')
2353 || prevchr == Magic('&')
2354 || prevchr == Magic('n')
2355 || (no_Magic(prevchr) == '('
2356 && prevprevchr == Magic('%'))))
2357 {
2358 curchr = Magic('^');
2359 at_start = TRUE;
2360 prev_at_start = FALSE;
2361 }
2362 break;
2363 case '$':
2364 /* '$' is only magic as the very last char and if it's in front of
2365 * either "\|", "\)", "\&", or "\n" */
2366 if (reg_magic >= MAGIC_OFF)
2367 {
2368 char_u *p = regparse + 1;
2369
2370 /* ignore \c \C \m and \M after '$' */
2371 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
2372 || p[1] == 'm' || p[1] == 'M' || p[1] == 'Z'))
2373 p += 2;
2374 if (p[0] == NUL
2375 || (p[0] == '\\'
2376 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
2377 || p[1] == 'n'))
2378 || reg_magic == MAGIC_ALL)
2379 curchr = Magic('$');
2380 }
2381 break;
2382 case '\\':
2383 {
2384 int c = regparse[1];
2385
2386 if (c == NUL)
2387 curchr = '\\'; /* trailing '\' */
2388 else if (
2389 #ifdef EBCDIC
2390 vim_strchr(META, c)
2391 #else
2392 c <= '~' && META_flags[c]
2393 #endif
2394 )
2395 {
2396 /*
2397 * META contains everything that may be magic sometimes,
2398 * except ^ and $ ("\^" and "\$" are only magic after
2399 * "\v"). We now fetch the next character and toggle its
2400 * magicness. Therefore, \ is so meta-magic that it is
2401 * not in META.
2402 */
2403 curchr = -1;
2404 prev_at_start = at_start;
2405 at_start = FALSE; /* be able to say "/\*ptr" */
2406 ++regparse;
2407 peekchr();
2408 --regparse;
2409 curchr = toggle_Magic(curchr);
2410 }
2411 else if (vim_strchr(REGEXP_ABBR, c))
2412 {
2413 /*
2414 * Handle abbreviations, like "\t" for TAB -- webb
2415 */
2416 curchr = backslash_trans(c);
2417 }
2418 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
2419 curchr = toggle_Magic(c);
2420 else
2421 {
2422 /*
2423 * Next character can never be (made) magic?
2424 * Then backslashing it won't do anything.
2425 */
2426 #ifdef FEAT_MBYTE
2427 if (has_mbyte)
2428 curchr = (*mb_ptr2char)(regparse + 1);
2429 else
2430 #endif
2431 curchr = c;
2432 }
2433 break;
2434 }
2435
2436 #ifdef FEAT_MBYTE
2437 default:
2438 if (has_mbyte)
2439 curchr = (*mb_ptr2char)(regparse);
2440 #endif
2441 }
2442 }
2443
2444 return curchr;
2445 }
2446
2447 /*
2448 * Eat one lexed character. Do this in a way that we can undo it.
2449 */
2450 static void
2451 skipchr()
2452 {
2453 /* peekchr() eats a backslash, do the same here */
2454 if (*regparse == '\\')
2455 prevchr_len = 1;
2456 else
2457 prevchr_len = 0;
2458 if (regparse[prevchr_len] != NUL)
2459 {
2460 #ifdef FEAT_MBYTE
2461 if (has_mbyte)
2462 prevchr_len += (*mb_ptr2len_check)(regparse + prevchr_len);
2463 else
2464 #endif
2465 ++prevchr_len;
2466 }
2467 regparse += prevchr_len;
2468 prev_at_start = at_start;
2469 at_start = FALSE;
2470 prevprevchr = prevchr;
2471 prevchr = curchr;
2472 curchr = nextchr; /* use previously unget char, or -1 */
2473 nextchr = -1;
2474 }
2475
2476 /*
2477 * Skip a character while keeping the value of prev_at_start for at_start.
2478 * prevchr and prevprevchr are also kept.
2479 */
2480 static void
2481 skipchr_keepstart()
2482 {
2483 int as = prev_at_start;
2484 int pr = prevchr;
2485 int prpr = prevprevchr;
2486
2487 skipchr();
2488 at_start = as;
2489 prevchr = pr;
2490 prevprevchr = prpr;
2491 }
2492
/*
 * Return the next lexed character of the pattern and consume it.
 */
static int
getchr()
{
    int         lexed;

    lexed = peekchr();
    skipchr();
    return lexed;
}
2501
2502 /*
2503 * put character back. Works only once!
2504 */
2505 static void
2506 ungetchr()
2507 {
2508 nextchr = curchr;
2509 curchr = prevchr;
2510 prevchr = prevprevchr;
2511 at_start = prev_at_start;
2512 prev_at_start = FALSE;
2513
2514 /* Backup regparse, so that it's at the same position as before the
2515 * getchr(). */
2516 regparse -= prevchr_len;
2517 }
2518
2519 /*
2520 * read_limits - Read two integers to be taken as a minimum and maximum.
2521 * If the first character is '-', then the range is reversed.
2522 * Should end with 'end'. If minval is missing, zero is default, if maxval is
2523 * missing, a very big number is the default.
2524 */
2525 static int
2526 read_limits(minval, maxval)
2527 long *minval;
2528 long *maxval;
2529 {
2530 int reverse = FALSE;
2531 char_u *first_char;
2532 long tmp;
2533
2534 if (*regparse == '-')
2535 {
2536 /* Starts with '-', so reverse the range later */
2537 regparse++;
2538 reverse = TRUE;
2539 }
2540 first_char = regparse;
2541 *minval = getdigits(&regparse);
2542 if (*regparse == ',') /* There is a comma */
2543 {
2544 if (vim_isdigit(*++regparse))
2545 *maxval = getdigits(&regparse);
2546 else
2547 *maxval = MAX_LIMIT;
2548 }
2549 else if (VIM_ISDIGIT(*first_char))
2550 *maxval = *minval; /* It was \{n} or \{-n} */
2551 else
2552 *maxval = MAX_LIMIT; /* It was \{} or \{-} */
2553 if (*regparse == '\\')
2554 regparse++; /* Allow either \{...} or \{...\} */
2555 if (*regparse != '}' || (*maxval == 0 && *minval == 0))
2556 {
2557 sprintf((char *)IObuff, _("E554: Syntax error in %s{...}"),
2558 reg_magic == MAGIC_ALL ? "" : "\\");
2559 EMSG_RET_FAIL(IObuff);
2560 }
2561
2562 /*
2563 * Reverse the range if there was a '-', or make sure it is in the right
2564 * order otherwise.
2565 */
2566 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
2567 {
2568 tmp = *minval;
2569 *minval = *maxval;
2570 *maxval = tmp;
2571 }
2572 skipchr(); /* let's be friends with the lexer again */
2573 return OK;
2574 }
2575
2576 /*
2577 * vim_regexec and friends
2578 */
2579
2580 /*
2581 * Global work variables for vim_regexec().
2582 */
2583
2584 /* The current match-position is remembered with these variables: */
2585 static linenr_T reglnum; /* line number, relative to first line */
2586 static char_u *regline; /* start of current line */
2587 static char_u *reginput; /* current input, points into "regline" */
2588
2589 static int need_clear_subexpr; /* subexpressions still need to be
2590 * cleared */
2591 #ifdef FEAT_SYN_HL
2592 static int need_clear_zsubexpr = FALSE; /* extmatch subexpressions
2593 * still need to be cleared */
2594 #endif
2595
2596 static int out_of_stack; /* TRUE when ran out of stack space */
2597
2598 /*
2599 * Structure used to save the current input state, when it needs to be
2600 * restored after trying a match. Used by reg_save() and reg_restore().
2601 */
2602 typedef struct
2603 {
2604 union
2605 {
2606 char_u *ptr; /* reginput pointer, for single-line regexp */
2607 lpos_T pos; /* reginput pos, for multi-line regexp */
2608 } rs_u;
2609 } regsave_T;
2610
2611 /* struct to save start/end pointer/position in for \(\) */
2612 typedef struct
2613 {
2614 union
2615 {
2616 char_u *ptr;
2617 lpos_T pos;
2618 } se_u;
2619 } save_se_T;
2620
2621 static char_u *reg_getline __ARGS((linenr_T lnum));
2622 static long vim_regexec_both __ARGS((char_u *line, colnr_T col));
2623 static long regtry __ARGS((regprog_T *prog, colnr_T col));
2624 static void cleanup_subexpr __ARGS((void));
2625 #ifdef FEAT_SYN_HL
2626 static void cleanup_zsubexpr __ARGS((void));
2627 #endif
2628 static void reg_nextline __ARGS((void));
2629 static void reg_save __ARGS((regsave_T *save));
2630 static void reg_restore __ARGS((regsave_T *save));
2631 static int reg_save_equal __ARGS((regsave_T *save));
2632 static void save_se_multi __ARGS((save_se_T *savep, lpos_T *posp));
2633 static void save_se_one __ARGS((save_se_T *savep, char_u **pp));
2634
2635 /* Save the sub-expressions before attempting a match. */
2636 #define save_se(savep, posp, pp) \
2637 REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp))
2638
2639 /* After a failed match restore the sub-expressions. */
2640 #define restore_se(savep, posp, pp) { \
2641 if (REG_MULTI) \
2642 *(posp) = (savep)->se_u.pos; \
2643 else \
2644 *(pp) = (savep)->se_u.ptr; }
2645
2646 static int re_num_cmp __ARGS((long_u val, char_u *scan));
2647 static int regmatch __ARGS((char_u *prog));
2648 static int regrepeat __ARGS((char_u *p, long maxcount));
2649
2650 #ifdef DEBUG
2651 int regnarrate = 0;
2652 #endif
2653
2654 /*
2655 * Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
2656 * Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
2657 * contains '\c' or '\C' the value is overruled.
2658 */
2659 static int ireg_ic;
2660
2661 #ifdef FEAT_MBYTE
2662 /*
2663 * Similar to ireg_ic, but only for 'combining' characters. Set with \Z flag
2664 * in the regexp. Defaults to false, always.
2665 */
2666 static int ireg_icombine;
2667 #endif
2668
2669 /*
2670 * Sometimes need to save a copy of a line. Since alloc()/free() is very
2671 * slow, we keep one allocated piece of memory and only re-allocate it when
2672 * it's too small. It's freed in vim_regexec_both() when finished.
2673 */
2674 static char_u *reg_tofree;
2675 static unsigned reg_tofreelen;
2676
2677 /*
2678 * These variables are set when executing a regexp to speed up the execution.
2679 * Which ones are set depends on whethere a single-line or multi-line match is
2680 * done:
2681 * single-line multi-line
2682 * reg_match &regmatch_T NULL
2683 * reg_mmatch NULL &regmmatch_T
2684 * reg_startp reg_match->startp <invalid>
2685 * reg_endp reg_match->endp <invalid>
2686 * reg_startpos <invalid> reg_mmatch->startpos
2687 * reg_endpos <invalid> reg_mmatch->endpos
2688 * reg_win NULL window in which to search
2689 * reg_buf <invalid> buffer in which to search
2690 * reg_firstlnum <invalid> first line in which to search
2691 * reg_maxline 0 last line nr
2692 * reg_line_lbr FALSE or TRUE FALSE
2693 */
2694 static regmatch_T *reg_match;
2695 static regmmatch_T *reg_mmatch;
2696 static char_u **reg_startp = NULL;
2697 static char_u **reg_endp = NULL;
2698 static lpos_T *reg_startpos = NULL;
2699 static lpos_T *reg_endpos = NULL;
2700 static win_T *reg_win;
2701 static buf_T *reg_buf;
2702 static linenr_T reg_firstlnum;
2703 static linenr_T reg_maxline;
2704 static int reg_line_lbr; /* "\n" in string is line break */
2705
2706 /*
2707 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
2708 */
2709 static char_u *
2710 reg_getline(lnum)
2711 linenr_T lnum;
2712 {
2713 /* when looking behind for a match/no-match lnum is negative. But we
2714 * can't go before line 1 */
2715 if (reg_firstlnum + lnum < 1)
2716 return NULL;
2717 return ml_get_buf(reg_buf, reg_firstlnum + lnum, FALSE);
2718 }
2719
2720 static regsave_T behind_pos;
2721
2722 #ifdef FEAT_SYN_HL
2723 static char_u *reg_startzp[NSUBEXP]; /* Workspace to mark beginning */
2724 static char_u *reg_endzp[NSUBEXP]; /* and end of \z(...\) matches */
2725 static lpos_T reg_startzpos[NSUBEXP]; /* idem, beginning pos */
2726 static lpos_T reg_endzpos[NSUBEXP]; /* idem, end pos */
2727 #endif
2728
2729 /* TRUE if using multi-line regexp. */
2730 #define REG_MULTI (reg_match == NULL)
2731
2732 /*
2733 * Match a regexp against a string.
2734 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2735 * Uses curbuf for line count and 'iskeyword'.
2736 *
2737 * Return TRUE if there is a match, FALSE if not.
2738 */
2739 int
2740 vim_regexec(rmp, line, col)
2741 regmatch_T *rmp;
2742 char_u *line; /* string to match against */
2743 colnr_T col; /* column to start looking for match */
2744 {
2745 reg_match = rmp;
2746 reg_mmatch = NULL;
2747 reg_maxline = 0;
2748 reg_line_lbr = FALSE;
2749 reg_win = NULL;
2750 ireg_ic = rmp->rm_ic;
2751 #ifdef FEAT_MBYTE
2752 ireg_icombine = FALSE;
2753 #endif
2754 return (vim_regexec_both(line, col) != 0);
2755 }
2756
2757 #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) || defined(PROTO)
2758 /*
2759 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
2760 */
2761 int
2762 vim_regexec_nl(rmp, line, col)
2763 regmatch_T *rmp;
2764 char_u *line; /* string to match against */
2765 colnr_T col; /* column to start looking for match */
2766 {
2767 reg_match = rmp;
2768 reg_mmatch = NULL;
2769 reg_maxline = 0;
2770 reg_line_lbr = TRUE;
2771 reg_win = NULL;
2772 ireg_ic = rmp->rm_ic;
2773 #ifdef FEAT_MBYTE
2774 ireg_icombine = FALSE;
2775 #endif
2776 return (vim_regexec_both(line, col) != 0);
2777 }
2778 #endif
2779
2780 /*
2781 * Match a regexp against multiple lines.
2782 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2783 * Uses curbuf for line count and 'iskeyword'.
2784 *
2785 * Return zero if there is no match. Return number of lines contained in the
2786 * match otherwise.
2787 */
2788 long
2789 vim_regexec_multi(rmp, win, buf, lnum, col)
2790 regmmatch_T *rmp;
2791 win_T *win; /* window in which to search or NULL */
2792 buf_T *buf; /* buffer in which to search */
2793 linenr_T lnum; /* nr of line to start looking for match */
2794 colnr_T col; /* column to start looking for match */
2795 {
2796 long r;
2797 buf_T *save_curbuf = curbuf;
2798
2799 reg_match = NULL;
2800 reg_mmatch = rmp;
2801 reg_buf = buf;
2802 reg_win = win;
2803 reg_firstlnum = lnum;
2804 reg_maxline = reg_buf->b_ml.ml_line_count - lnum;
2805 reg_line_lbr = FALSE;
2806 ireg_ic = rmp->rmm_ic;
2807 #ifdef FEAT_MBYTE
2808 ireg_icombine = FALSE;
2809 #endif
2810
2811 /* Need to switch to buffer "buf" to make vim_iswordc() work. */
2812 curbuf = buf;
2813 r = vim_regexec_both(NULL, col);
2814 curbuf = save_curbuf;
2815
2816 return r;
2817 }
2818
2819 /*
2820 * Match a regexp against a string ("line" points to the string) or multiple
2821 * lines ("line" is NULL, use reg_getline()).
 * The caller must have set reg_match or reg_mmatch; REG_MULTI tells which
 * one is used.  "col" is the column in the first line where matching starts.
 * Returns zero when there is no match or when an error was reported,
 * otherwise the non-zero value returned by regtry().
 * With HAVE_SETJMP_H the arguments are called "line_arg" and "col_arg" and
 * are copied to the locals "line" and "col" after the SETJMP() call.
2822 */
2823 #ifdef HAVE_SETJMP_H
2824 static long
2825 vim_regexec_both(line_arg, col_arg)
2826 char_u *line_arg;
2827 colnr_T col_arg; /* column to start looking for match */
2828 #else
2829 static long
2830 vim_regexec_both(line, col)
2831 char_u *line;
2832 colnr_T col; /* column to start looking for match */
2833 #endif
2834 {
2835 regprog_T *prog;
2836 char_u *s;
2837 long retval;
2838 #ifdef HAVE_SETJMP_H
2839 char_u *line;
2840 colnr_T col;
2841 #endif
2842
2843 reg_tofree = NULL; /* no line copy yet; freed at "theend" */
2844
2845 #ifdef HAVE_TRY_EXCEPT
2846 __try
2847 {
2848 #endif
2849
2850 #ifdef HAVE_SETJMP_H
2851 /*
2852 * Matching with a regexp may cause a very deep recursive call of
2853 * regmatch(). Vim will crash when running out of stack space. Catch
2854 * this here if the system supports it.
 * When SETJMP() returns non-zero we got here through a jump: report the
 * crash (except for SIGINT when the signal number is known) and return
 * "no match".
2855 */
2856 mch_startjmp();
2857 if (SETJMP(lc_jump_env) != 0)
2858 {
2859 mch_didjmp();
2860 # ifdef SIGHASARG
2861 if (lc_signal != SIGINT)
2862 # endif
2863 EMSG(_("E361: Crash intercepted; regexp too complex?"));
2864 retval = 0L;
2865 goto theend;
2866 }
2867
2868 /* Trick to avoid "might be clobbered by `longjmp'" warning from gcc. */
2869 line = line_arg;
2870 col = col_arg;
2871 #endif
2872 retval = 0L; /* "no match" until regtry() says otherwise */
2873
2874 if (REG_MULTI)
2875 {
 /* Multi-line: the first line comes from the buffer. */
2876 prog = reg_mmatch->regprog;
2877 line = reg_getline((linenr_T)0);
2878 reg_startpos = reg_mmatch->startpos;
2879 reg_endpos = reg_mmatch->endpos;
2880 }
2881 else
2882 {
 /* Single-line: "line" was passed in by the caller. */
2883 prog = reg_match->regprog;
2884 reg_startp = reg_match->startp;
2885 reg_endp = reg_match->endp;
2886 }
2887
2888 /* Be paranoid... */
2889 if (prog == NULL || line == NULL)
2890 {
2891 EMSG(_(e_null));
2892 goto theend; /* retval is still zero */
2893 }
2894
2895 /* Check validity of program. */
2896 if (prog_magic_wrong())
2897 goto theend;
2898
2899 /* If pattern contains "\c" or "\C": overrule value of ireg_ic */
2900 if (prog->regflags & RF_ICASE)
2901 ireg_ic = TRUE;
2902 else if (prog->regflags & RF_NOICASE)
2903 ireg_ic = FALSE;
2904
2905 #ifdef FEAT_MBYTE
2906 /* If pattern contains "\Z" overrule value of ireg_icombine */
2907 if (prog->regflags & RF_ICOMBINE)
2908 ireg_icombine = TRUE;
2909 #endif
2910
2911 /* If there is a "must appear" string, look for it.  When it does not
 * appear in the line at or after "col" there is no match and regtry() is
 * not called at all. */
2912 if (prog->regmust != NULL)
2913 {
2914 int c;
2915
2916 #ifdef FEAT_MBYTE
2917 if (has_mbyte)
2918 c = (*mb_ptr2char)(prog->regmust);
2919 else
2920 #endif
2921 c = *prog->regmust;
2922 s = line + col;
 /* Find the first character, then compare the whole string there. */
2923 while ((s = cstrchr(s, c)) != NULL)
2924 {
2925 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
2926 break; /* Found it. */
2927 #ifdef FEAT_MBYTE
2928 if (has_mbyte)
2929 s += (*mb_ptr2len_check)(s);
2930 else
2931 #endif
2932 ++s;
2933 }
2934 if (s == NULL) /* Not present. */
2935 goto theend;
2936 }
2937
2938 regline = line;
2939 reglnum = 0;
2940 out_of_stack = FALSE;
2941
2942 /* Simplest case: Anchored match need be tried only once.  It is only
 * tried when there is no known start character or when the character at
 * "col" equals it, ignoring case when "ireg_ic" is set. */
2943 if (prog->reganch)
2944 {
2945 int c;
2946
2947 #ifdef FEAT_MBYTE
2948 if (has_mbyte)
2949 c = (*mb_ptr2char)(regline + col);
2950 else
2951 #endif
2952 c = regline[col];
2953 if (prog->regstart == NUL
2954 || prog->regstart == c
2955 || (ireg_ic && ((
2956 #ifdef FEAT_MBYTE
2957 (enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
2958 || (c < 255 && prog->regstart < 255 &&
2959 #endif
2960 TOLOWER_LOC(prog->regstart) == TOLOWER_LOC(c)))))
2961 retval = regtry(prog, col);
2962 else
2963 retval = 0;
2964 }
2965 else
2966 {
2967 /* Messy cases: unanchored match.  Try at every position from "col" to
 * the end of the line, until there is a match, an interrupt or the
 * stack ran out. */
2968 while (!got_int && !out_of_stack)
2969 {
2970 if (prog->regstart != NUL)
2971 {
2972 /* Skip until the char we know it must start with. */
2973 s = cstrchr(regline + col, prog->regstart);
2974 if (s == NULL)
2975 {
2976 retval = 0;
2977 break;
2978 }
2979 col = (int)(s - regline);
2980 }
2981
2982 retval = regtry(prog, col);
2983 if (retval > 0)
2984 break;
2985
2986 /* if not currently on the first line, get it again */
2987 if (reglnum != 0)
2988 {
2989 regline = reg_getline((linenr_T)0);
2990 reglnum = 0;
2991 }
2992 if (regline[col] == NUL)
2993 break; /* tried the end of the line as well */
2994 #ifdef FEAT_MBYTE
2995 if (has_mbyte)
2996 col += (*mb_ptr2len_check)(regline + col);
2997 else
2998 #endif
2999 ++col;
3000 }
3001 }
3002
3003 if (out_of_stack)
3004 EMSG(_("E363: pattern caused out-of-stack error"));
3005
3006 #ifdef HAVE_TRY_EXCEPT
3007 }
3008 __except(EXCEPTION_EXECUTE_HANDLER)
3009 {
 /* Any exception while matching ends up here and gives "no match". */
3010 if (GetExceptionCode() == EXCEPTION_STACK_OVERFLOW)
3011 {
3012 RESETSTKOFLW();
3013 EMSG(_("E363: pattern caused out-of-stack error"));
3014 }
3015 else
3016 EMSG(_("E361: Crash intercepted; regexp too complex?"));
3017 retval = 0L;
3018 }
3019 #endif
3020
3021 theend:
3022 /* Common exit, with or without a match: free the saved line copy. */
3023 vim_free(reg_tofree);
3024 #ifdef HAVE_SETJMP_H
3025 mch_endjmp();
3026 #endif
3027 return retval;
3028 }
3029
3030 #ifdef FEAT_SYN_HL
3031 static reg_extmatch_T *make_extmatch __ARGS((void));
3032
3033 /*
3034 * Create a new extmatch and mark it as referenced once.
3035 */
3036 static reg_extmatch_T *
3037 make_extmatch()
3038 {
3039 reg_extmatch_T *em;
3040
3041 em = (reg_extmatch_T *)alloc_clear((unsigned)sizeof(reg_extmatch_T));
3042 if (em != NULL)
3043 em->refcnt = 1;
3044 return em;
3045 }
3046
3047 /*
3048 * Add a reference to an extmatch.
3049 */
3050 reg_extmatch_T *
3051 ref_extmatch(em)
3052 reg_extmatch_T *em;
3053 {
3054 if (em != NULL)
3055 em->refcnt++;
3056 return em;
3057 }
3058
3059 /*
3060 * Remove a reference to an extmatch. If there are no references left, free
3061 * the info.
3062 */
3063 void
3064 unref_extmatch(em)
3065 reg_extmatch_T *em;
3066 {
3067 int i;
3068
3069 if (em != NULL && --em->refcnt <= 0)
3070 {
3071 for (i = 0; i < NSUBEXP; ++i)
3072 vim_free(em->matches[i]);
3073 vim_free(em);
3074 }
3075 }
3076 #endif
3077
3078 /*
3079 * regtry - try match of "prog" with at regline["col"].
3080 * Returns 0 for failure, number of lines contained in the match otherwise.
3081 */
3082 static long
3083 regtry(prog, col)
3084 regprog_T *prog;
3085 colnr_T col;
3086 {
3087 reginput = regline + col;
3088 need_clear_subexpr = TRUE;
3089 #ifdef FEAT_SYN_HL
3090 /* Clear the external match subpointers if necessary. */
3091 if (prog->reghasz == REX_SET)
3092 need_clear_zsubexpr = TRUE;
3093 #endif
3094
3095 if (regmatch(prog->program + 1))
3096 {
3097 cleanup_subexpr();
3098 if (REG_MULTI)
3099 {
3100 if (reg_startpos[0].lnum < 0)
3101 {
3102 reg_startpos[0].lnum = 0;
3103 reg_startpos[0].col = col;
3104 }
3105 if (reg_endpos[0].lnum < 0)
3106 {
3107 reg_endpos[0].lnum = reglnum;
3108 reg_endpos[0].col = (int)(reginput - regline);
3109 }
3110 else
3111 /* Use line number of "\ze". */
3112 reglnum = reg_endpos[0].lnum;
3113 }
3114 else
3115 {
3116 if (reg_startp[0] == NULL)
3117 reg_startp[0] = regline + col;
3118 if (reg_endp[0] == NULL)
3119 reg_endp[0] = reginput;
3120 }
3121 #ifdef FEAT_SYN_HL
3122 /* Package any found \z(...\) matches for export. Default is none. */
3123 unref_extmatch(re_extmatch_out);
3124 re_extmatch_out = NULL;
3125
3126 if (prog->reghasz == REX_SET)
3127 {
3128 int i;
3129
3130 cleanup_zsubexpr();
3131 re_extmatch_out = make_extmatch();
3132 for (i = 0; i < NSUBEXP; i++)
3133 {
3134 if (REG_MULTI)
3135 {
3136 /* Only accept single line matches. */
3137 if (reg_startzpos[i].lnum >= 0
3138 && reg_endzpos[i].lnum == reg_startzpos[i].lnum)
3139 re_extmatch_out->matches[i] =
3140 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
3141 + reg_startzpos[i].col,
3142 reg_endzpos[i].col - reg_startzpos[i].col);
3143 }
3144 else
3145 {
3146 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
3147 re_extmatch_out->matches[i] =
3148 vim_strnsave(reg_startzp[i],
3149 (int)(reg_endzp[i] - reg_startzp[i]));
3150 }
3151 }
3152 }
3153 #endif
3154 return 1 + reglnum;
3155 }
3156 return 0;
3157 }
3158
#ifdef FEAT_MBYTE
/* multi-byte: advance reginput with a function */
# define ADVANCE_REGINPUT() advance_reginput()

static void advance_reginput __ARGS((void));
static int reg_prev_class __ARGS((void));

/*
 * Move "reginput" to the next character: by the length given by
 * mb_ptr2len_check() when "has_mbyte" is set, by one byte otherwise.
 */
    static void
advance_reginput()
{
    reginput += has_mbyte ? (*mb_ptr2len_check)(reginput) : 1;
}

/*
 * Get class of previous character.
 * Returns -1 when "reginput" is at the start of the line.
 */
    static int
reg_prev_class()
{
    char_u      *prev;

    if (reginput <= regline)
        return -1;

    /* Step back one byte, then back up by what mb_head_off() reports for
     * that byte. */
    prev = reginput - 1;
    prev -= (*mb_head_off)(regline, prev);
    return mb_get_class(prev);
}

#else
/* No multi-byte: It's too simple to make a function for. */
# define ADVANCE_REGINPUT() ++reginput
#endif
3191
/*
 * Minimum and maximum from BRACE_LIMITS.  They are actually local to
 * regmatch(), but that function can be called recursively many times, so
 * they are kept here to reduce the amount of stack space it uses.
 */
static long     bl_minval;
static long     bl_maxval;
3199
3200 /*
3201 * regmatch - main matching routine
3202 *
3203 * Conceptually the strategy is simple: Check to see whether the current
3204 * node matches, call self recursively to see whether the rest matches,
3205 * and then act accordingly. In practice we make some effort to avoid
3206 * recursion, in particular by going through "ordinary" nodes (that don't
3207 * need to know whether the rest of the match failed) by a loop instead of
3208 * by recursion.
3209 *
3210 * Returns TRUE when there is a match. Leaves reginput and reglnum just after
3211 * the last matched character.
3212 * Returns FALSE when there is no match. Leaves reginput and reglnum in an
3213 * undefined state!
3214 */
3215 static int
3216 regmatch(scan)
3217 char_u *scan; /* Current node. */
3218 {
3219 char_u *next; /* Next node. */
3220 int op;
3221 int c;
3222
3223 #ifdef HAVE_GETRLIMIT
3224 /* Check if we are running out of stack space. Could be caused by
3225 * recursively calling ourselves. */
3226 if (out_of_stack || mch_stackcheck((char *)&op) == FAIL)
3227 {
3228 out_of_stack = TRUE;
3229 return FALSE;
3230 }
3231 #endif
3232
3233 /* Some patterns my cause a long time to match, even though they are not
3234 * illegal. E.g., "\([a-z]\+\)\+Q". Allow breaking them with CTRL-C. */
3235 fast_breakcheck();
3236
3237 #ifdef DEBUG
3238 if (scan != NULL && regnarrate)
3239 {
3240 mch_errmsg(regprop(scan));
3241 mch_errmsg("(\n");
3242 }
3243 #endif
3244 while (scan != NULL)
3245 {
3246 if (got_int || out_of_stack)
3247 return FALSE;
3248 #ifdef DEBUG
3249 if (regnarrate)
3250 {
3251 mch_errmsg(regprop(scan));
3252 mch_errmsg("...\n");
3253 # ifdef FEAT_SYN_HL
3254 if (re_extmatch_in != NULL)
3255 {
3256 int i;
3257
3258 mch_errmsg(_("External submatches:\n"));
3259 for (i = 0; i < NSUBEXP; i++)
3260 {
3261 mch_errmsg(" \"");
3262 if (re_extmatch_in->matches[i] != NULL)
3263 mch_errmsg(re_extmatch_in->matches[i]);
3264 mch_errmsg("\"\n");
3265 }
3266 }
3267 # endif
3268 }
3269 #endif
3270 next = regnext(scan);
3271
3272 op = OP(scan);
3273 /* Check for character class with NL added. */
3274 if (WITH_NL(op) && *reginput == NUL && reglnum < reg_maxline)
3275 {
3276 reg_nextline();
3277 }
3278 else if (reg_line_lbr && WITH_NL(op) && *reginput == '\n')
3279 {
3280 ADVANCE_REGINPUT();
3281 }
3282 else
3283 {
3284 if (WITH_NL(op))
3285 op -= ADD_NL;
3286 #ifdef FEAT_MBYTE
3287 if (has_mbyte)
3288 c = (*mb_ptr2char)(reginput);
3289 else
3290 #endif
3291 c = *reginput;
3292 switch (op)
3293 {
3294 case BOL:
3295 if (reginput != regline)
3296 return FALSE;
3297 break;
3298
3299 case EOL:
3300 if (c != NUL)
3301 return FALSE;
3302 break;
3303
3304 case RE_BOF:
3305 /* Passing -1 to the getline() function provided for the search
3306 * should always return NULL if the current line is the first
3307 * line of the file. */
3308 if (reglnum != 0 || reginput != regline
3309 || (REG_MULTI && reg_getline((linenr_T)-1) != NULL))
3310 return FALSE;
3311 break;
3312
3313 case RE_EOF:
3314 if (reglnum != reg_maxline || c != NUL)
3315 return FALSE;
3316 break;
3317
3318 case CURSOR:
3319 /* Check if the buffer is in a window and compare the
3320 * reg_win->w_cursor position to the match position. */
3321 if (reg_win == NULL
3322 || (reglnum + reg_firstlnum != reg_win->w_cursor.lnum)
3323 || ((colnr_T)(reginput - regline) != reg_win->w_cursor.col))
3324 return FALSE;
3325 break;
3326
3327 case RE_LNUM:
3328 if (!REG_MULTI || !re_num_cmp((long_u)(reglnum + reg_firstlnum),
3329 scan))
3330 return FALSE;
3331 break;
3332
3333 case RE_COL:
3334 if (!re_num_cmp((long_u)(reginput - regline) + 1, scan))
3335 return FALSE;
3336 break;
3337
3338 case RE_VCOL:
3339 if (!re_num_cmp((long_u)win_linetabsize(
3340 reg_win == NULL ? curwin : reg_win,
3341 regline, (colnr_T)(reginput - regline)) + 1, scan))
3342 return FALSE;
3343 break;
3344
3345 case BOW: /* \<word; reginput points to w */
3346 if (c == NUL) /* Can't match at end of line */
3347 return FALSE;
3348 #ifdef FEAT_MBYTE
3349 if (has_mbyte)
3350 {
3351 int this_class;
3352
3353 /* Get class of current and previous char (if it exists). */
3354 this_class = mb_get_class(reginput);
3355 if (this_class <= 1)
3356 return FALSE; /* not on a word at all */
3357 if (reg_prev_class() == this_class)
3358 return FALSE; /* previous char is in same word */
3359 }
3360 #endif
3361 else
3362 {
3363 if (!vim_iswordc(c)
3364 || (reginput > regline && vim_iswordc(reginput[-1])))
3365 return FALSE;
3366 }
3367 break;
3368
3369 case EOW: /* word\>; reginput points after d */
3370 if (reginput == regline) /* Can't match at start of line */
3371 return FALSE;
3372 #ifdef FEAT_MBYTE
3373 if (has_mbyte)
3374 {
3375 int this_class, prev_class;
3376
3377 /* Get class of current and previous char (if it exists). */
3378 this_class = mb_get_class(reginput);
3379 prev_class = reg_prev_class();
3380 if (this_class == prev_class)
3381 return FALSE;
3382 if (prev_class == 0 || prev_class == 1)
3383 return FALSE;
3384 }
3385 else
3386 #endif
3387 {
3388 if (!vim_iswordc(reginput[-1]))
3389 return FALSE;
3390 if (reginput[0] != NUL && vim_iswordc(c))
3391 return FALSE;
3392 }
3393 break; /* Matched with EOW */
3394
3395 case ANY:
3396 if (c == NUL)
3397 return FALSE;
3398 ADVANCE_REGINPUT();
3399 break;
3400
3401 case IDENT:
3402 if (!vim_isIDc(c))
3403 return FALSE;
3404 ADVANCE_REGINPUT();
3405 break;
3406
3407 case SIDENT:
3408 if (VIM_ISDIGIT(*reginput) || !vim_isIDc(c))
3409 return FALSE;
3410 ADVANCE_REGINPUT();
3411 break;
3412
3413 case KWORD:
3414 if (!vim_iswordp(reginput))
3415 return FALSE;
3416 ADVANCE_REGINPUT();
3417 break;
3418
3419 case SKWORD:
3420 if (VIM_ISDIGIT(*reginput) || !vim_iswordp(reginput))
3421 return FALSE;
3422 ADVANCE_REGINPUT();
3423 break;
3424
3425 case FNAME:
3426 if (!vim_isfilec(c))
3427 return FALSE;
3428 ADVANCE_REGINPUT();
3429 break;
3430
3431 case SFNAME:
3432 if (VIM_ISDIGIT(*reginput) || !vim_isfilec(c))
3433 return FALSE;
3434 ADVANCE_REGINPUT();
3435 break;
3436
3437 case PRINT:
3438 if (ptr2cells(reginput) != 1)
3439 return FALSE;
3440 ADVANCE_REGINPUT();
3441 break;
3442
3443 case SPRINT:
3444 if (VIM_ISDIGIT(*reginput) || ptr2cells(reginput) != 1)
3445 return FALSE;
3446 ADVANCE_REGINPUT();
3447 break;
3448
3449 case WHITE:
3450 if (!vim_iswhite(c))
3451 return FALSE;
3452 ADVANCE_REGINPUT();
3453 break;
3454
3455 case NWHITE:
3456 if (c == NUL || vim_iswhite(c))
3457 return FALSE;
3458 ADVANCE_REGINPUT();
3459 break;
3460
3461 case DIGIT:
3462 if (!ri_digit(c))
3463 return FALSE;
3464 ADVANCE_REGINPUT();
3465 break;
3466
3467 case NDIGIT:
3468 if (c == NUL || ri_digit(c))
3469 return FALSE;
3470 ADVANCE_REGINPUT();
3471 break;
3472
3473 case HEX:
3474 if (!ri_hex(c))
3475 return FALSE;
3476 ADVANCE_REGINPUT();
3477 break;
3478
3479 case NHEX:
3480 if (c == NUL || ri_hex(c))
3481 return FALSE;
3482 ADVANCE_REGINPUT();
3483 break;
3484
3485 case OCTAL:
3486 if (!ri_octal(c))
3487 return FALSE;
3488 ADVANCE_REGINPUT();
3489 break;
3490
3491 case NOCTAL:
3492 if (c == NUL || ri_octal(c))
3493 return FALSE;
3494 ADVANCE_REGINPUT();
3495 break;
3496
3497 case WORD:
3498 if (!ri_word(c))
3499 return FALSE;
3500 ADVANCE_REGINPUT();
3501 break;
3502
3503 case NWORD:
3504 if (c == NUL || ri_word(c))
3505 return FALSE;
3506 ADVANCE_REGINPUT();
3507 break;
3508
3509 case HEAD:
3510 if (!ri_head(c))
3511 return FALSE;
3512 ADVANCE_REGINPUT();
3513 break;
3514
3515 case NHEAD:
3516 if (c == NUL || ri_head(c))
3517 return FALSE;
3518 ADVANCE_REGINPUT();
3519 break;
3520
3521 case ALPHA:
3522 if (!ri_alpha(c))
3523 return FALSE;
3524 ADVANCE_REGINPUT();
3525 break;
3526
3527 case NALPHA:
3528 if (c == NUL || ri_alpha(c))
3529 return FALSE;
3530 ADVANCE_REGINPUT();
3531 break;
3532
3533 case LOWER:
3534 if (!ri_lower(c))
3535 return FALSE;
3536 ADVANCE_REGINPUT();
3537 break;
3538
3539 case NLOWER:
3540 if (c == NUL || ri_lower(c))
3541 return FALSE;
3542 ADVANCE_REGINPUT();
3543 break;
3544
3545 case UPPER:
3546 if (!ri_upper(c))
3547 return FALSE;
3548 ADVANCE_REGINPUT();
3549 break;
3550
3551 case NUPPER:
3552 if (c == NUL || ri_upper(c))
3553 return FALSE;
3554 ADVANCE_REGINPUT();
3555 break;
3556
3557 case EXACTLY:
3558 {
3559 int len;
3560 char_u *opnd;
3561
3562 opnd = OPERAND(scan);
3563 /* Inline the first byte, for speed. */
3564 if (*opnd != *reginput
3565 && (!ireg_ic || (
3566 #ifdef FEAT_MBYTE
3567 !enc_utf8 &&
3568 #endif
3569 TOLOWER_LOC(*opnd) != TOLOWER_LOC(*reginput))))
3570 return FALSE;
3571 if (*opnd == NUL)
3572 {
3573 /* match empty string always works; happens when "~" is
3574 * empty. */
3575 }
3576 else if (opnd[1] == NUL
3577 #ifdef FEAT_MBYTE
3578 && !(enc_utf8 && ireg_ic)
3579 #endif
3580 )
3581 ++reginput; /* matched a single char */
3582 else
3583 {
3584 len = (int)STRLEN(opnd);
3585 /* Need to match first byte again for multi-byte. */
3586 if (cstrncmp(opnd, reginput, &len) != 0)
3587 return FALSE;
3588 #ifdef FEAT_MBYTE
3589 /* Check for following composing character. */
3590 if (enc_utf8 && UTF_COMPOSINGLIKE(reginput, reginput + len))
3591 {
3592 /* raaron: This code makes a composing character get
3593 * ignored, which is the correct behavior (sometimes)
3594 * for voweled Hebrew texts. */
3595 if (!ireg_icombine)
3596 return FALSE;
3597 }
3598 else
3599 #endif
3600 reginput += len;
3601 }
3602 }
3603 break;
3604
3605 case ANYOF:
3606 case ANYBUT:
3607 if (c == NUL)
3608 return FALSE;
3609 if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
3610 return FALSE;
3611 ADVANCE_REGINPUT();
3612 break;
3613
3614 #ifdef FEAT_MBYTE
3615 case MULTIBYTECODE:
3616 if (has_mbyte)
3617 {
3618 int i, len;
3619 char_u *opnd;
3620
3621 opnd = OPERAND(scan);
3622 /* Safety check (just in case 'encoding' was changed since
3623 * compiling the program). */
3624 if ((len = (*mb_ptr2len_check)(opnd)) < 2)
3625 return FALSE;
3626 for (i = 0; i < len; ++i)
3627 if (opnd[i] != reginput[i])
3628 return FALSE;
3629 reginput += len;
3630 }
3631 else
3632 return FALSE;
3633 break;
3634 #endif
3635
3636 case NOTHING:
3637 break;
3638
3639 case BACK:
3640 break;
3641
3642 case MOPEN + 0: /* Match start: \zs */
3643 case MOPEN + 1: /* \( */
3644 case MOPEN + 2:
3645 case MOPEN + 3:
3646 case MOPEN + 4:
3647 case MOPEN + 5:
3648 case MOPEN + 6:
3649 case MOPEN + 7:
3650 case MOPEN + 8:
3651 case MOPEN + 9:
3652 {
3653 int no;
3654 save_se_T save;
3655
3656 no = op - MOPEN;
3657 cleanup_subexpr();
3658 save_se(&save, &reg_startpos[no], &reg_startp[no]);
3659
3660 if (regmatch(next))
3661 return TRUE;
3662
3663 restore_se(&save, &reg_startpos[no], &reg_startp[no]);
3664 return FALSE;
3665 }
3666 /* break; Not Reached */
3667
3668 case NOPEN: /* \%( */
3669 case NCLOSE: /* \) after \%( */
3670 if (regmatch(next))
3671 return TRUE;
3672 return FALSE;
3673 /* break; Not Reached */
3674
3675 #ifdef FEAT_SYN_HL
3676 case ZOPEN + 1:
3677 case ZOPEN + 2:
3678 case ZOPEN + 3:
3679 case ZOPEN + 4:
3680 case ZOPEN + 5:
3681 case ZOPEN + 6:
3682 case ZOPEN + 7:
3683 case ZOPEN + 8:
3684 case ZOPEN + 9:
3685 {
3686 int no;
3687 save_se_T save;
3688
3689 no = op - ZOPEN;
3690 cleanup_zsubexpr();
3691 save_se(&save, &reg_startzpos[no], &reg_startzp[no]);
3692
3693 if (regmatch(next))
3694 return TRUE;
3695
3696 restore_se(&save, &reg_startzpos[no], &reg_startzp[no]);
3697 return FALSE;
3698 }
3699 /* break; Not Reached */
3700 #endif
3701
3702 case MCLOSE + 0: /* Match end: \ze */
3703 case MCLOSE + 1: /* \) */
3704 case MCLOSE + 2:
3705 case MCLOSE + 3:
3706 case MCLOSE + 4:
3707 case MCLOSE + 5:
3708 case MCLOSE + 6:
3709 case MCLOSE + 7:
3710 case MCLOSE + 8:
3711 case MCLOSE + 9:
3712 {
3713 int no;
3714 save_se_T save;
3715
3716 no = op - MCLOSE;
3717 cleanup_subexpr();
3718 save_se(&save, &reg_endpos[no], &reg_endp[no]);
3719
3720 if (regmatch(next))
3721 return TRUE;
3722
3723 restore_se(&save, &reg_endpos[no], &reg_endp[no]);
3724 return FALSE;
3725 }
3726 /* break; Not Reached */
3727
3728 #ifdef FEAT_SYN_HL
3729 case ZCLOSE + 1: /* \) after \z( */
3730 case ZCLOSE + 2:
3731 case ZCLOSE + 3:
3732 case ZCLOSE + 4:
3733 case ZCLOSE + 5:
3734 case ZCLOSE + 6:
3735 case ZCLOSE + 7:
3736 case ZCLOSE + 8:
3737 case ZCLOSE + 9:
3738 {
3739 int no;
3740 save_se_T save;
3741
3742 no = op - ZCLOSE;
3743 cleanup_zsubexpr();
3744 save_se(&save, &reg_endzpos[no], &reg_endzp[no]);
3745
3746 if (regmatch(next))
3747 return TRUE;
3748
3749 restore_se(&save, &reg_endzpos[no], &reg_endzp[no]);
3750 return FALSE;
3751 }
3752 /* break; Not Reached */
3753 #endif
3754
3755 case BACKREF + 1:
3756 case BACKREF + 2:
3757 case BACKREF + 3:
3758 case BACKREF + 4:
3759 case BACKREF + 5:
3760 case BACKREF + 6:
3761 case BACKREF + 7:
3762 case BACKREF + 8:
3763 case BACKREF + 9:
3764 {
3765 int no;
3766 int len;
3767 linenr_T clnum;
3768 colnr_T ccol;
3769 char_u *p;
3770
3771 no = op - BACKREF;
3772 cleanup_subexpr();
3773 if (!REG_MULTI) /* Single-line regexp */
3774 {
3775 if (reg_endp[no] == NULL)
3776 {
3777 /* Backref was not set: Match an empty string. */
3778 len = 0;
3779 }
3780 else
3781 {
3782 /* Compare current input with back-ref in the same
3783 * line. */
3784 len = (int)(reg_endp[no] - reg_startp[no]);
3785 if (cstrncmp(reg_startp[no], reginput, &len) != 0)
3786 return FALSE;
3787 }
3788 }
3789 else /* Multi-line regexp */
3790 {
3791 if (reg_endpos[no].lnum < 0)
3792 {
3793 /* Backref was not set: Match an empty string. */
3794 len = 0;
3795 }
3796 else
3797 {
3798 if (reg_startpos[no].lnum == reglnum
3799 && reg_endpos[no].lnum == reglnum)
3800 {
3801 /* Compare back-ref within the current line. */
3802 len = reg_endpos[no].col - reg_startpos[no].col;
3803 if (cstrncmp(regline + reg_startpos[no].col,
3804 reginput, &len) != 0)
3805 return FALSE;
3806 }
3807 else
3808 {
3809 /* Messy situation: Need to compare between two
3810 * lines. */
3811 ccol = reg_startpos[no].col;
3812 clnum = reg_startpos[no].lnum;
3813 for (;;)
3814 {
3815 /* Since getting one line may invalidate
3816 * the other, need to make copy. Slow! */
3817 if (regline != reg_tofree)
3818 {
3819 len = (int)STRLEN(regline);
3820 if (reg_tofree == NULL
3821 || len >= (int)reg_tofreelen)
3822 {
3823 len += 50; /* get some extra */
3824 vim_free(reg_tofree);
3825 reg_tofree = alloc(len);
3826 if (reg_tofree == NULL)
3827 return FALSE; /* out of memory! */
3828 reg_tofreelen = len;
3829 }
3830 STRCPY(reg_tofree, regline);
3831 reginput = reg_tofree
3832 + (reginput - regline);
3833 regline = reg_tofree;
3834 }
3835
3836 /* Get the line to compare with. */
3837 p = reg_getline(clnum);
3838 if (clnum == reg_endpos[no].lnum)
3839 len = reg_endpos[no].col - ccol;
3840 else
3841 len = (int)STRLEN(p + ccol);
3842
3843 if (cstrncmp(p + ccol, reginput, &len) != 0)
3844 return FALSE; /* doesn't match */
3845 if (clnum == reg_endpos[no].lnum)
3846 break; /* match and at end! */
3847 if (reglnum == reg_maxline)
3848 return FALSE; /* text too short */
3849
3850 /* Advance to next line. */
3851 reg_nextline();
3852 ++clnum;
3853 ccol = 0;
3854 if (got_int || out_of_stack)
3855 return FALSE;
3856 }
3857
3858 /* found a match! Note that regline may now point
3859 * to a copy of the line, that should not matter. */
3860 }
3861 }
3862 }
3863
3864 /* Matched the backref, skip over it. */
3865 reginput += len;
3866 }
3867 break;
3868
3869 #ifdef FEAT_SYN_HL
3870 case ZREF + 1:
3871 case ZREF + 2:
3872 case ZREF + 3:
3873 case ZREF + 4:
3874 case ZREF + 5:
3875 case ZREF + 6:
3876 case ZREF + 7:
3877 case ZREF + 8:
3878 case ZREF + 9:
3879 {
3880 int no;
3881 int len;
3882
3883 cleanup_zsubexpr();
3884 no = op - ZREF;
3885 if (re_extmatch_in != NULL
3886 && re_extmatch_in->matches[no] != NULL)
3887 {
3888 len = (int)STRLEN(re_extmatch_in->matches[no]);
3889 if (cstrncmp(re_extmatch_in->matches[no],
3890 reginput, &len) != 0)
3891 return FALSE;
3892 reginput += len;
3893 }
3894 else
3895 {
3896 /* Backref was not set: Match an empty string. */
3897 }
3898 }
3899 break;
3900 #endif
3901
3902 case BRANCH:
3903 {
3904 if (OP(next) != BRANCH) /* No choice. */
3905 next = OPERAND(scan); /* Avoid recursion. */
3906 else
3907 {
3908 regsave_T save;
3909
3910 do
3911 {
3912 reg_save(&save);
3913 if (regmatch(OPERAND(scan)))
3914 return TRUE;
3915 reg_restore(&save);
3916 scan = regnext(scan);
3917 } while (scan != NULL && OP(scan) == BRANCH);
3918 return FALSE;
3919 /* NOTREACHED */
3920 }
3921 }
3922 break;
3923
3924 case BRACE_LIMITS:
3925 {
3926 int no;
3927
3928 if (OP(next) == BRACE_SIMPLE)
3929 {
3930 bl_minval = OPERAND_MIN(scan);
3931 bl_maxval = OPERAND_MAX(scan);
3932 }
3933 else if (OP(next) >= BRACE_COMPLEX
3934 && OP(next) < BRACE_COMPLEX + 10)
3935 {
3936 no = OP(next) - BRACE_COMPLEX;
3937 brace_min[no] = OPERAND_MIN(scan);
3938 brace_max[no] = OPERAND_MAX(scan);
3939 brace_count[no] = 0;
3940 }
3941 else
3942 {
3943 EMSG(_(e_internal)); /* Shouldn't happen */
3944 return FALSE;
3945 }
3946 }
3947 break;
3948
3949 case BRACE_COMPLEX + 0:
3950 case BRACE_COMPLEX + 1:
3951 case BRACE_COMPLEX + 2:
3952 case BRACE_COMPLEX + 3:
3953 case BRACE_COMPLEX + 4:
3954 case BRACE_COMPLEX + 5:
3955 case BRACE_COMPLEX + 6:
3956 case BRACE_COMPLEX + 7:
3957 case BRACE_COMPLEX + 8:
3958 case BRACE_COMPLEX + 9:
3959 {
3960 int no;
3961 regsave_T save;
3962
3963 no = op - BRACE_COMPLEX;
3964 ++brace_count[no];
3965
3966 /* If not matched enough times yet, try one more */
3967 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
3968 ? brace_min[no] : brace_max[no]))
3969 {
3970 reg_save(&save);
3971 if (regmatch(OPERAND(scan)))
3972 return TRUE;
3973 reg_restore(&save);
3974 --brace_count[no]; /* failed, decrement match count */
3975 return FALSE;
3976 }
3977
3978 /* If matched enough times, may try matching some more */
3979 if (brace_min[no] <= brace_max[no])
3980 {
3981 /* Range is the normal way around, use longest match */
3982 if (brace_count[no] <= brace_max[no])
3983 {
3984 reg_save(&save);
3985 if (regmatch(OPERAND(scan)))
3986 return TRUE; /* matched some more times */
3987 reg_restore(&save);
3988 --brace_count[no]; /* matched just enough times */
3989 /* continue with the items after \{} */
3990 }
3991 }
3992 else
3993 {
3994 /* Range is backwards, use shortest match first */
3995 if (brace_count[no] <= brace_min[no])
3996 {
3997 reg_save(&save);
3998 if (regmatch(next))
3999 return TRUE;
4000 reg_restore(&save);
4001 next = OPERAND(scan);
4002 /* must try to match one more item */
4003 }
4004 }
4005 }
4006 break;
4007
4008 case BRACE_SIMPLE:
4009 case STAR:
4010 case PLUS:
4011 {
4012 int nextb; /* next byte */
4013 int nextb_ic; /* next byte reverse case */
4014 long count;
4015 regsave_T save;
4016 long minval;
4017 long maxval;
4018
4019 /*
4020 * Lookahead to avoid useless match attempts when we know
4021 * what character comes next.
4022 */
4023 if (OP(next) == EXACTLY)
4024 {
4025 nextb = *OPERAND(next);
4026 if (ireg_ic)
4027 {
4028 if (isupper(nextb))
4029 nextb_ic = TOLOWER_LOC(nextb);
4030 else
4031 nextb_ic = TOUPPER_LOC(nextb);
4032 }
4033 else
4034 nextb_ic = nextb;
4035 }
4036 else
4037 {
4038 nextb = NUL;
4039 nextb_ic = NUL;
4040 }
4041 if (op != BRACE_SIMPLE)
4042 {
4043 minval = (op == STAR) ? 0 : 1;
4044 maxval = MAX_LIMIT;
4045 }
4046 else
4047 {
4048 minval = bl_minval;
4049 maxval = bl_maxval;
4050 }
4051
4052 /*
4053 * When maxval > minval, try matching as much as possible, up
4054 * to maxval. When maxval < minval, try matching at least the
4055 * minimal number (since the range is backwards, that's also
4056 * maxval!).
4057 */
4058 count = regrepeat(OPERAND(scan), maxval);
4059 if (got_int)
4060 return FALSE;
4061 if (minval <= maxval)
4062 {
4063 /* Range is the normal way around, use longest match */
4064 while (count >= minval)
4065 {
4066 /* If it could match, try it. */
4067 if (nextb == NUL || *reginput == nextb
4068 || *reginput == nextb_ic)
4069 {
4070 reg_save(&save);
4071 if (regmatch(next))
4072 return TRUE;
4073 reg_restore(&save);
4074 }
4075 /* Couldn't or didn't match -- back up one char. */
4076 if (--count < minval)
4077 break;
4078 if (reginput == regline)
4079 {
4080 /* backup to last char of previous line */
4081 --reglnum;
4082 regline = reg_getline(reglnum);
4083 /* Just in case regrepeat() didn't count right. */
4084 if (regline == NULL)
4085 return FALSE;
4086 reginput = regline + STRLEN(regline);
4087 fast_breakcheck();
4088 if (got_int || out_of_stack)
4089 return FALSE;
4090 }
4091 else
4092 {
4093 --reginput;
4094 #ifdef FEAT_MBYTE
4095 if (has_mbyte)
4096 reginput -= (*mb_head_off)(regline, reginput);
4097 #endif
4098 }
4099 }
4100 }
4101 else
4102 {
4103 /* Range is backwards, use shortest match first.
4104 * Careful: maxval and minval are exchanged! */
4105 if (count < maxval)
4106 return FALSE;
4107 for (;;)
4108 {
4109 /* If it could work, try it. */
4110 if (nextb == NUL || *reginput == nextb
4111 || *reginput == nextb_ic)
4112 {
4113 reg_save(&save);
4114 if (regmatch(next))
4115 return TRUE;
4116 reg_restore(&save);
4117 }
4118 /* Couldn't or didn't match: try advancing one char. */
4119 if (count == minval
4120 || regrepeat(OPERAND(scan), 1L) == 0)
4121 break;
4122 ++count;
4123 if (got_int || out_of_stack)
4124 return FALSE;
4125 }
4126 }
4127 return FALSE;
4128 }
4129 /* break; Not Reached */
4130
4131 case NOMATCH:
4132 {
4133 regsave_T save;
4134
4135 /* If the operand matches, we fail. Otherwise backup and
4136 * continue with the next item. */
4137 reg_save(&save);
4138 if (regmatch(OPERAND(scan)))
4139 return FALSE;
4140 reg_restore(&save);
4141 }
4142 break;
4143
4144 case MATCH:
4145 case SUBPAT:
4146 {
4147 regsave_T save;
4148
4149 /* If the operand doesn't match, we fail. Otherwise backup
4150 * and continue with the next item. */
4151 reg_save(&save);
4152 if (!regmatch(OPERAND(scan)))
4153 return FALSE;
4154 if (op == MATCH) /* zero-width */
4155 reg_restore(&save);
4156 }
4157 break;
4158
4159 case BEHIND:
4160 case NOBEHIND:
4161 {
4162 regsave_T save_after, save_start;
4163 regsave_T save_behind_pos;
4164 int needmatch = (op == BEHIND);
4165
4166 /*
4167 * Look back in the input of the operand matches or not. This
4168 * must be done at every position in the input and checking if
4169 * the match ends at the current position.
4170 * First check if the next item matches, that's probably
4171 * faster.
4172 */
4173 reg_save(&save_start);
4174 if (regmatch(next))
4175 {
4176 /* save the position after the found match for next */
4177 reg_save(&save_after);
4178
4179 /* start looking for a match with operand at the current
4180 * postion. Go back one character until we find the
4181 * result, hitting the start of the line or the previous
4182 * line (for multi-line matching).
4183 * Set behind_pos to where the match should end, BHPOS
4184 * will match it. */
4185 save_behind_pos = behind_pos;
4186 behind_pos = save_start;
4187 for (;;)
4188 {
4189 reg_restore(&save_start);
4190 if (regmatch(OPERAND(scan))
4191 && reg_save_equal(&behind_pos))
4192 {
4193 behind_pos = save_behind_pos;
4194 /* found a match that ends where "next" started */
4195 if (needmatch)
4196 {
4197 reg_restore(&save_after);
4198 return TRUE;
4199 }
4200 return FALSE;
4201 }
4202 /*
4203 * No match: Go back one character. May go to
4204 * previous line once.
4205 */
4206 if (REG_MULTI)
4207 {
4208 if (save_start.rs_u.pos.col == 0)
4209 {
4210 if (save_start.rs_u.pos.lnum
4211 < behind_pos.rs_u.pos.lnum
4212 || reg_getline(
4213 --save_start.rs_u.pos.lnum) == NULL)
4214 break;
4215 reg_restore(&save_start);
4216 save_start.rs_u.pos.col =
4217 (colnr_T)STRLEN(regline);
4218 }
4219 else
4220 --save_start.rs_u.pos.col;
4221 }
4222 else
4223 {
4224 if (save_start.rs_u.ptr == regline)
4225 break;
4226 --save_start.rs_u.ptr;
4227 }
4228 }
4229
4230 /* NOBEHIND succeeds when no match was found */
4231 behind_pos = save_behind_pos;
4232 if (!needmatch)
4233 {
4234 reg_restore(&save_after);
4235 return TRUE;
4236 }
4237 }
4238 return FALSE;
4239 }
4240
4241 case BHPOS:
4242 if (REG_MULTI)
4243 {
4244 if (behind_pos.rs_u.pos.col != (colnr_T)(reginput - regline)
4245 || behind_pos.rs_u.pos.lnum != reglnum)
4246 return FALSE;
4247 }
4248 else if (behind_pos.rs_u.ptr != reginput)
4249 return FALSE;
4250 break;
4251
4252 case NEWL:
4253 if ((c != NUL || reglnum == reg_maxline)
4254 && (c != '\n' || !reg_line_lbr))
4255 return FALSE;
4256 if (reg_line_lbr)
4257 ADVANCE_REGINPUT();
4258 else
4259 reg_nextline();
4260 break;
4261
4262 case END:
4263 return TRUE; /* Success! */
4264
4265 default:
4266 EMSG(_(e_re_corr));
4267 #ifdef DEBUG
4268 printf("Illegal op code %d\n", op);
4269 #endif
4270 return FALSE;
4271 }
4272 }
4273
4274 scan = next;
4275 }
4276
4277 /*
4278 * We get here only if there's trouble -- normally "case END" is the
4279 * terminating point.
4280 */
4281 EMSG(_(e_re_corr));
4282 #ifdef DEBUG
4283 printf("Premature EOL\n");
4284 #endif
4285 return FALSE;
4286 }
4287
/*
 * ADVANCE_P(x) - move pointer "x" forward over one character.
 * With multi-byte support a character may span several bytes when has_mbyte
 * is set.  That variant is wrapped in do { } while (0) so that
 * "if (cond) ADVANCE_P(p); else ..." parses as intended instead of the "else"
 * pairing up with the "if" inside the macro.
 * "x" must be a modifiable pointer; it is evaluated more than once in the
 * multi-byte variant, so do not pass an expression with side effects.
 */
#ifdef FEAT_MBYTE
# define ADVANCE_P(x) \
    do \
    { \
        if (has_mbyte) \
            (x) += (*mb_ptr2len_check)(x); \
        else \
            ++(x); \
    } while (0)
#else
# define ADVANCE_P(x) ++(x)
#endif
4293
4294 /*
4295 * regrepeat - repeatedly match something simple, return how many.
4296 * Advances reginput (and reglnum) to just after the matched chars.
4297 */
4298 static int
4299 regrepeat(p, maxcount)
4300 char_u *p;
4301 long maxcount; /* maximum number of matches allowed */
4302 {
4303 long count = 0;
4304 char_u *scan;
4305 char_u *opnd;
4306 int mask;
4307 int testval = 0;
4308
4309 scan = reginput; /* Make local copy of reginput for speed. */
4310 opnd = OPERAND(p);
4311 switch (OP(p))
4312 {
4313 case ANY:
4314 case ANY + ADD_NL:
4315 while (count < maxcount)
4316 {
4317 /* Matching anything means we continue until end-of-line (or
4318 * end-of-file for ANY + ADD_NL), only limited by maxcount. */
4319 while (*scan != NUL && count < maxcount)
4320 {
4321 ++count;
4322 ADVANCE_P(scan);
4323 }
4324 if (!WITH_NL(OP(p)) || reglnum == reg_maxline || count == maxcount)
4325 break;
4326 ++count; /* count the line-break */
4327 reg_nextline();
4328 scan = reginput;
4329 if (got_int)
4330 break;
4331 }
4332 break;
4333
4334 case IDENT:
4335 case IDENT + ADD_NL:
4336 testval = TRUE;
4337 /*FALLTHROUGH*/
4338 case SIDENT:
4339 case SIDENT + ADD_NL:
4340 while (count < maxcount)
4341 {
4342 if (vim_isIDc(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4343 {
4344 ADVANCE_P(scan);
4345 }
4346 else if (*scan == NUL)
4347 {
4348 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4349 break;
4350 reg_nextline();
4351 scan = reginput;
4352 if (got_int)
4353 break;
4354 }
4355 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4356 ++scan;
4357 else
4358 break;
4359 ++count;
4360 }
4361 break;
4362
4363 case KWORD:
4364 case KWORD + ADD_NL:
4365 testval = TRUE;
4366 /*FALLTHROUGH*/
4367 case SKWORD:
4368 case SKWORD + ADD_NL:
4369 while (count < maxcount)
4370 {
4371 if (vim_iswordp(scan) && (testval || !VIM_ISDIGIT(*scan)))
4372 {
4373 ADVANCE_P(scan);
4374 }
4375 else if (*scan == NUL)
4376 {
4377 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4378 break;
4379 reg_nextline();
4380 scan = reginput;
4381 if (got_int)
4382 break;
4383 }
4384 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4385 ++scan;
4386 else
4387 break;
4388 ++count;
4389 }
4390 break;
4391
4392 case FNAME:
4393 case FNAME + ADD_NL:
4394 testval = TRUE;
4395 /*FALLTHROUGH*/
4396 case SFNAME:
4397 case SFNAME + ADD_NL:
4398 while (count < maxcount)
4399 {
4400 if (vim_isfilec(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4401 {
4402 ADVANCE_P(scan);
4403 }
4404 else if (*scan == NUL)
4405 {
4406 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4407 break;
4408 reg_nextline();
4409 scan = reginput;
4410 if (got_int)
4411 break;
4412 }
4413 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4414 ++scan;
4415 else
4416 break;
4417 ++count;
4418 }
4419 break;
4420
4421 case PRINT:
4422 case PRINT + ADD_NL:
4423 testval = TRUE;
4424 /*FALLTHROUGH*/
4425 case SPRINT:
4426 case SPRINT + ADD_NL:
4427 while (count < maxcount)
4428 {
4429 if (*scan == NUL)
4430 {
4431 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4432 break;
4433 reg_nextline();
4434 scan = reginput;
4435 if (got_int)
4436 break;
4437 }
4438 else if (ptr2cells(scan) == 1 && (testval || !VIM_ISDIGIT(*scan)))
4439 {
4440 ADVANCE_P(scan);
4441 }
4442 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4443 ++scan;
4444 else
4445 break;
4446 ++count;
4447 }
4448 break;
4449
4450 case WHITE:
4451 case WHITE + ADD_NL:
4452 testval = mask = RI_WHITE;
4453 do_class:
4454 while (count < maxcount)
4455 {
4456 #ifdef FEAT_MBYTE
4457 int l;
4458 #endif
4459 if (*scan == NUL)
4460 {
4461 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4462 break;
4463 reg_nextline();
4464 scan = reginput;
4465 if (got_int)
4466 break;
4467 }
4468 #ifdef FEAT_MBYTE
4469 else if (has_mbyte && (l = (*mb_ptr2len_check)(scan)) > 1)
4470 {
4471 if (testval != 0)
4472 break;
4473 scan += l;
4474 }
4475 #endif
4476 else if ((class_tab[*scan] & mask) == testval)
4477 ++scan;
4478 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4479 ++scan;
4480 else
4481 break;
4482 ++count;
4483 }
4484 break;
4485
4486 case NWHITE:
4487 case NWHITE + ADD_NL:
4488 mask = RI_WHITE;
4489 goto do_class;
4490 case DIGIT:
4491 case DIGIT + ADD_NL:
4492 testval = mask = RI_DIGIT;
4493 goto do_class;
4494 case NDIGIT:
4495 case NDIGIT + ADD_NL:
4496 mask = RI_DIGIT;
4497 goto do_class;
4498 case HEX:
4499 case HEX + ADD_NL:
4500 testval = mask = RI_HEX;
4501 goto do_class;
4502 case NHEX:
4503 case NHEX + ADD_NL:
4504 mask = RI_HEX;
4505 goto do_class;
4506 case OCTAL:
4507 case OCTAL + ADD_NL:
4508 testval = mask = RI_OCTAL;
4509 goto do_class;
4510 case NOCTAL:
4511 case NOCTAL + ADD_NL:
4512 mask = RI_OCTAL;
4513 goto do_class;
4514 case WORD:
4515 case WORD + ADD_NL:
4516 testval = mask = RI_WORD;
4517 goto do_class;
4518 case NWORD:
4519 case NWORD + ADD_NL:
4520 mask = RI_WORD;
4521 goto do_class;
4522 case HEAD:
4523 case HEAD + ADD_NL:
4524 testval = mask = RI_HEAD;
4525 goto do_class;
4526 case NHEAD:
4527 case NHEAD + ADD_NL:
4528 mask = RI_HEAD;
4529 goto do_class;
4530 case ALPHA:
4531 case ALPHA + ADD_NL:
4532 testval = mask = RI_ALPHA;
4533 goto do_class;
4534 case NALPHA:
4535 case NALPHA + ADD_NL:
4536 mask = RI_ALPHA;
4537 goto do_class;
4538 case LOWER:
4539 case LOWER + ADD_NL:
4540 testval = mask = RI_LOWER;
4541 goto do_class;
4542 case NLOWER:
4543 case NLOWER + ADD_NL:
4544 mask = RI_LOWER;
4545 goto do_class;
4546 case UPPER:
4547 case UPPER + ADD_NL:
4548 testval = mask = RI_UPPER;
4549 goto do_class;
4550 case NUPPER:
4551 case NUPPER + ADD_NL:
4552 mask = RI_UPPER;
4553 goto do_class;
4554
4555 case EXACTLY:
4556 {
4557 int cu, cl;
4558
4559 /* This doesn't do a multi-byte character, because a MULTIBYTECODE
4560 * would have been used for it. */
4561 if (ireg_ic)
4562 {
4563 cu = TOUPPER_LOC(*opnd);
4564 cl = TOLOWER_LOC(*opnd);
4565 while (count < maxcount && (*scan == cu || *scan == cl))
4566 {
4567 count++;
4568 scan++;
4569 }
4570 }
4571 else
4572 {
4573 cu = *opnd;
4574 while (count < maxcount && *scan == cu)
4575 {
4576 count++;
4577 scan++;
4578 }
4579 }
4580 break;
4581 }
4582
4583 #ifdef FEAT_MBYTE
4584 case MULTIBYTECODE:
4585 {
4586 int i, len, cf = 0;
4587
4588 /* Safety check (just in case 'encoding' was changed since
4589 * compiling the program). */
4590 if ((len = (*mb_ptr2len_check)(opnd)) > 1)
4591 {
4592 if (ireg_ic && enc_utf8)
4593 cf = utf_fold(utf_ptr2char(opnd));
4594 while (count < maxcount)
4595 {
4596 for (i = 0; i < len; ++i)
4597 if (opnd[i] != scan[i])
4598 break;
4599 if (i < len && (!ireg_ic || !enc_utf8
4600 || utf_fold(utf_ptr2char(scan)) != cf))
4601 break;
4602 scan += len;
4603 ++count;
4604 }
4605 }
4606 }
4607 break;
4608 #endif
4609
4610 case ANYOF:
4611 case ANYOF + ADD_NL:
4612 testval = TRUE;
4613 /*FALLTHROUGH*/
4614
4615 case ANYBUT:
4616 case ANYBUT + ADD_NL:
4617 while (count < maxcount)
4618 {
4619 #ifdef FEAT_MBYTE
4620 int len;
4621 #endif
4622 if (*scan == NUL)
4623 {
4624 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4625 break;
4626 reg_nextline();
4627 scan = reginput;
4628 if (got_int)
4629 break;
4630 }
4631 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4632 ++scan;
4633 #ifdef FEAT_MBYTE
4634 else if (has_mbyte && (len = (*mb_ptr2len_check)(scan)) > 1)
4635 {
4636 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
4637 break;
4638 scan += len;
4639 }
4640 #endif
4641 else
4642 {
4643 if ((cstrchr(opnd, *scan) == NULL) == testval)
4644 break;
4645 ++scan;
4646 }
4647 ++count;
4648 }
4649 break;
4650
4651 case NEWL:
4652 while (count < maxcount
4653 && ((*scan == NUL && reglnum < reg_maxline)
4654 || (*scan == '\n' && reg_line_lbr)))
4655 {
4656 count++;
4657 if (reg_line_lbr)
4658 ADVANCE_REGINPUT();
4659 else
4660 reg_nextline();
4661 scan = reginput;
4662 if (got_int)
4663 break;
4664 }
4665 break;
4666
4667 default: /* Oh dear. Called inappropriately. */
4668 EMSG(_(e_re_corr));
4669 #ifdef DEBUG
4670 printf("Called regrepeat with op code %d\n", OP(p));
4671 #endif
4672 break;
4673 }
4674
4675 reginput = scan;
4676
4677 return (int)count;
4678 }
4679
4680 /*
4681 * regnext - dig the "next" pointer out of a node
4682 */
4683 static char_u *
4684 regnext(p)
4685 char_u *p;
4686 {
4687 int offset;
4688
4689 if (p == JUST_CALC_SIZE)
4690 return NULL;
4691
4692 offset = NEXT(p);
4693 if (offset == 0)
4694 return NULL;
4695
4696 if (OP(p) == BACK)
4697 return p - offset;
4698 else
4699 return p + offset;
4700 }
4701
4702 /*
4703 * Check the regexp program for its magic number.
4704 * Return TRUE if it's wrong.
4705 */
4706 static int
4707 prog_magic_wrong()
4708 {
4709 if (UCHARAT(REG_MULTI
4710 ? reg_mmatch->regprog->program
4711 : reg_match->regprog->program) != REGMAGIC)
4712 {
4713 EMSG(_(e_re_corr));
4714 return TRUE;
4715 }
4716 return FALSE;
4717 }
4718
4719 /*
4720 * Cleanup the subexpressions, if this wasn't done yet.
4721 * This construction is used to clear the subexpressions only when they are
4722 * used (to increase speed).
4723 */
4724 static void
4725 cleanup_subexpr()
4726 {
4727 if (need_clear_subexpr)
4728 {
4729 if (REG_MULTI)
4730 {
4731 /* Use 0xff to set lnum to -1 */
4732 vim_memset(reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4733 vim_memset(reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4734 }
4735 else
4736 {
4737 vim_memset(reg_startp, 0, sizeof(char_u *) * NSUBEXP);
4738 vim_memset(reg_endp, 0, sizeof(char_u *) * NSUBEXP);
4739 }
4740 need_clear_subexpr = FALSE;
4741 }
4742 }
4743
#ifdef FEAT_SYN_HL
/*
 * Reset the start and end info of the external ("z") sub-expressions, unless
 * this was already done.  Does nothing when need_clear_zsubexpr is not set.
 */
    static void
cleanup_zsubexpr()
{
    if (!need_clear_zsubexpr)
	return;

    if (REG_MULTI)
    {
	/* Filling with 0xff bytes sets each lnum to -1. */
	vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
	vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
    }
    else
    {
	/* Zero out all the pointers. */
	vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
	vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
    }
    need_clear_zsubexpr = FALSE;
}
#endif
4765
4766 /*
4767 * Advance reglnum, regline and reginput to the next line.
4768 */
4769 static void
4770 reg_nextline()
4771 {
4772 regline = reg_getline(++reglnum);
4773 reginput = regline;
4774 fast_breakcheck();
4775 }
4776
4777 /*
4778 * Save the input line and position in a regsave_T.
4779 */
4780 static void
4781 reg_save(save)
4782 regsave_T *save;
4783 {
4784 if (REG_MULTI)
4785 {
4786 save->rs_u.pos.col = (colnr_T)(reginput - regline);
4787 save->rs_u.pos.lnum = reglnum;
4788 }
4789 else
4790 save->rs_u.ptr = reginput;
4791 }
4792
4793 /*
4794 * Restore the input line and position from a regsave_T.
4795 */
4796 static void
4797 reg_restore(save)
4798 regsave_T *save;
4799 {
4800 if (REG_MULTI)
4801 {
4802 if (reglnum != save->rs_u.pos.lnum)
4803 {
4804 /* only call reg_getline() when the line number changed to save
4805 * a bit of time */
4806 reglnum = save->rs_u.pos.lnum;
4807 regline = reg_getline(reglnum);
4808 }
4809 reginput = regline + save->rs_u.pos.col;
4810 }
4811 else
4812 reginput = save->rs_u.ptr;
4813 }
4814
4815 /*
4816 * Return TRUE if current position is equal to saved position.
4817 */
4818 static int
4819 reg_save_equal(save)
4820 regsave_T *save;
4821 {
4822 if (REG_MULTI)
4823 return reglnum == save->rs_u.pos.lnum
4824 && reginput == regline + save->rs_u.pos.col;
4825 return reginput == save->rs_u.ptr;
4826 }
4827
4828 /*
4829 * Tentatively set the sub-expression start to the current position (after
4830 * calling regmatch() they will have changed). Need to save the existing
4831 * values for when there is no match.
4832 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
4833 * depending on REG_MULTI.
4834 */
4835 static void
4836 save_se_multi(savep, posp)
4837 save_se_T *savep;
4838 lpos_T *posp;
4839 {
4840 savep->se_u.pos = *posp;
4841 posp->lnum = reglnum;
4842 posp->col = (colnr_T)(reginput - regline);
4843 }
4844
4845 static void
4846 save_se_one(savep, pp)
4847 save_se_T *savep;
4848 char_u **pp;
4849 {
4850 savep->se_u.ptr = *pp;
4851 *pp = reginput;
4852 }
4853
4854 /*
4855 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
4856 */
4857 static int
4858 re_num_cmp(val, scan)
4859 long_u val;
4860 char_u *scan;
4861 {
4862 long_u n = OPERAND_MIN(scan);
4863
4864 if (OPERAND_CMP(scan) == '>')
4865 return val > n;
4866 if (OPERAND_CMP(scan) == '<')
4867 return val < n;
4868 return val == n;
4869 }
4870
4871
4872 #ifdef DEBUG
4873
4874 /*
4875 * regdump - dump a regexp onto stdout in vaguely comprehensible form
4876 */
4877 static void
4878 regdump(pattern, r)
4879 char_u *pattern;
4880 regprog_T *r;
4881 {
4882 char_u *s;
4883 int op = EXACTLY; /* Arbitrary non-END op. */
4884 char_u *next;
4885 char_u *end = NULL;
4886
4887 printf("\r\nregcomp(%s):\r\n", pattern);
4888
4889 s = r->program + 1;
4890 /*
4891 * Loop until we find the END that isn't before a referred next (an END
4892 * can also appear in a NOMATCH operand).
4893 */
4894 while (op != END || s <= end)
4895 {
4896 op = OP(s);
4897 printf("%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */
4898 next = regnext(s);
4899 if (next == NULL) /* Next ptr. */
4900 printf("(0)");
4901 else
4902 printf("(%d)", (int)((s - r->program) + (next - s)));
4903 if (end < next)
4904 end = next;
4905 if (op == BRACE_LIMITS)
4906 {
4907 /* Two short ints */
4908 printf(" minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
4909 s += 8;
4910 }
4911 s += 3;
4912 if (op == ANYOF || op == ANYOF + ADD_NL
4913 || op == ANYBUT || op == ANYBUT + ADD_NL
4914 || op == EXACTLY)
4915 {
4916 /* Literal string, where present. */
4917 while (*s != NUL)
4918 printf("%c", *s++);
4919 s++;
4920 }
4921 printf("\r\n");
4922 }
4923
4924 /* Header fields of interest. */
4925 if (r->regstart != NUL)
4926 printf("start `%s' 0x%x; ", r->regstart < 256
4927 ? (char *)transchar(r->regstart)
4928 : "multibyte", r->regstart);
4929 if (r->reganch)
4930 printf("anchored; ");
4931 if (r->regmust != NULL)
4932 printf("must have \"%s\"", r->regmust);
4933 printf("\r\n");
4934 }
4935
4936 /*
4937 * regprop - printable representation of opcode
4938 */
4939 static char_u *
4940 regprop(op)
4941 char_u *op;
4942 {
4943 char_u *p;
4944 static char_u buf[50];
4945
4946 (void) strcpy(buf, ":");
4947
4948 switch (OP(op))
4949 {
4950 case BOL:
4951 p = "BOL";
4952 break;
4953 case EOL:
4954 p = "EOL";
4955 break;
4956 case RE_BOF:
4957 p = "BOF";
4958 break;
4959 case RE_EOF:
4960 p = "EOF";
4961 break;
4962 case CURSOR:
4963 p = "CURSOR";
4964 break;
4965 case RE_LNUM:
4966 p = "RE_LNUM";
4967 break;
4968 case RE_COL:
4969 p = "RE_COL";
4970 break;
4971 case RE_VCOL:
4972 p = "RE_VCOL";
4973 break;
4974 case BOW:
4975 p = "BOW";
4976 break;
4977 case EOW:
4978 p = "EOW";
4979 break;
4980 case ANY:
4981 p = "ANY";
4982 break;
4983 case ANY + ADD_NL:
4984 p = "ANY+NL";
4985 break;
4986 case ANYOF:
4987 p = "ANYOF";
4988 break;
4989 case ANYOF + ADD_NL:
4990 p = "ANYOF+NL";
4991 break;
4992 case ANYBUT:
4993 p = "ANYBUT";
4994 break;
4995 case ANYBUT + ADD_NL:
4996 p = "ANYBUT+NL";
4997 break;
4998 case IDENT:
4999 p = "IDENT";
5000 break;
5001 case IDENT + ADD_NL:
5002 p = "IDENT+NL";
5003 break;
5004 case SIDENT:
5005 p = "SIDENT";
5006 break;
5007 case SIDENT + ADD_NL:
5008 p = "SIDENT+NL";
5009 break;
5010 case KWORD:
5011 p = "KWORD";
5012 break;
5013 case KWORD + ADD_NL:
5014 p = "KWORD+NL";
5015 break;
5016 case SKWORD:
5017 p = "SKWORD";
5018 break;
5019 case SKWORD + ADD_NL:
5020 p = "SKWORD+NL";
5021 break;
5022 case FNAME:
5023 p = "FNAME";
5024 break;
5025 case FNAME + ADD_NL:
5026 p = "FNAME+NL";
5027 break;
5028 case SFNAME:
5029 p = "SFNAME";
5030 break;
5031 case SFNAME + ADD_NL:
5032 p = "SFNAME+NL";
5033 break;
5034 case PRINT:
5035 p = "PRINT";
5036 break;
5037 case PRINT + ADD_NL:
5038 p = "PRINT+NL";
5039 break;
5040 case SPRINT:
5041 p = "SPRINT";
5042 break;
5043 case SPRINT + ADD_NL:
5044 p = "SPRINT+NL";
5045 break;
5046 case WHITE:
5047 p = "WHITE";
5048 break;
5049 case WHITE + ADD_NL:
5050 p = "WHITE+NL";
5051 break;
5052 case NWHITE:
5053 p = "NWHITE";
5054 break;
5055 case NWHITE + ADD_NL:
5056 p = "NWHITE+NL";
5057 break;
5058 case DIGIT:
5059 p = "DIGIT";
5060 break;
5061 case DIGIT + ADD_NL:
5062 p = "DIGIT+NL";
5063 break;
5064 case NDIGIT:
5065 p = "NDIGIT";
5066 break;
5067 case NDIGIT + ADD_NL:
5068 p = "NDIGIT+NL";
5069 break;
5070 case HEX:
5071 p = "HEX";
5072 break;
5073 case HEX + ADD_NL:
5074 p = "HEX+NL";
5075 break;
5076 case NHEX:
5077 p = "NHEX";
5078 break;
5079 case NHEX + ADD_NL:
5080 p = "NHEX+NL";
5081 break;
5082 case OCTAL:
5083 p = "OCTAL";
5084 break;
5085 case OCTAL + ADD_NL:
5086 p = "OCTAL+NL";
5087 break;
5088 case NOCTAL:
5089 p = "NOCTAL";
5090 break;
5091 case NOCTAL + ADD_NL:
5092 p = "NOCTAL+NL";
5093 break;
5094 case WORD:
5095 p = "WORD";
5096 break;
5097 case WORD + ADD_NL:
5098 p = "WORD+NL";
5099 break;
5100 case NWORD:
5101 p = "NWORD";
5102 break;
5103 case NWORD + ADD_NL:
5104 p = "NWORD+NL";
5105 break;
5106 case HEAD:
5107 p = "HEAD";
5108 break;
5109 case HEAD + ADD_NL:
5110 p = "HEAD+NL";
5111 break;
5112 case NHEAD:
5113 p = "NHEAD";
5114 break;
5115 case NHEAD + ADD_NL:
5116 p = "NHEAD+NL";
5117 break;
5118 case ALPHA:
5119 p = "ALPHA";
5120 break;
5121 case ALPHA + ADD_NL:
5122 p = "ALPHA+NL";
5123 break;
5124 case NALPHA:
5125 p = "NALPHA";
5126 break;
5127 case NALPHA + ADD_NL:
5128 p = "NALPHA+NL";
5129 break;
5130 case LOWER:
5131 p = "LOWER";
5132 break;
5133 case LOWER + ADD_NL:
5134 p = "LOWER+NL";
5135 break;
5136 case NLOWER:
5137 p = "NLOWER";
5138 break;
5139 case NLOWER + ADD_NL:
5140 p = "NLOWER+NL";
5141 break;
5142 case UPPER:
5143 p = "UPPER";
5144 break;
5145 case UPPER + ADD_NL:
5146 p = "UPPER+NL";
5147 break;
5148 case NUPPER:
5149 p = "NUPPER";
5150 break;
5151 case NUPPER + ADD_NL:
5152 p = "NUPPER+NL";
5153 break;
5154 case BRANCH:
5155 p = "BRANCH";
5156 break;
5157 case EXACTLY:
5158 p = "EXACTLY";
5159 break;
5160 case NOTHING:
5161 p = "NOTHING";
5162 break;
5163 case BACK:
5164 p = "BACK";
5165 break;
5166 case END:
5167 p = "END";
5168 break;
5169 case MOPEN + 0:
5170 p = "MATCH START";
5171 break;
5172 case MOPEN + 1:
5173 case MOPEN + 2:
5174 case MOPEN + 3:
5175 case MOPEN + 4:
5176 case MOPEN + 5:
5177 case MOPEN + 6:
5178 case MOPEN + 7:
5179 case MOPEN + 8:
5180 case MOPEN + 9:
5181 sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
5182 p = NULL;
5183 break;
5184 case MCLOSE + 0:
5185 p = "MATCH END";
5186 break;
5187 case MCLOSE + 1:
5188 case MCLOSE + 2:
5189 case MCLOSE + 3:
5190 case MCLOSE + 4:
5191 case MCLOSE + 5:
5192 case MCLOSE + 6:
5193 case MCLOSE + 7:
5194 case MCLOSE + 8:
5195 case MCLOSE + 9:
5196 sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
5197 p = NULL;
5198 break;
5199 case BACKREF + 1:
5200 case BACKREF + 2:
5201 case BACKREF + 3:
5202 case BACKREF + 4:
5203 case BACKREF + 5:
5204 case BACKREF + 6:
5205 case BACKREF + 7:
5206 case BACKREF + 8:
5207 case BACKREF + 9:
5208 sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
5209 p = NULL;
5210 break;
5211 case NOPEN:
5212 p = "NOPEN";
5213 break;
5214 case NCLOSE:
5215 p = "NCLOSE";
5216 break;
5217 #ifdef FEAT_SYN_HL
5218 case ZOPEN + 1:
5219 case ZOPEN + 2:
5220 case ZOPEN + 3:
5221 case ZOPEN + 4:
5222 case ZOPEN + 5:
5223 case ZOPEN + 6:
5224 case ZOPEN + 7:
5225 case ZOPEN + 8:
5226 case ZOPEN + 9:
5227 sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
5228 p = NULL;
5229 break;
5230 case ZCLOSE + 1:
5231 case ZCLOSE + 2:
5232 case ZCLOSE + 3:
5233 case ZCLOSE + 4:
5234 case ZCLOSE + 5:
5235 case ZCLOSE + 6:
5236 case ZCLOSE + 7:
5237 case ZCLOSE + 8:
5238 case ZCLOSE + 9:
5239 sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
5240 p = NULL;
5241 break;
5242 case ZREF + 1:
5243 case ZREF + 2:
5244 case ZREF + 3:
5245 case ZREF + 4:
5246 case ZREF + 5:
5247 case ZREF + 6:
5248 case ZREF + 7:
5249 case ZREF + 8:
5250 case ZREF + 9:
5251 sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
5252 p = NULL;
5253 break;
5254 #endif
5255 case STAR:
5256 p = "STAR";
5257 break;
5258 case PLUS:
5259 p = "PLUS";
5260 break;
5261 case NOMATCH:
5262 p = "NOMATCH";
5263 break;
5264 case MATCH:
5265 p = "MATCH";
5266 break;
5267 case BEHIND:
5268 p = "BEHIND";
5269 break;
5270 case NOBEHIND:
5271 p = "NOBEHIND";
5272 break;
5273 case SUBPAT:
5274 p = "SUBPAT";
5275 break;
5276 case BRACE_LIMITS:
5277 p = "BRACE_LIMITS";
5278 break;
5279 case BRACE_SIMPLE:
5280 p = "BRACE_SIMPLE";
5281 break;
5282 case BRACE_COMPLEX + 0:
5283 case BRACE_COMPLEX + 1:
5284 case BRACE_COMPLEX + 2:
5285 case BRACE_COMPLEX + 3:
5286 case BRACE_COMPLEX + 4:
5287 case BRACE_COMPLEX + 5:
5288 case BRACE_COMPLEX + 6:
5289 case BRACE_COMPLEX + 7:
5290 case BRACE_COMPLEX + 8:
5291 case BRACE_COMPLEX + 9:
5292 sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
5293 p = NULL;
5294 break;
5295 #ifdef FEAT_MBYTE
5296 case MULTIBYTECODE:
5297 p = "MULTIBYTECODE";
5298 break;
5299 #endif
5300 case NEWL:
5301 p = "NEWL";
5302 break;
5303 default:
5304 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
5305 p = NULL;
5306 break;
5307 }
5308 if (p != NULL)
5309 (void) strcat(buf, p);
5310 return buf;
5311 }
5312 #endif
5313
5314 #ifdef FEAT_MBYTE
5315 static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3));
5316
/* Up to three characters that one precomposed character decomposes into;
 * unused positions are zero. */
typedef struct
{
    int a, b, c;
} decomp_T;


/*
 * Decomposition of the characters 0xfb20 - 0xfb4f.
 * The entry for character "c" is at index "c - 0xfb20".  Entries marked
 * UNUSED map the character to itself.
 */
decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2, 0, 0},              /* 0xfb20       alt ayin */
    {0x5d0, 0, 0},              /* 0xfb21       alt alef */
    {0x5d3, 0, 0},              /* 0xfb22       alt dalet */
    {0x5d4, 0, 0},              /* 0xfb23       alt he */
    {0x5db, 0, 0},              /* 0xfb24       alt kaf */
    {0x5dc, 0, 0},              /* 0xfb25       alt lamed */
    {0x5dd, 0, 0},              /* 0xfb26       alt mem-sofit */
    {0x5e8, 0, 0},              /* 0xfb27       alt resh */
    {0x5ea, 0, 0},              /* 0xfb28       alt tav */
    {'+', 0, 0},                /* 0xfb29       alt plus */
    {0x5e9, 0x5c1, 0},          /* 0xfb2a       shin+shin-dot */
    {0x5e9, 0x5c2, 0},          /* 0xfb2b       shin+sin-dot */
    {0x5e9, 0x5c1, 0x5bc},      /* 0xfb2c       shin+shin-dot+dagesh */
    {0x5e9, 0x5c2, 0x5bc},      /* 0xfb2d       shin+sin-dot+dagesh */
    {0x5d0, 0x5b7, 0},          /* 0xfb2e       alef+patah */
    {0x5d0, 0x5b8, 0},          /* 0xfb2f       alef+qamats */
    {0x5d0, 0x5b4, 0},          /* 0xfb30       alef+hiriq */
    {0x5d1, 0x5bc, 0},          /* 0xfb31       bet+dagesh */
    {0x5d2, 0x5bc, 0},          /* 0xfb32       gimel+dagesh */
    {0x5d3, 0x5bc, 0},          /* 0xfb33       dalet+dagesh */
    {0x5d4, 0x5bc, 0},          /* 0xfb34       he+dagesh */
    {0x5d5, 0x5bc, 0},          /* 0xfb35       vav+dagesh */
    {0x5d6, 0x5bc, 0},          /* 0xfb36       zayin+dagesh */
    {0xfb37, 0, 0},             /* 0xfb37 -- UNUSED */
    {0x5d8, 0x5bc, 0},          /* 0xfb38       tet+dagesh */
    {0x5d9, 0x5bc, 0},          /* 0xfb39       yud+dagesh */
    {0x5da, 0x5bc, 0},          /* 0xfb3a       kaf sofit+dagesh */
    {0x5db, 0x5bc, 0},          /* 0xfb3b       kaf+dagesh */
    {0x5dc, 0x5bc, 0},          /* 0xfb3c       lamed+dagesh */
    {0xfb3d, 0, 0},             /* 0xfb3d -- UNUSED */
    {0x5de, 0x5bc, 0},          /* 0xfb3e       mem+dagesh */
    {0xfb3f, 0, 0},             /* 0xfb3f -- UNUSED */
    {0x5e0, 0x5bc, 0},          /* 0xfb40       nun+dagesh */
    {0x5e1, 0x5bc, 0},          /* 0xfb41       samech+dagesh */
    {0xfb42, 0, 0},             /* 0xfb42 -- UNUSED */
    {0x5e3, 0x5bc, 0},          /* 0xfb43       pe sofit+dagesh */
    {0x5e4, 0x5bc, 0},          /* 0xfb44       pe+dagesh */
    {0xfb45, 0, 0},             /* 0xfb45 -- UNUSED */
    {0x5e6, 0x5bc, 0},          /* 0xfb46       tsadi+dagesh */
    {0x5e7, 0x5bc, 0},          /* 0xfb47       qof+dagesh */
    {0x5e8, 0x5bc, 0},          /* 0xfb48       resh+dagesh */
    {0x5e9, 0x5bc, 0},          /* 0xfb49       shin+dagesh */
    {0x5ea, 0x5bc, 0},          /* 0xfb4a       tav+dagesh */
    {0x5d5, 0x5b9, 0},          /* 0xfb4b       vav+holam */
    {0x5d1, 0x5bf, 0},          /* 0xfb4c       bet+rafe */
    {0x5db, 0x5bf, 0},          /* 0xfb4d       kaf+rafe */
    {0x5e4, 0x5bf, 0},          /* 0xfb4e       pe+rafe */
    {0x5d0, 0x5dc, 0}           /* 0xfb4f       alef-lamed */
};

/*
 * Decompose character "c" into up to three characters, stored in "*c1",
 * "*c2" and "*c3".  Unused positions are set to zero.
 * Only the characters covered by decomp_table[] (0xfb20 - 0xfb4f) are
 * decomposed, any other character is returned unmodified in "*c1".
 */
static void
mb_decompose(c, c1, c2, c3)
    int c, *c1, *c2, *c3;
{
    decomp_T d;

    /* The lower bound must be the first character of decomp_table[],
     * otherwise the index below is negative and reads outside the table. */
    if (c >= 0xfb20 && c <= 0xfb4f)
    {
        d = decomp_table[c - 0xfb20];
        *c1 = d.a;
        *c2 = d.b;
        *c3 = d.c;
    }
    else
    {
        *c1 = c;
        *c2 = *c3 = 0;
    }
}
5395 #endif
5396
5397 /*
5398 * Compare two strings, ignore case if ireg_ic set.
5399 * Return 0 if strings match, non-zero otherwise.
5400 * Correct the length "*n" when composing characters are ignored.
5401 */
5402 static int
5403 cstrncmp(s1, s2, n)
5404 char_u *s1, *s2;
5405 int *n;
5406 {
5407 int result;
5408
5409 if (!ireg_ic)
5410 result = STRNCMP(s1, s2, *n);
5411 else
5412 result = MB_STRNICMP(s1, s2, *n);
5413
5414 #ifdef FEAT_MBYTE
5415 /* if it failed and it's utf8 and we want to combineignore: */
5416 if (result != 0 && enc_utf8 && ireg_icombine)
5417 {
5418 char_u *str1, *str2;
5419 int c1, c2, c11, c12;
5420 int ix;
5421 int junk;
5422
5423 /* we have to handle the strcmp ourselves, since it is necessary to
5424 * deal with the composing characters by ignoring them: */
5425 str1 = s1;
5426 str2 = s2;
5427 c1 = c2 = 0;
5428 for (ix = 0; ix < *n; )
5429 {
5430 c1 = mb_ptr2char_adv(&str1);
5431 c2 = mb_ptr2char_adv(&str2);
5432 ix += utf_char2len(c1);
5433
5434 /* decompose the character if necessary, into 'base' characters
5435 * because I don't care about Arabic, I will hard-code the Hebrew
5436 * which I *do* care about! So sue me... */
5437 if (c1 != c2 && (!ireg_ic || utf_fold(c1) != utf_fold(c2)))
5438 {
5439 /* decomposition necessary? */
5440 mb_decompose(c1, &c11, &junk, &junk);
5441 mb_decompose(c2, &c12, &junk, &junk);
5442 c1 = c11;
5443 c2 = c12;
5444 if (c11 != c12 && (!ireg_ic || utf_fold(c11) != utf_fold(c12)))
5445 break;
5446 }
5447 }
5448 result = c2 - c1;
5449 if (result == 0)
5450 *n = (int)(str2 - s2);
5451 }
5452 #endif
5453
5454 return result;
5455 }
5456
5457 /*
5458 * cstrchr: This function is used a lot for simple searches, keep it fast!
5459 */
5460 static char_u *
5461 cstrchr(s, c)
5462 char_u *s;
5463 int c;
5464 {
5465 char_u *p;
5466 int cc;
5467
5468 if (!ireg_ic
5469 #ifdef FEAT_MBYTE
5470 || (!enc_utf8 && mb_char2len(c) > 1)
5471 #endif
5472 )
5473 return vim_strchr(s, c);
5474
5475 /* tolower() and toupper() can be slow, comparing twice should be a lot
5476 * faster (esp. when using MS Visual C++!).
5477 * For UTF-8 need to use folded case. */
5478 #ifdef FEAT_MBYTE
5479 if (enc_utf8 && c > 0x80)
5480 cc = utf_fold(c);
5481 else
5482 #endif
5483 if (isupper(c))
5484 cc = TOLOWER_LOC(c);
5485 else if (islower(c))
5486 cc = TOUPPER_LOC(c);
5487 else
5488 return vim_strchr(s, c);
5489
5490 #ifdef FEAT_MBYTE
5491 if (has_mbyte)
5492 {
5493 for (p = s; *p != NUL; p += (*mb_ptr2len_check)(p))
5494 {
5495 if (enc_utf8 && c > 0x80)
5496 {
5497 if (utf_fold(utf_ptr2char(p)) == cc)
5498 return p;
5499 }
5500 else if (*p == c || *p == cc)
5501 return p;
5502 }
5503 }
5504 else
5505 #endif
5506 /* Faster version for when there are no multi-byte characters. */
5507 for (p = s; *p != NUL; ++p)
5508 if (*p == c || *p == cc)
5509 return p;
5510
5511 return NULL;
5512 }
5513
5514 /***************************************************************
5515 * regsub stuff *
5516 ***************************************************************/
5517
5518 /* This stuff below really confuses cc on an SGI -- webb */
5519 #ifdef __sgi
5520 # undef __ARGS
5521 # define __ARGS(x) ()
5522 #endif
5523
5524 /*
5525 * We should define fptr as a pointer to a function returning a pointer to
5526 * a function returning a pointer to a function ...
5527 * This is impossible, so we declare a pointer to a function returning a
5528 * pointer to a function returning void. This should work for all compilers.
5529 */
5530 typedef void (*(*fptr) __ARGS((char_u *, int)))();
5531
5532 static fptr do_upper __ARGS((char_u *, int));
5533 static fptr do_Upper __ARGS((char_u *, int));
5534 static fptr do_lower __ARGS((char_u *, int));
5535 static fptr do_Lower __ARGS((char_u *, int));
5536
5537 static int vim_regsub_both __ARGS((char_u *source, char_u *dest, int copy, int magic, int backslash));
5538
5539 static fptr
5540 do_upper(d, c)
5541 char_u *d;
5542 int c;
5543 {
5544 *d = TOUPPER_LOC(c);
5545
5546 return (fptr)NULL;
5547 }
5548
5549 static fptr
5550 do_Upper(d, c)
5551 char_u *d;
5552 int c;
5553 {
5554 *d = TOUPPER_LOC(c);
5555
5556 return (fptr)do_Upper;
5557 }
5558
5559 static fptr
5560 do_lower(d, c)
5561 char_u *d;
5562 int c;
5563 {
5564 *d = TOLOWER_LOC(c);
5565
5566 return (fptr)NULL;
5567 }
5568
5569 static fptr
5570 do_Lower(d, c)
5571 char_u *d;
5572 int c;
5573 {
5574 *d = TOLOWER_LOC(c);
5575
5576 return (fptr)do_Lower;
5577 }
5578
5579 /*
5580 * regtilde(): Replace tildes in the pattern by the old pattern.
5581 *
5582 * Short explanation of the tilde: It stands for the previous replacement
5583 * pattern. If that previous pattern also contains a ~ we should go back a
5584 * step further... But we insert the previous pattern into the current one
5585 * and remember that.
5586 * This still does not handle the case where "magic" changes. TODO?
5587 *
5588 * The tildes are parsed once before the first call to vim_regsub().
5589 */
5590 char_u *
5591 regtilde(source, magic)
5592 char_u *source;
5593 int magic;
5594 {
5595 char_u *newsub = source;
5596 char_u *tmpsub;
5597 char_u *p;
5598 int len;
5599 int prevlen;
5600
5601 for (p = newsub; *p; ++p)
5602 {
5603 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
5604 {
5605 if (reg_prev_sub != NULL)
5606 {
5607 /* length = len(newsub) - 1 + len(prev_sub) + 1 */
5608 prevlen = (int)STRLEN(reg_prev_sub);
5609 tmpsub = alloc((unsigned)(STRLEN(newsub) + prevlen));
5610 if (tmpsub != NULL)
5611 {
5612 /* copy prefix */
5613 len = (int)(p - newsub); /* not including ~ */
5614 mch_memmove(tmpsub, newsub, (size_t)len);
5615 /* interpretate tilde */
5616 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
5617 /* copy postfix */
5618 if (!magic)
5619 ++p; /* back off \ */
5620 STRCPY(tmpsub + len + prevlen, p + 1);
5621
5622 if (newsub != source) /* already allocated newsub */
5623 vim_free(newsub);
5624 newsub = tmpsub;
5625 p = newsub + len + prevlen;
5626 }
5627 }
5628 else if (magic)
5629 STRCPY(p, p + 1); /* remove '~' */
5630 else
5631 STRCPY(p, p + 2); /* remove '\~' */
5632 --p;
5633 }
5634 else
5635 {
5636 if (*p == '\\' && p[1]) /* skip escaped characters */
5637 ++p;
5638 #ifdef FEAT_MBYTE
5639 if (has_mbyte)
5640 p += (*mb_ptr2len_check)(p) - 1;
5641 #endif
5642 }
5643 }
5644
5645 vim_free(reg_prev_sub);
5646 if (newsub != source) /* newsub was allocated, just keep it */
5647 reg_prev_sub = newsub;
5648 else /* no ~ found, need to save newsub */
5649 reg_prev_sub = vim_strsave(newsub);
5650 return newsub;
5651 }
5652
5653 #ifdef FEAT_EVAL
5654 static int can_f_submatch = FALSE; /* TRUE when submatch() can be used */
5655
5656 /* These pointers are used instead of reg_match and reg_mmatch for
5657 * reg_submatch(). Needed for when the substitution string is an expression
5658 * that contains a call to substitute() and submatch(). */
5659 static regmatch_T *submatch_match;
5660 static regmmatch_T *submatch_mmatch;
5661 #endif
5662
5663 #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) || defined(PROTO)
5664 /*
5665 * vim_regsub() - perform substitutions after a vim_regexec() or
5666 * vim_regexec_multi() match.
5667 *
5668 * If "copy" is TRUE really copy into "dest".
5669 * If "copy" is FALSE nothing is copied, this is just to find out the length
5670 * of the result.
5671 *
5672 * If "backslash" is TRUE, a backslash will be removed later, need to double
5673 * them to keep them, and insert a backslash before a CR to avoid it being
5674 * replaced with a line break later.
5675 *
5676 * Note: The matched text must not change between the call of
5677 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
5678 * references invalid!
5679 *
5680 * Returns the size of the replacement, including terminating NUL.
5681 */
5682 int
5683 vim_regsub(rmp, source, dest, copy, magic, backslash)
5684 regmatch_T *rmp;
5685 char_u *source;
5686 char_u *dest;
5687 int copy;
5688 int magic;
5689 int backslash;
5690 {
5691 reg_match = rmp;
5692 reg_mmatch = NULL;
5693 reg_maxline = 0;
5694 return vim_regsub_both(source, dest, copy, magic, backslash);
5695 }
5696 #endif
5697
5698 int
5699 vim_regsub_multi(rmp, lnum, source, dest, copy, magic, backslash)
5700 regmmatch_T *rmp;
5701 linenr_T lnum;
5702 char_u *source;
5703 char_u *dest;
5704 int copy;
5705 int magic;
5706 int backslash;
5707 {
5708 reg_match = NULL;
5709 reg_mmatch = rmp;
5710 reg_buf = curbuf; /* always works on the current buffer! */
5711 reg_firstlnum = lnum;
5712 reg_maxline = curbuf->b_ml.ml_line_count - lnum;
5713 return vim_regsub_both(source, dest, copy, magic, backslash);
5714 }
5715
5716 static int
5717 vim_regsub_both(source, dest, copy, magic, backslash)
5718 char_u *source;
5719 char_u *dest;
5720 int copy;
5721 int magic;
5722 int backslash;
5723 {
5724 char_u *src;
5725 char_u *dst;
5726 char_u *s;
5727 int c;
5728 int no = -1;
5729 fptr func = (fptr)NULL;
5730 linenr_T clnum = 0; /* init for GCC */
5731 int len = 0; /* init for GCC */
5732 #ifdef FEAT_EVAL
5733 static char_u *eval_result = NULL;
5734 #endif
5735 #ifdef FEAT_MBYTE
5736 int l;
5737 #endif
5738
5739
5740 /* Be paranoid... */
5741 if (source == NULL || dest == NULL)
5742 {
5743 EMSG(_(e_null));
5744 return 0;
5745 }
5746 if (prog_magic_wrong())
5747 return 0;
5748 src = source;
5749 dst = dest;
5750
5751 /*
5752 * When the substitute part starts with "\=" evaluate it as an expression.
5753 */
5754 if (source[0] == '\\' && source[1] == '='
5755 #ifdef FEAT_EVAL
5756 && !can_f_submatch /* can't do this recursively */
5757 #endif
5758 )
5759 {
5760 #ifdef FEAT_EVAL
5761 /* To make sure that the length doesn't change between checking the
5762 * length and copying the string, and to speed up things, the
5763 * resulting string is saved from the call with "copy" == FALSE to the
5764 * call with "copy" == TRUE. */
5765 if (copy)
5766 {
5767 if (eval_result != NULL)
5768 {
5769 STRCPY(dest, eval_result);
5770 dst += STRLEN(eval_result);
5771 vim_free(eval_result);
5772 eval_result = NULL;
5773 }
5774 }
5775 else
5776 {
5777 linenr_T save_reg_maxline;
5778 win_T *save_reg_win;
5779 int save_ireg_ic;
5780
5781 vim_free(eval_result);
5782
5783 /* The expression may contain substitute(), which calls us
5784 * recursively. Make sure submatch() gets the text from the first
5785 * level. Don't need to save "reg_buf", because
5786 * vim_regexec_multi() can't be called recursively. */
5787 submatch_match = reg_match;
5788 submatch_mmatch = reg_mmatch;
5789 save_reg_maxline = reg_maxline;
5790 save_reg_win = reg_win;
5791 save_ireg_ic = ireg_ic;
5792 can_f_submatch = TRUE;
5793
5794 eval_result = eval_to_string(source + 2, NULL);
5795 if (eval_result != NULL)
5796 {
5797 for (s = eval_result; *s != NUL; ++s)
5798 {
5799 /* Change NL to CR, so that it becomes a line break.
5800 * Skip over a backslashed character. */
5801 if (*s == NL)
5802 *s = CAR;
5803 else if (*s == '\\' && s[1] != NUL)
5804 ++s;
5805 #ifdef FEAT_MBYTE
5806 if (has_mbyte)
5807 s += (*mb_ptr2len_check)(s) - 1;
5808 #endif
5809 }
5810
5811 dst += STRLEN(eval_result);
5812 }
5813
5814 reg_match = submatch_match;
5815 reg_mmatch = submatch_mmatch;
5816 reg_maxline = save_reg_maxline;
5817 reg_win = save_reg_win;
5818 ireg_ic = save_ireg_ic;
5819 can_f_submatch = FALSE;
5820 }
5821 #endif
5822 }
5823 else
5824 while ((c = *src++) != NUL)
5825 {
5826 if (c == '&' && magic)
5827 no = 0;
5828 else if (c == '\\' && *src != NUL)
5829 {
5830 if (*src == '&' && !magic)
5831 {
5832 ++src;
5833 no = 0;
5834 }
5835 else if ('0' <= *src && *src <= '9')
5836 {
5837 no = *src++ - '0';
5838 }
5839 else if (vim_strchr((char_u *)"uUlLeE", *src))
5840 {
5841 switch (*src++)
5842 {
5843 case 'u': func = (fptr)do_upper;
5844 continue;
5845 case 'U': func = (fptr)do_Upper;
5846 continue;
5847 case 'l': func = (fptr)do_lower;
5848 continue;
5849 case 'L': func = (fptr)do_Lower;
5850 continue;
5851 case 'e':
5852 case 'E': func = (fptr)NULL;
5853 continue;
5854 }
5855 }
5856 }
5857 if (no < 0) /* Ordinary character. */
5858 {
5859 if (c == '\\' && *src != NUL)
5860 {
5861 /* Check for abbreviations -- webb */
5862 switch (*src)
5863 {
5864 case 'r': c = CAR; ++src; break;
5865 case 'n': c = NL; ++src; break;
5866 case 't': c = TAB; ++src; break;
5867 /* Oh no! \e already has meaning in subst pat :-( */
5868 /* case 'e': c = ESC; ++src; break; */
5869 case 'b': c = Ctrl_H; ++src; break;
5870
5871 /* If "backslash" is TRUE the backslash will be removed
5872 * later. Used to insert a literal CR. */
5873 default: if (backslash)
5874 {
5875 if (copy)
5876 *dst = '\\';
5877 ++dst;
5878 }
5879 c = *src++;
5880 }
5881 }
5882
5883 /* Write to buffer, if copy is set. */
5884 #ifdef FEAT_MBYTE
5885 if (has_mbyte && (l = (*mb_ptr2len_check)(src - 1)) > 1)
5886 {
5887 /* TODO: should use "func" here. */
5888 if (copy)
5889 mch_memmove(dst, src - 1, l);
5890 dst += l - 1;
5891 src += l - 1;
5892 }
5893 else
5894 {
5895 #endif
5896 if (copy)
5897 {
5898 if (func == (fptr)NULL) /* just copy */
5899 *dst = c;
5900 else /* change case */
5901 func = (fptr)(func(dst, c));
5902 /* Turbo C complains without the typecast */
5903 }
5904 #ifdef FEAT_MBYTE
5905 }
5906 #endif
5907 dst++;
5908 }
5909 else
5910 {
5911 if (REG_MULTI)
5912 {
5913 clnum = reg_mmatch->startpos[no].lnum;
5914 if (clnum < 0 || reg_mmatch->endpos[no].lnum < 0)
5915 s = NULL;
5916 else
5917 {
5918 s = reg_getline(clnum) + reg_mmatch->startpos[no].col;
5919 if (reg_mmatch->endpos[no].lnum == clnum)
5920 len = reg_mmatch->endpos[no].col
5921 - reg_mmatch->startpos[no].col;
5922 else
5923 len = (int)STRLEN(s);
5924 }
5925 }
5926 else
5927 {
5928 s = reg_match->startp[no];
5929 if (reg_match->endp[no] == NULL)
5930 s = NULL;
5931 else
5932 len = (int)(reg_match->endp[no] - s);
5933 }
5934 if (s != NULL)
5935 {
5936 for (;;)
5937 {
5938 if (len == 0)
5939 {
5940 if (REG_MULTI)
5941 {
5942 if (reg_mmatch->endpos[no].lnum == clnum)
5943 break;
5944 if (copy)
5945 *dst = CAR;
5946 ++dst;
5947 s = reg_getline(++clnum);
5948 if (reg_mmatch->endpos[no].lnum == clnum)
5949 len = reg_mmatch->endpos[no].col;
5950 else
5951 len = (int)STRLEN(s);
5952 }
5953 else
5954 break;
5955 }
5956 else if (*s == NUL) /* we hit NUL. */
5957 {
5958 if (copy)
5959 EMSG(_(e_re_damg));
5960 goto exit;
5961 }
5962 else
5963 {
5964 if (backslash && (*s == CAR || *s == '\\'))
5965 {
5966 /*
5967 * Insert a backslash in front of a CR, otherwise
5968 * it will be replaced by a line break.
5969 * Number of backslashes will be halved later,
5970 * double them here.
5971 */
5972 if (copy)
5973 {
5974 dst[0] = '\\';
5975 dst[1] = *s;
5976 }
5977 dst += 2;
5978 }
5979 #ifdef FEAT_MBYTE
5980 else if (has_mbyte && (l = (*mb_ptr2len_check)(s)) > 1)
5981 {
5982 /* TODO: should use "func" here. */
5983 if (copy)
5984 mch_memmove(dst, s, l);
5985 dst += l;
5986 s += l - 1;
5987 len -= l - 1;
5988 }
5989 #endif
5990 else
5991 {
5992 if (copy)
5993 {
5994 if (func == (fptr)NULL) /* just copy */
5995 *dst = *s;
5996 else /* change case */
5997 func = (fptr)(func(dst, *s));
5998 /* Turbo C complains without the typecast */
5999 }
6000 ++dst;
6001 }
6002 ++s;
6003 --len;
6004 }
6005 }
6006 }
6007 no = -1;
6008 }
6009 }
6010 if (copy)
6011 *dst = NUL;
6012
6013 exit:
6014 return (int)((dst - dest) + 1);
6015 }
6016
6017 #ifdef FEAT_EVAL
6018 /*
6019 * Used for the submatch() function: get the string from tne n'th submatch in
6020 * allocated memory.
6021 * Returns NULL when not in a ":s" command and for a non-existing submatch.
6022 */
6023 char_u *
6024 reg_submatch(no)
6025 int no;
6026 {
6027 char_u *retval = NULL;
6028 char_u *s;
6029 int len;
6030 int round;
6031 linenr_T lnum;
6032
6033 if (!can_f_submatch)
6034 return NULL;
6035
6036 if (submatch_match == NULL)
6037 {
6038 /*
6039 * First round: compute the length and allocate memory.
6040 * Second round: copy the text.
6041 */
6042 for (round = 1; round <= 2; ++round)
6043 {
6044 lnum = submatch_mmatch->startpos[no].lnum;
6045 if (lnum < 0 || submatch_mmatch->endpos[no].lnum < 0)
6046 return NULL;
6047
6048 s = reg_getline(lnum) + submatch_mmatch->startpos[no].col;
6049 if (s == NULL) /* anti-crash check, cannot happen? */
6050 break;
6051 if (submatch_mmatch->endpos[no].lnum == lnum)
6052 {
6053 /* Within one line: take form start to end col. */
6054 len = submatch_mmatch->endpos[no].col
6055 - submatch_mmatch->startpos[no].col;
6056 if (round == 2)
6057 {
6058 STRNCPY(retval, s, len);
6059 retval[len] = NUL;
6060 }
6061 ++len;
6062 }
6063 else
6064 {
6065 /* Multiple lines: take start line from start col, middle
6066 * lines completely and end line up to end col. */
6067 len = (int)STRLEN(s);
6068 if (round == 2)
6069 {
6070 STRCPY(retval, s);
6071 retval[len] = '\n';
6072 }
6073 ++len;
6074 ++lnum;
6075 while (lnum < submatch_mmatch->endpos[no].lnum)
6076 {
6077 s = reg_getline(lnum++);
6078 if (round == 2)
6079 STRCPY(retval + len, s);
6080 len += (int)STRLEN(s);
6081 if (round == 2)
6082 retval[len] = '\n';
6083 ++len;
6084 }
6085 if (round == 2)
6086 STRNCPY(retval + len, reg_getline(lnum),
6087 submatch_mmatch->endpos[no].col);
6088 len += submatch_mmatch->endpos[no].col;
6089 if (round == 2)
6090 retval[len] = NUL;
6091 ++len;
6092 }
6093
6094 if (round == 1)
6095 {
6096 retval = lalloc((long_u)len, TRUE);
6097 if (s == NULL)
6098 return NULL;
6099 }
6100 }
6101 }
6102 else
6103 {
6104 if (submatch_match->endp[no] == NULL)
6105 retval = NULL;
6106 else
6107 {
6108 s = submatch_match->startp[no];
6109 retval = vim_strnsave(s, (int)(submatch_match->endp[no] - s));
6110 }
6111 }
6112
6113 return retval;
6114 }
6115 #endif