vim: src/regexp.c comparison

comparison src/regexp.c @ 35166:0b259135fb3a v9.1.0409

patch 9.1.0409: too many strlen() calls in the regexp engine Commit: https://github.com/vim/vim/commit/82792db6315f7c7b0e299cdde1566f2932a463f8 Author: John Marriott <basilisk@internode.on.net> Date: Sun May 12 00:07:17 2024 +0200 patch 9.1.0409: too many strlen() calls in the regexp engine Problem: too many strlen() calls in the regexp engine Solution: refactor code to retrieve strlen differently, make use of bsearch() for getting the character class (John Marriott) closes: #14648 Signed-off-by: John Marriott <basilisk@internode.on.net> Signed-off-by: Christian Brabandt <cb@256bit.org>

author	Christian Brabandt <cb@256bit.org>
date	Sun, 12 May 2024 00:15:04 +0200
parents	3f8444c5a6f3
children	4aad918ac113

comparison

equal deleted inserted replaced

-:d0498ef60b5b
+:0b259135fb3a
 	return MULTI_MULT;
 return NOT_MULTI;
 }
 static char_u		*reg_prev_sub = NULL;
+static size_t		reg_prev_sublen = 0;
 /*
 * REGEXP_INRANGE contains all characters which are always special in a []
 * range after '\'.
 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
 	case 'b':   return BS;
 }
 return c;
 }
+enum	// identifiers for the "[:name:]" character classes returned by get_char_class()
+{
+CLASS_ALNUM = 0,
+CLASS_ALPHA,
+CLASS_BLANK,
+CLASS_CNTRL,
+CLASS_DIGIT,
+CLASS_GRAPH,
+CLASS_LOWER,
+CLASS_PRINT,
+CLASS_PUNCT,
+CLASS_SPACE,
+CLASS_UPPER,
+CLASS_XDIGIT,
+CLASS_TAB,
+CLASS_RETURN,
+CLASS_BACKSPACE,
+CLASS_ESCAPE,
+CLASS_IDENT,
+CLASS_KEYWORD,
+CLASS_FNAME,
+CLASS_NONE = 99	// no class name was recognized
+};
 /*
 * Check for a character class name "[:name:]".  "pp" points to the '['.
 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
 * recognized.  Otherwise "pp" is advanced to after the item.
 */
 static int
 get_char_class(char_u **pp)
 {
-static const char *(class_names[]) =
+// must be sorted by the 'value' field because it is used by bsearch()!
-{
+static keyvalue_T char_class_tab[] =
-	"alnum:]",
+{
-#define CLASS_ALNUM 0
+	KEYVALUE_ENTRY(CLASS_ALNUM, "alnum:]"),
-	"alpha:]",
+	KEYVALUE_ENTRY(CLASS_ALPHA, "alpha:]"),
-#define CLASS_ALPHA 1
+	KEYVALUE_ENTRY(CLASS_BACKSPACE, "backspace:]"),
-	"blank:]",
+	KEYVALUE_ENTRY(CLASS_BLANK, "blank:]"),
-#define CLASS_BLANK 2
+	KEYVALUE_ENTRY(CLASS_CNTRL, "cntrl:]"),
-	"cntrl:]",
+	KEYVALUE_ENTRY(CLASS_DIGIT, "digit:]"),
-#define CLASS_CNTRL 3
+	KEYVALUE_ENTRY(CLASS_ESCAPE, "escape:]"),
-	"digit:]",
+	KEYVALUE_ENTRY(CLASS_FNAME, "fname:]"),
-#define CLASS_DIGIT 4
+	KEYVALUE_ENTRY(CLASS_GRAPH, "graph:]"),
-	"graph:]",
+	KEYVALUE_ENTRY(CLASS_IDENT, "ident:]"),
-#define CLASS_GRAPH 5
+	KEYVALUE_ENTRY(CLASS_KEYWORD, "keyword:]"),
-	"lower:]",
+	KEYVALUE_ENTRY(CLASS_LOWER, "lower:]"),
-#define CLASS_LOWER 6
+	KEYVALUE_ENTRY(CLASS_PRINT, "print:]"),
-	"print:]",
+	KEYVALUE_ENTRY(CLASS_PUNCT, "punct:]"),
-#define CLASS_PRINT 7
+	KEYVALUE_ENTRY(CLASS_RETURN, "return:]"),
-	"punct:]",
+	KEYVALUE_ENTRY(CLASS_SPACE, "space:]"),
-#define CLASS_PUNCT 8
+	KEYVALUE_ENTRY(CLASS_TAB, "tab:]"),
-	"space:]",
+	KEYVALUE_ENTRY(CLASS_UPPER, "upper:]"),
-#define CLASS_SPACE 9
+	KEYVALUE_ENTRY(CLASS_XDIGIT, "xdigit:]")
-	"upper:]",
-#define CLASS_UPPER 10
-	"xdigit:]",
-#define CLASS_XDIGIT 11
-	"tab:]",
-#define CLASS_TAB 12
-	"return:]",
-#define CLASS_RETURN 13
-	"backspace:]",
-#define CLASS_BACKSPACE 14
-	"escape:]",
-#define CLASS_ESCAPE 15
-	"ident:]",
-#define CLASS_IDENT 16
-	"keyword:]",
-#define CLASS_KEYWORD 17
-	"fname:]",
-#define CLASS_FNAME 18
 };
-#define CLASS_NONE 99
-int i;
+// check that the value of "pp" has a chance of matching
+if ((*pp)[1] == ':' && ASCII_ISLOWER((*pp)[2])
-if ((*pp)[1] == ':')
+			&& ASCII_ISLOWER((*pp)[3]) && ASCII_ISLOWER((*pp)[4]))
 {
-	for (i = 0; i < (int)ARRAY_LENGTH(class_names); ++i)
+	keyvalue_T target;
-	    if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
+	keyvalue_T *entry;
-	    {
+	// this function can be called repeatedly with the same value for "pp"
-		*pp += STRLEN(class_names[i]) + 2;
+	// so we cache the last found entry.
-		return i;
+	static keyvalue_T *last_entry = NULL;
-	    }
+	target.key = 0;
+	target.value = (char *)*pp + 2;
+	target.length = 0;		    // not used, see cmp_keyvalue_value_n()
+	if (last_entry != NULL && cmp_keyvalue_value_n(&target, last_entry) == 0)
+	    entry = last_entry;
+	else
+	    entry = (keyvalue_T *)bsearch(&target, &char_class_tab,
+					ARRAY_LENGTH(char_class_tab),
+					sizeof(char_class_tab[0]), cmp_keyvalue_value_n);
+	if (entry != NULL)
+	{
+	    last_entry = entry;
+	    *pp += entry->length + 2;
+	    return entry->key;
+	}
 }
 return CLASS_NONE;
 }
 /*
 	}
 	else if (p[0] == '\\' && p[1] != NUL)
 	{
 	    if (dirc == '?' && newp != NULL && p[1] == '?')
 	    {
+		size_t	startplen;
 		// change "\?" to "?", make a copy first.
 		if (*newp == NULL)
 		{
-		    *newp = vim_strsave(startp);
+		    startplen = STRLEN(startp);
+		    *newp = vim_strnsave(startp, startplen);
 		    if (*newp != NULL)
 			p = *newp + (p - startp);
 		}
 		if (dropped != NULL)
 		    ++*dropped;
 		if (*newp != NULL)
-		    STRMOVE(p, p + 1);
+		    mch_memmove(p, p + 1, (startplen - ((p + 1) - *newp)) + 1);
 		else
 		    ++p;
 	    }
 	    else
 		++p;    // skip next character
 reg_iswordc(int c)
 {
 return vim_iswordc_buf(c, rex.reg_buf);
 }
+#ifdef FEAT_EVAL
+static int can_f_submatch = FALSE;	// TRUE when submatch() can be used
+// This struct is used for reg_submatch().  It is needed when the
+// substitution string is an expression that contains a call to substitute()
+// and submatch().
+typedef struct {
+regmatch_T	*sm_match;
+regmmatch_T	*sm_mmatch;	// its startpos[]/endpos[] give the submatch positions
+linenr_T	sm_firstlnum;	// base line number; the relative "lnum" is added to it
+linenr_T	sm_maxline;	// relative line numbers above this are past the last line
+int		sm_line_lbr;
+} regsubmatch_T;
+static regsubmatch_T rsm;  // can only be used when can_f_submatch is TRUE
+#endif
+typedef enum	// bitmask flags telling reg_getline_common() what to return
+{
+RGLF_LINE = 0x01,	// store the line pointer in "*line"
+RGLF_LENGTH = 0x02	// store the line length in "*length"
+#ifdef FEAT_EVAL
+,
+RGLF_SUBMATCH = 0x04	// take first/max line from "rsm" instead of "rex"
+#endif
+} reg_getline_flags_T;
+//
+// Common code for reg_getline(), reg_getline_len(), reg_getline_submatch() and
+// reg_getline_submatch_len().  "lnum" is relative to reg_firstlnum (or sm_firstlnum).
+// The "flags" argument (a bitmask) controls what info is returned ("*line" for
+// RGLF_LINE, "*length" for RGLF_LENGTH) and whether or not submatch is in effect.
+// Note: "line" and "length" are only written when their flag is set, so the
+//     unused one may be NULL.  Submatch is available only if FEAT_EVAL is defined.
+static void
+reg_getline_common(linenr_T lnum, reg_getline_flags_T flags, char_u **line, colnr_T *length)
+{
+int get_line = flags & RGLF_LINE;	// non-zero: caller wants "*line"
+int get_length = flags & RGLF_LENGTH;	// non-zero: caller wants "*length"
+linenr_T firstlnum;	// absolute line number of the requested line
+linenr_T maxline;	// relative line numbers above this are past the last line
+#ifdef FEAT_EVAL
+if (flags & RGLF_SUBMATCH)
+{
+	firstlnum = rsm.sm_firstlnum + lnum;
+	maxline = rsm.sm_maxline;
+}
+else
+#endif
+{
+	firstlnum = rex.reg_firstlnum + lnum;
+	maxline = rex.reg_maxline;
+}
+// When looking behind for a match/no-match "lnum" is negative.  But we
+// can't go before line 1: report a NULL line and a length of zero.
+if (firstlnum < 1)
+{
+	if (get_line)
+	    *line = NULL;
+	if (get_length)
+	    *length = 0;
+	return;
+}
+if (lnum > maxline)
+{
+	// Must have matched the "\n" in the last line: report an empty line.
+	if (get_line)
+	    *line = (char_u *)"";
+	if (get_length)
+	    *length = 0;
+	return;
+}
+if (get_line)
+	*line = ml_get_buf(rex.reg_buf, firstlnum, FALSE);	// buffer is rex.reg_buf, also for a submatch
+if (get_length)
+	*length = ml_get_buf_len(rex.reg_buf, firstlnum);
+}
 /*
 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
 */
 static char_u *
 reg_getline(linenr_T lnum)
 {
-// when looking behind for a match/no-match lnum is negative.  But we
+char_u *line;
-// can't go before line 1
-if (rex.reg_firstlnum + lnum < 1)
+reg_getline_common(lnum, RGLF_LINE, &line, NULL);
-	return NULL;
-if (lnum > rex.reg_maxline)
+return line;
-	// Must have matched the "\n" in the last line.
+}
-	return (char_u *)"";
-return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, FALSE);
+/*
+* Get length of line "lnum", which is relative to "reg_firstlnum".
+* Returns 0 when the line would be before line 1 or "lnum" is above
+* "reg_maxline".
+*/
+static colnr_T
+reg_getline_len(linenr_T lnum)
+{
+colnr_T length;
+reg_getline_common(lnum, RGLF_LENGTH, NULL, &length);	// "line" is not requested, NULL is never written through
+return length;
 }
 #ifdef FEAT_SYN_HL
 static char_u	*reg_startzp[NSUBEXP];	// Workspace to mark beginning
 static char_u	*reg_endzp[NSUBEXP];	//   and end of \z(...\) matches
 	// Get the line to compare with.
 	p = reg_getline(clnum);
 	if (clnum == end_lnum)
 	    len = end_col - ccol;
 	else
-	    len = (int)STRLEN(p + ccol);
+	    len = (int)reg_getline_len(clnum) - ccol;
 	if (cstrncmp(p + ccol, rex.input, &len) != 0)
 	    return RA_NOMATCH;  // doesn't match
 	if (bytelen != NULL)
 	    *bytelen += len;
 char_u *
 regtilde(char_u *source, int magic)
 {
 char_u	*newsub = source;
 char_u	*p;
+size_t	newsublen = 0;
+char_u	tilde[3] = {'~', NUL, NUL};
+size_t	tildelen = 1;
+int		error = FALSE;
+if (!magic)
+{
+	tilde[0] = '\\';
+	tilde[1] = '~';
+	tilde[2] = NUL;
+	tildelen = 2;
+}
 for (p = newsub; *p; ++p)
 {
-	if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
+	if (STRNCMP(p, tilde, tildelen) == 0)
 	{
-	    if (reg_prev_sub != NULL)
+	    size_t prefixlen = p - newsub;		// not including the tilde
+	    char_u *postfix = p + tildelen;
+	    size_t postfixlen;
+	    size_t tmpsublen;
+	    if (newsublen == 0)
+		newsublen = STRLEN(newsub);
+	    newsublen -= tildelen;
+	    postfixlen = newsublen - prefixlen;
+	    tmpsublen = prefixlen + reg_prev_sublen + postfixlen;
+	    if (tmpsublen > 0 && reg_prev_sub != NULL)
 	    {
-		// length = len(newsub) - 1 + len(prev_sub) + 1
+		char_u *tmpsub;
 		// Avoid making the text longer than MAXCOL, it will cause
 		// trouble at some point.
-		size_t	prevsublen = STRLEN(reg_prev_sub);
+		if (tmpsublen > MAXCOL)
-		size_t  newsublen = STRLEN(newsub);
-		if (prevsublen > MAXCOL || newsublen > MAXCOL
-					    || newsublen + prevsublen > MAXCOL)
 		{
 		    emsg(_(e_resulting_text_too_long));
+		    error = TRUE;
 		    break;
 		}
-		char_u *tmpsub = alloc(newsublen + prevsublen);
+		tmpsub = alloc(tmpsublen + 1);
-		if (tmpsub != NULL)
+		if (tmpsub == NULL)
 		{
-		    // copy prefix
+		    emsg(_(e_out_of_memory));
-		    size_t prefixlen = p - newsub;	// not including ~
+		    error = TRUE;
-		    mch_memmove(tmpsub, newsub, prefixlen);
+		    break;
-		    // interpret tilde
-		    mch_memmove(tmpsub + prefixlen, reg_prev_sub,
-							       prevsublen);
-		    // copy postfix
-		    if (!magic)
-			++p;			// back off backslash
-		    STRCPY(tmpsub + prefixlen + prevsublen, p + 1);
-		    if (newsub != source)	// allocated newsub before
-			vim_free(newsub);
-		    newsub = tmpsub;
-		    p = newsub + prefixlen + prevsublen;
 		}
+		// copy prefix
+		mch_memmove(tmpsub, newsub, prefixlen);
+		// interpret tilde
+		mch_memmove(tmpsub + prefixlen, reg_prev_sub, reg_prev_sublen);
+		// copy postfix
+		STRCPY(tmpsub + prefixlen + reg_prev_sublen, postfix);
+		if (newsub != source)	// allocated newsub before
+		    vim_free(newsub);
+		newsub = tmpsub;
+		newsublen = tmpsublen;
+		p = newsub + prefixlen + reg_prev_sublen;
 	    }
-	    else if (magic)
-		STRMOVE(p, p + 1);	// remove '~'
 	    else
-		STRMOVE(p, p + 2);	// remove '\~'
+		mch_memmove(p, postfix, postfixlen + 1);	// remove the tilde (+1 for the NUL)
 	    --p;
 	}
 	else
 	{
 	    if (*p == '\\' && p[1])		// skip escaped characters
 	    if (has_mbyte)
 		p += (*mb_ptr2len)(p) - 1;
 	}
 }
+if (error)
+{
+	if (newsub != source)
+	    vim_free(newsub);
+	return source;
+}
 // Store a copy of newsub  in reg_prev_sub.  It is always allocated,
 // because recursive calls may make the returned string invalid.
-vim_free(reg_prev_sub);
+// Only store it if there is something to store.
-reg_prev_sub = vim_strsave(newsub);
+newsublen = p - newsub;
+if (newsublen == 0)
+	VIM_CLEAR(reg_prev_sub);
+else
+{
+	vim_free(reg_prev_sub);
+	reg_prev_sub = vim_strnsave(newsub, newsublen);
+}
+if (reg_prev_sub == NULL)
+	reg_prev_sublen = 0;
+else
+	reg_prev_sublen = newsublen;
 return newsub;
 }
-#ifdef FEAT_EVAL
-static int can_f_submatch = FALSE;	// TRUE when submatch() can be used
-// These pointers are used for reg_submatch().  Needed for when the
-// substitution string is an expression that contains a call to substitute()
-// and submatch().
-typedef struct {
-regmatch_T	*sm_match;
-regmmatch_T	*sm_mmatch;
-linenr_T	sm_firstlnum;
-linenr_T	sm_maxline;
-int		sm_line_lbr;
-} regsubmatch_T;
-static regsubmatch_T rsm;  // can only be used when can_f_submatch is TRUE
-#endif
 #ifdef FEAT_EVAL
 /*
 * Put the submatches in "argv[argskip]" which is a list passed into
 	// resulting string is saved from the call with
 	// "flags & REGSUB_COPY" == 0 to the call with
 	// "flags & REGSUB_COPY" != 0.
 	if (copy)
 	{
-	    if (eval_result[nested] != NULL &&
+	    if (eval_result[nested] != NULL)
-		    (int)STRLEN(eval_result[nested]) < destlen)
 	    {
-		STRCPY(dest, eval_result[nested]);
+		int eval_len = (int)STRLEN(eval_result[nested]);
-		dst += STRLEN(eval_result[nested]);
-		VIM_CLEAR(eval_result[nested]);
+		if (eval_len < destlen)
+		{
+		    STRCPY(dest, eval_result[nested]);
+		    dst += eval_len;
+		    VIM_CLEAR(eval_result[nested]);
+		}
 	    }
 	}
 	else
 	{
 	    int		    prev_can_f_submatch = can_f_submatch;
 		    s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
 		    if (rex.reg_mmatch->endpos[no].lnum == clnum)
 			len = rex.reg_mmatch->endpos[no].col
 					    - rex.reg_mmatch->startpos[no].col;
 		    else
-			len = (int)STRLEN(s);
+			len = (int)reg_getline_len(clnum) - rex.reg_mmatch->startpos[no].col;
 		}
 	    }
 	    else
 	    {
 		s = rex.reg_match->startp[no];
 			    ++dst;
 			    s = reg_getline(++clnum);
 			    if (rex.reg_mmatch->endpos[no].lnum == clnum)
 				len = rex.reg_mmatch->endpos[no].col;
 			    else
-				len = (int)STRLEN(s);
+				len = (int)reg_getline_len(clnum);
 			}
 			else
 			    break;
 		    }
 		    else if (*s == NUL) // we hit NUL.
 exit:
 return (int)((dst - dest) + 1);
 }
 #ifdef FEAT_EVAL
-/*
-* Call reg_getline() with the line numbers from the submatch.  If a
-* substitute() was used the reg_maxline and other values have been
-* overwritten.
-*/
 static char_u *
 reg_getline_submatch(linenr_T lnum)
 {
-char_u *s;
+char_u *line;
-linenr_T save_first = rex.reg_firstlnum;
-linenr_T save_max = rex.reg_maxline;
+reg_getline_common(lnum, RGLF_LINE | RGLF_SUBMATCH, &line, NULL);
-rex.reg_firstlnum = rsm.sm_firstlnum;
+return line;
-rex.reg_maxline = rsm.sm_maxline;
+}
-s = reg_getline(lnum);
+static colnr_T
+reg_getline_submatch_len(linenr_T lnum)
-rex.reg_firstlnum = save_first;
+{
-rex.reg_maxline = save_max;
+colnr_T length;
-return s;
+reg_getline_common(lnum, RGLF_LENGTH | RGLF_SUBMATCH, NULL, &length);
+return length;
 }
 /*
 * Used for the submatch() function: get the string from the n'th submatch in
 * allocated memory.
 	    }
 	    else
 	    {
 		// Multiple lines: take start line from start col, middle
 		// lines completely and end line up to end col.
-		len = (int)STRLEN(s);
+		len = (int)reg_getline_submatch_len(lnum) - rsm.sm_mmatch->startpos[no].col;
 		if (round == 2)
 		{
 		    STRCPY(retval, s);
 		    retval[len] = '\n';
 		}
 		++len;
 		++lnum;
 		while (lnum < rsm.sm_mmatch->endpos[no].lnum)
 		{
-		    s = reg_getline_submatch(lnum++);
+		    s = reg_getline_submatch(lnum);
 		    if (round == 2)
 			STRCPY(retval + len, s);
-		    len += (int)STRLEN(s);
+		    len += (int)reg_getline_submatch_len(lnum);
 		    if (round == 2)
 			retval[len] = '\n';
 		    ++len;
+		    ++lnum;
 		}
 		if (round == 2)
 		    STRNCPY(retval + len, reg_getline_submatch(lnum),
 					     rsm.sm_mmatch->endpos[no].col);
 		len += rsm.sm_mmatch->endpos[no].col;
 	    if (list_append_string(list, s, ecol - scol) == FAIL)
 		error = TRUE;
 	}
 	else
 	{
+	    int max_lnum = elnum - slnum;
 	    if (list_append_string(list, s, -1) == FAIL)
 		error = TRUE;
-	    for (i = 1; i < elnum - slnum; i++)
+	    for (i = 1; i < max_lnum; i++)
 	    {
 		s = reg_getline_submatch(slnum + i);
 		if (list_append_string(list, s, -1) == FAIL)
 		    error = TRUE;
 	    }

Mercurial > vim

comparison src/regexp.c @ 35166:0b259135fb3a v9.1.0409