xref: /vim-8.2.3635/src/regexp.c (revision 64066b9a)
1 /* vi:set ts=8 sts=4 sw=4 noet:
2  *
3  * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
4  */
5 
6 // By default: do not create debugging logs or files related to regular
7 // expressions, even when compiling with -DDEBUG.
8 // Uncomment the second line to get the regexp debugging.
9 #undef DEBUG
10 // #define DEBUG
11 
12 #include "vim.h"
13 
14 #ifdef DEBUG
15 // show/save debugging data when BT engine is used
16 # define BT_REGEXP_DUMP
17 // save the debugging data to a file instead of displaying it
18 # define BT_REGEXP_LOG
19 # define BT_REGEXP_DEBUG_LOG
20 # define BT_REGEXP_DEBUG_LOG_NAME	"bt_regexp_debug.log"
21 #endif
22 
/*
 * Magic characters are the ones with a special meaning in a pattern; they do
 * not match literally.  They are stored as negative numbers, which keeps them
 * apart from literal characters (possibly multi-byte).  Only ASCII characters
 * can be Magic.
 */
#define Magic(x)	((int)(x) - 256)
#define un_Magic(x)	((x) + 256)
#define is_Magic(x)	((x) < 0)

/*
 * Return "x" without its magicness: a Magic value is turned back into the
 * plain character, any other value is returned unchanged.
 */
    static int
no_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : x;
}

/*
 * Flip the magicness of "x": a Magic value becomes the plain character and a
 * plain character becomes its Magic value.
 */
    static int
toggle_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
47 
48 /*
49  * The first byte of the BT regexp internal "program" is actually this magic
50  * number; the start node begins in the second byte.  It's used to catch the
51  * most severe mutilation of the program by the caller.
52  */
53 
54 #define REGMAGIC	0234
55 
56 /*
57  * Utility definitions.
58  */
// Fetch the char_u that "p" points to and widen it to int.
59 #define UCHARAT(p)	((int)*(char_u *)(p))
60 
61 // Used for an error (down from) vim_regcomp(): give the error message, set
62 // rc_did_emsg and return NULL
//
// Every macro below expands to a complete "return" statement built with the
// comma operator: the message is given first, then rc_did_emsg is set to TRUE
// and the last operand (NULL or FAIL) is the value that is returned.  They can
// therefore only be used inside a function with a matching return type.
//
// In the EMSG2/EMSG3 forms "m" is a format with a "%s" item; it is filled with
// "" when "c" is non-zero and with a backslash otherwise.  EMSG3 passes "a" as
// one more argument for the format.
63 #define EMSG_RET_NULL(m) return (emsg((m)), rc_did_emsg = TRUE, (void *)NULL)
64 #define IEMSG_RET_NULL(m) return (iemsg((m)), rc_did_emsg = TRUE, (void *)NULL)
65 #define EMSG_RET_FAIL(m) return (emsg((m)), rc_did_emsg = TRUE, FAIL)
66 #define EMSG2_RET_NULL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
67 #define EMSG3_RET_NULL(m, c, a) return (semsg((const char *)(m), (c) ? "" : "\\", (a)), rc_did_emsg = TRUE, (void *)NULL)
68 #define EMSG2_RET_FAIL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL)
// Shorthand for reporting E369; "c" is whether reg_magic equals MAGIC_ALL.
69 #define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
70 
71 
// Upper limit that read_limits() stores when no maximum is given.
72 #define MAX_LIMIT	(32767L << 16L)
73 
74 static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
75 static char_u e_reverse_range[] = N_("E944: Reverse range in character class");
76 static char_u e_large_class[] = N_("E945: Range too large in character class");
77 #ifdef FEAT_SYN_HL
78 static char_u e_z_not_allowed[] = N_("E66: \\z( not allowed here");
79 static char_u e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here");
80 #endif
81 static char_u e_missing_sb[] = N_("E69: Missing ] after %s%%[");
82 static char_u e_empty_sb[]  = N_("E70: Empty %s%%[]");
83 static char_u e_recursive[]  = N_("E956: Cannot use pattern recursively");
84 
85 #define NOT_MULTI	0
86 #define MULTI_ONE	1
87 #define MULTI_MULT	2
88 
89 // return values for regmatch()
90 #define RA_FAIL		1	// something failed, abort
91 #define RA_CONT		2	// continue in inner loop
92 #define RA_BREAK	3	// break inner loop
93 #define RA_MATCH	4	// successful match
94 #define RA_NOMATCH	5	// didn't match
95 
96 /*
97  * Return NOT_MULTI if c is not a "multi" operator.
98  * Return MULTI_ONE if c is a single "multi" operator.
99  * Return MULTI_MULT if c is a multi "multi" operator.
100  */
101     static int
re_multi_type(int c)102 re_multi_type(int c)
103 {
104     if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
105 	return MULTI_ONE;
106     if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
107 	return MULTI_MULT;
108     return NOT_MULTI;
109 }
110 
111 static char_u		*reg_prev_sub = NULL;
112 
113 /*
114  * REGEXP_INRANGE contains all characters which are always special in a []
115  * range after '\'.
116  * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
117  * These are:
118  *  \n	- New line (NL).
119  *  \r	- Carriage Return (CR).
120  *  \t	- Tab (TAB).
121  *  \e	- Escape (ESC).
122  *  \b	- Backspace (Ctrl_H).
123  *  \d  - Character code in decimal, eg \d123
124  *  \o	- Character code in octal, eg \o40
125  *  \x	- Character code in hex, eg \x4a
126  *  \u	- Multibyte character code, eg \u20ac
127  *  \U	- Long multibyte character code, eg \U12345678
128  */
129 static char_u REGEXP_INRANGE[] = "]^-n\\";
130 static char_u REGEXP_ABBR[] = "nrtebdoxuU";
131 
132 /*
133  * Translate '\x' to its control character, except "\n", which is Magic.
134  */
135     static int
backslash_trans(int c)136 backslash_trans(int c)
137 {
138     switch (c)
139     {
140 	case 'r':   return CAR;
141 	case 't':   return TAB;
142 	case 'e':   return ESC;
143 	case 'b':   return BS;
144     }
145     return c;
146 }
147 
148 /*
149  * Check for a character class name "[:name:]".  "pp" points to the '['.
150  * Returns one of the CLASS_ items. CLASS_NONE means that no item was
151  * recognized.  Otherwise "pp" is advanced to after the item.
152  */
153     static int
get_char_class(char_u ** pp)154 get_char_class(char_u **pp)
155 {
156     static const char *(class_names[]) =
157     {
158 	"alnum:]",
159 #define CLASS_ALNUM 0
160 	"alpha:]",
161 #define CLASS_ALPHA 1
162 	"blank:]",
163 #define CLASS_BLANK 2
164 	"cntrl:]",
165 #define CLASS_CNTRL 3
166 	"digit:]",
167 #define CLASS_DIGIT 4
168 	"graph:]",
169 #define CLASS_GRAPH 5
170 	"lower:]",
171 #define CLASS_LOWER 6
172 	"print:]",
173 #define CLASS_PRINT 7
174 	"punct:]",
175 #define CLASS_PUNCT 8
176 	"space:]",
177 #define CLASS_SPACE 9
178 	"upper:]",
179 #define CLASS_UPPER 10
180 	"xdigit:]",
181 #define CLASS_XDIGIT 11
182 	"tab:]",
183 #define CLASS_TAB 12
184 	"return:]",
185 #define CLASS_RETURN 13
186 	"backspace:]",
187 #define CLASS_BACKSPACE 14
188 	"escape:]",
189 #define CLASS_ESCAPE 15
190 	"ident:]",
191 #define CLASS_IDENT 16
192 	"keyword:]",
193 #define CLASS_KEYWORD 17
194 	"fname:]",
195 #define CLASS_FNAME 18
196     };
197 #define CLASS_NONE 99
198     int i;
199 
200     if ((*pp)[1] == ':')
201     {
202 	for (i = 0; i < (int)ARRAY_LENGTH(class_names); ++i)
203 	    if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
204 	    {
205 		*pp += STRLEN(class_names[i]) + 2;
206 		return i;
207 	    }
208     }
209     return CLASS_NONE;
210 }
211 
212 /*
213  * Specific version of character class functions.
214  * Using a table to keep this fast.
215  */
// One entry for every byte value, holding the RI_ flags that apply to it.
// The table is filled by init_class_tab() and tested by the ri_*() macros.
216 static short	class_tab[256];
217 
// Flag bits for class_tab[], as assigned by init_class_tab():
218 #define	    RI_DIGIT	0x01	// '0' - '9'
219 #define	    RI_HEX	0x02	// '0' - '9', 'a' - 'f', 'A' - 'F'
220 #define	    RI_OCTAL	0x04	// '0' - '7'
221 #define	    RI_WORD	0x08	// digits, letters and '_'
222 #define	    RI_HEAD	0x10	// letters and '_'
223 #define	    RI_ALPHA	0x20	// letters
224 #define	    RI_LOWER	0x40	// lower case letters
225 #define	    RI_UPPER	0x80	// upper case letters
226 #define	    RI_WHITE	0x100	// space and tab
227 
228     static void
init_class_tab(void)229 init_class_tab(void)
230 {
231     int		i;
232     static int	done = FALSE;
233 
234     if (done)
235 	return;
236 
237     for (i = 0; i < 256; ++i)
238     {
239 	if (i >= '0' && i <= '7')
240 	    class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
241 	else if (i >= '8' && i <= '9')
242 	    class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
243 	else if (i >= 'a' && i <= 'f')
244 	    class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
245 #ifdef EBCDIC
246 	else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
247 						    || (i >= 's' && i <= 'z'))
248 #else
249 	else if (i >= 'g' && i <= 'z')
250 #endif
251 	    class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
252 	else if (i >= 'A' && i <= 'F')
253 	    class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
254 #ifdef EBCDIC
255 	else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
256 						    || (i >= 'S' && i <= 'Z'))
257 #else
258 	else if (i >= 'G' && i <= 'Z')
259 #endif
260 	    class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
261 	else if (i == '_')
262 	    class_tab[i] = RI_WORD + RI_HEAD;
263 	else
264 	    class_tab[i] = 0;
265     }
266     class_tab[' '] |= RI_WHITE;
267     class_tab['\t'] |= RI_WHITE;
268     done = TRUE;
269 }
270 
// Character class tests that look up "c" in class_tab[].  A value of 0x100 or
// above never matches.  "c" must not be negative, it is used as the index
// without a lower bound check.  Note that "c" is evaluated twice.
// The argument is parenthesized so that an expression with a low-precedence
// operator, e.g. "x & 0xff", is compared as a whole.
#define ri_digit(c)	((c) < 0x100 && (class_tab[c] & RI_DIGIT))
#define ri_hex(c)	((c) < 0x100 && (class_tab[c] & RI_HEX))
#define ri_octal(c)	((c) < 0x100 && (class_tab[c] & RI_OCTAL))
#define ri_word(c)	((c) < 0x100 && (class_tab[c] & RI_WORD))
#define ri_head(c)	((c) < 0x100 && (class_tab[c] & RI_HEAD))
#define ri_alpha(c)	((c) < 0x100 && (class_tab[c] & RI_ALPHA))
#define ri_lower(c)	((c) < 0x100 && (class_tab[c] & RI_LOWER))
#define ri_upper(c)	((c) < 0x100 && (class_tab[c] & RI_UPPER))
#define ri_white(c)	((c) < 0x100 && (class_tab[c] & RI_WHITE))
280 
281 // flags for regflags
282 #define RF_ICASE    1	// ignore case
283 #define RF_NOICASE  2	// don't ignore case
284 #define RF_HASNL    4	// can match a NL
285 #define RF_ICOMBINE 8	// ignore combining characters
286 #define RF_LOOKBH   16	// uses "\@<=" or "\@<!"
287 
288 /*
289  * Global work variables for vim_regcomp().
290  */
291 
292 static char_u	*regparse;	// Input-scan pointer.
293 static int	regnpar;	// () count.
294 static int	wants_nfa;	// regex should use NFA engine
295 #ifdef FEAT_SYN_HL
296 static int	regnzpar;	// \z() count.
297 static int	re_has_z;	// \z item detected
298 #endif
299 static unsigned	regflags;	// RF_ flags for prog
300 #if defined(FEAT_SYN_HL) || defined(PROTO)
301 static int	had_eol;	// TRUE when EOL found by vim_regcomp()
302 #endif
303 
304 static magic_T	reg_magic;	// magicness of the pattern
305 
306 static int	reg_string;	// matching with a string instead of a buffer
307 				// line
308 static int	reg_strict;	// "[abc" is illegal
309 
310 /*
311  * META contains all characters that may be magic, except '^' and '$'.
312  */
313 
314 #ifdef EBCDIC
315 static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
316 #else
317 // META[] is used often enough to justify turning it into a table.
// The table is indexed with the character value, a non-zero entry means the
// character is in META.  It has 127 entries, for 0 up to and including '~',
// thus the index must be checked to be <= '~' first, like peekchr() does.
// NOTE(review): the entry for 'Z' is set here while the EBCDIC string above
// does not contain 'Z' - confirm which one is intended.
318 static char_u META_flags[] = {
319     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
320     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
321 //		   %  &     (  )  *  +	      .
322     0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
323 //     1  2  3	4  5  6  7  8  9	<  =  >  ?
324     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
325 //  @  A     C	D     F     H  I     K	L  M	 O
326     1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
327 //  P	     S	   U  V  W  X	  Z  [		 _
328     1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
329 //     a     c	d     f     h  i     k	l  m  n  o
330     0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
331 //  p	     s	   u  v  w  x	  z  {	|     ~
332     1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
333 };
334 #endif
335 
336 static int	curchr;		// currently parsed character
337 // Previous character.  Note: prevchr is sometimes -1 when we are not at the
338 // start, eg in /[ ^I]^ the pattern was never found even if it existed,
339 // because ^ was taken to be magic -- webb
340 static int	prevchr;
341 static int	prevprevchr;	// previous-previous character
342 static int	nextchr;	// used for ungetchr()
343 
344 // arguments for reg()
345 #define REG_NOPAREN	0	// toplevel reg()
346 #define REG_PAREN	1	// \(\)
347 #define REG_ZPAREN	2	// \z(\)
348 #define REG_NPAREN	3	// \%(\)
349 
350 typedef struct
351 {
352      char_u	*regparse;
353      int	prevchr_len;
354      int	curchr;
355      int	prevchr;
356      int	prevprevchr;
357      int	nextchr;
358      int	at_start;
359      int	prev_at_start;
360      int	regnpar;
361 } parse_state_T;
362 
363 static void	initchr(char_u *);
364 static int	getchr(void);
365 static void	skipchr_keepstart(void);
366 static int	peekchr(void);
367 static void	skipchr(void);
368 static void	ungetchr(void);
369 static long	gethexchrs(int maxinputlen);
370 static long	getoctchrs(void);
371 static long	getdecchrs(void);
372 static int	coll_get_char(void);
373 static int	prog_magic_wrong(void);
374 static int	cstrncmp(char_u *s1, char_u *s2, int *n);
375 static char_u	*cstrchr(char_u *, int);
376 static int	re_mult_next(char *what);
377 static int	reg_iswordc(int);
378 #ifdef FEAT_EVAL
379 static void report_re_switch(char_u *pat);
380 #endif
381 
382 static regengine_T bt_regengine;
383 static regengine_T nfa_regengine;
384 
385 /*
386  * Return TRUE if compiled regular expression "prog" can match a line break.
387  */
388     int
re_multiline(regprog_T * prog)389 re_multiline(regprog_T *prog)
390 {
391     return (prog->regflags & RF_HASNL);
392 }
393 
394 /*
395  * Check for an equivalence class name "[=a=]".  "pp" points to the '['.
396  * Returns a character representing the class. Zero means that no item was
397  * recognized.  Otherwise "pp" is advanced to after the item.
398  */
399     static int
get_equi_class(char_u ** pp)400 get_equi_class(char_u **pp)
401 {
402     int		c;
403     int		l = 1;
404     char_u	*p = *pp;
405 
406     if (p[1] == '=' && p[2] != NUL)
407     {
408 	if (has_mbyte)
409 	    l = (*mb_ptr2len)(p + 2);
410 	if (p[l + 2] == '=' && p[l + 3] == ']')
411 	{
412 	    if (has_mbyte)
413 		c = mb_ptr2char(p + 2);
414 	    else
415 		c = p[2];
416 	    *pp += l + 4;
417 	    return c;
418 	}
419     }
420     return 0;
421 }
422 
423 #ifdef EBCDIC
424 /*
425  * Table for equivalence class "c". (IBM-1047)
426  */
427 static char *EQUIVAL_CLASS_C[16] = {
428     "A\x62\x63\x64\x65\x66\x67",
429     "C\x68",
430     "E\x71\x72\x73\x74",
431     "I\x75\x76\x77\x78",
432     "N\x69",
433     "O\xEB\xEC\xED\xEE\xEF\x80",
434     "U\xFB\xFC\xFD\xFE",
435     "Y\xBA",
436     "a\x42\x43\x44\x45\x46\x47",
437     "c\x48",
438     "e\x51\x52\x53\x54",
439     "i\x55\x56\x57\x58",
440     "n\x49",
441     "o\xCB\xCC\xCD\xCE\xCF\x70",
442     "u\xDB\xDC\xDD\xDE",
443     "y\x8D\xDF",
444 };
445 #endif
446 
447 /*
448  * Check for a collating element "[.a.]".  "pp" points to the '['.
449  * Returns a character. Zero means that no item was recognized.  Otherwise
450  * "pp" is advanced to after the item.
451  * Currently only single characters are recognized!
452  */
453     static int
get_coll_element(char_u ** pp)454 get_coll_element(char_u **pp)
455 {
456     int		c;
457     int		l = 1;
458     char_u	*p = *pp;
459 
460     if (p[0] != NUL && p[1] == '.' && p[2] != NUL)
461     {
462 	if (has_mbyte)
463 	    l = (*mb_ptr2len)(p + 2);
464 	if (p[l + 2] == '.' && p[l + 3] == ']')
465 	{
466 	    if (has_mbyte)
467 		c = mb_ptr2char(p + 2);
468 	    else
469 		c = p[2];
470 	    *pp += l + 4;
471 	    return c;
472 	}
473     }
474     return 0;
475 }
476 
477 static int reg_cpo_lit; // 'cpoptions' contains 'l' flag
478 static int reg_cpo_bsl; // 'cpoptions' contains '\' flag
479 
480     static void
get_cpo_flags(void)481 get_cpo_flags(void)
482 {
483     reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
484     reg_cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
485 }
486 
487 /*
488  * Skip over a "[]" range.
489  * "p" must point to the character after the '['.
490  * The returned pointer is on the matching ']', or the terminating NUL.
491  */
492     static char_u *
skip_anyof(char_u * p)493 skip_anyof(char_u *p)
494 {
495     int		l;
496 
497     if (*p == '^')	// Complement of range.
498 	++p;
499     if (*p == ']' || *p == '-')
500 	++p;
501     while (*p != NUL && *p != ']')
502     {
503 	if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1)
504 	    p += l;
505 	else
506 	    if (*p == '-')
507 	    {
508 		++p;
509 		if (*p != ']' && *p != NUL)
510 		    MB_PTR_ADV(p);
511 	    }
512 	else if (*p == '\\'
513 		&& !reg_cpo_bsl
514 		&& (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
515 		    || (!reg_cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
516 	    p += 2;
517 	else if (*p == '[')
518 	{
519 	    if (get_char_class(&p) == CLASS_NONE
520 		    && get_equi_class(&p) == 0
521 		    && get_coll_element(&p) == 0
522 		    && *p != NUL)
523 		++p; // it is not a class name and not NUL
524 	}
525 	else
526 	    ++p;
527     }
528 
529     return p;
530 }
531 
532 /*
533  * Skip past regular expression.
534  * Stop at end of "startp" or where "delim" is found ('/', '?', etc).
535  * Take care of characters with a backslash in front of it.
536  * Skip strings inside [ and ].
537  */
538     char_u *
skip_regexp(char_u * startp,int delim,int magic)539 skip_regexp(
540     char_u	*startp,
541     int		delim,
542     int		magic)
543 {
544     return skip_regexp_ex(startp, delim, magic, NULL, NULL, NULL);
545 }
546 
547 /*
548  * Call skip_regexp() and when the delimiter does not match give an error and
549  * return NULL.
550  */
551     char_u *
skip_regexp_err(char_u * startp,int delim,int magic)552 skip_regexp_err(
553     char_u	*startp,
554     int		delim,
555     int		magic)
556 {
557     char_u *p = skip_regexp(startp, delim, magic);
558 
559     if (*p != delim)
560     {
561 	semsg(_("E654: missing delimiter after search pattern: %s"), startp);
562 	return NULL;
563     }
564     return p;
565 }
566 
567 /*
568  * skip_regexp() with extra arguments:
569  * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
570  * expression and change "\?" to "?".  If "*newp" is not NULL the expression
571  * is changed in-place.
572  * If a "\?" is changed to "?" then "dropped" is incremented, unless NULL.
573  * If "magic_val" is not NULL, returns the effective magicness of the pattern
574  */
575     char_u *
skip_regexp_ex(char_u * startp,int dirc,int magic,char_u ** newp,int * dropped,magic_T * magic_val)576 skip_regexp_ex(
577     char_u	*startp,
578     int		dirc,
579     int		magic,
580     char_u	**newp,
581     int		*dropped,
582     magic_T	*magic_val)
583 {
584     magic_T	mymagic;
585     char_u	*p = startp;
586 
587     if (magic)
588 	mymagic = MAGIC_ON;
589     else
590 	mymagic = MAGIC_OFF;
591     get_cpo_flags();
592 
593     for (; p[0] != NUL; MB_PTR_ADV(p))
594     {
595 	if (p[0] == dirc)	// found end of regexp
596 	    break;
597 	if ((p[0] == '[' && mymagic >= MAGIC_ON)
598 		|| (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
599 	{
600 	    p = skip_anyof(p + 1);
601 	    if (p[0] == NUL)
602 		break;
603 	}
604 	else if (p[0] == '\\' && p[1] != NUL)
605 	{
606 	    if (dirc == '?' && newp != NULL && p[1] == '?')
607 	    {
608 		// change "\?" to "?", make a copy first.
609 		if (*newp == NULL)
610 		{
611 		    *newp = vim_strsave(startp);
612 		    if (*newp != NULL)
613 			p = *newp + (p - startp);
614 		}
615 		if (dropped != NULL)
616 		    ++*dropped;
617 		if (*newp != NULL)
618 		    STRMOVE(p, p + 1);
619 		else
620 		    ++p;
621 	    }
622 	    else
623 		++p;    // skip next character
624 	    if (*p == 'v')
625 		mymagic = MAGIC_ALL;
626 	    else if (*p == 'V')
627 		mymagic = MAGIC_NONE;
628 	}
629     }
630     if (magic_val != NULL)
631 	*magic_val = mymagic;
632     return p;
633 }
634 
635 /*
636  * Functions for getting characters from the regexp input.
637  */
638 static int	prevchr_len;	// byte length of previous char
639 static int	at_start;	// True when on the first character
640 static int	prev_at_start;  // True when on the second character
641 
642 /*
643  * Start parsing at "str".
644  */
645     static void
initchr(char_u * str)646 initchr(char_u *str)
647 {
648     regparse = str;
649     prevchr_len = 0;
650     curchr = prevprevchr = prevchr = nextchr = -1;
651     at_start = TRUE;
652     prev_at_start = FALSE;
653 }
654 
655 /*
656  * Save the current parse state, so that it can be restored and parsing
657  * starts in the same state again.
658  */
659     static void
save_parse_state(parse_state_T * ps)660 save_parse_state(parse_state_T *ps)
661 {
662     ps->regparse = regparse;
663     ps->prevchr_len = prevchr_len;
664     ps->curchr = curchr;
665     ps->prevchr = prevchr;
666     ps->prevprevchr = prevprevchr;
667     ps->nextchr = nextchr;
668     ps->at_start = at_start;
669     ps->prev_at_start = prev_at_start;
670     ps->regnpar = regnpar;
671 }
672 
673 /*
674  * Restore a previously saved parse state.
675  */
676     static void
restore_parse_state(parse_state_T * ps)677 restore_parse_state(parse_state_T *ps)
678 {
679     regparse = ps->regparse;
680     prevchr_len = ps->prevchr_len;
681     curchr = ps->curchr;
682     prevchr = ps->prevchr;
683     prevprevchr = ps->prevprevchr;
684     nextchr = ps->nextchr;
685     at_start = ps->at_start;
686     prev_at_start = ps->prev_at_start;
687     regnpar = ps->regnpar;
688 }
689 
690 
691 /*
692  * Get the next character without advancing.
693  */
694     static int
peekchr(void)695 peekchr(void)
696 {
697     static int	after_slash = FALSE;
698 
699     if (curchr == -1)
700     {
701 	switch (curchr = regparse[0])
702 	{
703 	case '.':
704 	case '[':
705 	case '~':
706 	    // magic when 'magic' is on
707 	    if (reg_magic >= MAGIC_ON)
708 		curchr = Magic(curchr);
709 	    break;
710 	case '(':
711 	case ')':
712 	case '{':
713 	case '%':
714 	case '+':
715 	case '=':
716 	case '?':
717 	case '@':
718 	case '!':
719 	case '&':
720 	case '|':
721 	case '<':
722 	case '>':
723 	case '#':	// future ext.
724 	case '"':	// future ext.
725 	case '\'':	// future ext.
726 	case ',':	// future ext.
727 	case '-':	// future ext.
728 	case ':':	// future ext.
729 	case ';':	// future ext.
730 	case '`':	// future ext.
731 	case '/':	// Can't be used in / command
732 	    // magic only after "\v"
733 	    if (reg_magic == MAGIC_ALL)
734 		curchr = Magic(curchr);
735 	    break;
736 	case '*':
737 	    // * is not magic as the very first character, eg "?*ptr", when
738 	    // after '^', eg "/^*ptr" and when after "\(", "\|", "\&".  But
739 	    // "\(\*" is not magic, thus must be magic if "after_slash"
740 	    if (reg_magic >= MAGIC_ON
741 		    && !at_start
742 		    && !(prev_at_start && prevchr == Magic('^'))
743 		    && (after_slash
744 			|| (prevchr != Magic('(')
745 			    && prevchr != Magic('&')
746 			    && prevchr != Magic('|'))))
747 		curchr = Magic('*');
748 	    break;
749 	case '^':
750 	    // '^' is only magic as the very first character and if it's after
751 	    // "\(", "\|", "\&' or "\n"
752 	    if (reg_magic >= MAGIC_OFF
753 		    && (at_start
754 			|| reg_magic == MAGIC_ALL
755 			|| prevchr == Magic('(')
756 			|| prevchr == Magic('|')
757 			|| prevchr == Magic('&')
758 			|| prevchr == Magic('n')
759 			|| (no_Magic(prevchr) == '('
760 			    && prevprevchr == Magic('%'))))
761 	    {
762 		curchr = Magic('^');
763 		at_start = TRUE;
764 		prev_at_start = FALSE;
765 	    }
766 	    break;
767 	case '$':
768 	    // '$' is only magic as the very last char and if it's in front of
769 	    // either "\|", "\)", "\&", or "\n"
770 	    if (reg_magic >= MAGIC_OFF)
771 	    {
772 		char_u *p = regparse + 1;
773 		int is_magic_all = (reg_magic == MAGIC_ALL);
774 
775 		// ignore \c \C \m \M \v \V and \Z after '$'
776 		while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
777 				|| p[1] == 'm' || p[1] == 'M'
778 				|| p[1] == 'v' || p[1] == 'V' || p[1] == 'Z'))
779 		{
780 		    if (p[1] == 'v')
781 			is_magic_all = TRUE;
782 		    else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V')
783 			is_magic_all = FALSE;
784 		    p += 2;
785 		}
786 		if (p[0] == NUL
787 			|| (p[0] == '\\'
788 			    && (p[1] == '|' || p[1] == '&' || p[1] == ')'
789 				|| p[1] == 'n'))
790 			|| (is_magic_all
791 			       && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
792 			|| reg_magic == MAGIC_ALL)
793 		    curchr = Magic('$');
794 	    }
795 	    break;
796 	case '\\':
797 	    {
798 		int c = regparse[1];
799 
800 		if (c == NUL)
801 		    curchr = '\\';	// trailing '\'
802 		else if (
803 #ifdef EBCDIC
804 			vim_strchr(META, c)
805 #else
806 			c <= '~' && META_flags[c]
807 #endif
808 			)
809 		{
810 		    /*
811 		     * META contains everything that may be magic sometimes,
812 		     * except ^ and $ ("\^" and "\$" are only magic after
813 		     * "\V").  We now fetch the next character and toggle its
814 		     * magicness.  Therefore, \ is so meta-magic that it is
815 		     * not in META.
816 		     */
817 		    curchr = -1;
818 		    prev_at_start = at_start;
819 		    at_start = FALSE;	// be able to say "/\*ptr"
820 		    ++regparse;
821 		    ++after_slash;
822 		    peekchr();
823 		    --regparse;
824 		    --after_slash;
825 		    curchr = toggle_Magic(curchr);
826 		}
827 		else if (vim_strchr(REGEXP_ABBR, c))
828 		{
829 		    /*
830 		     * Handle abbreviations, like "\t" for TAB -- webb
831 		     */
832 		    curchr = backslash_trans(c);
833 		}
834 		else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
835 		    curchr = toggle_Magic(c);
836 		else
837 		{
838 		    /*
839 		     * Next character can never be (made) magic?
840 		     * Then backslashing it won't do anything.
841 		     */
842 		    if (has_mbyte)
843 			curchr = (*mb_ptr2char)(regparse + 1);
844 		    else
845 			curchr = c;
846 		}
847 		break;
848 	    }
849 
850 	default:
851 	    if (has_mbyte)
852 		curchr = (*mb_ptr2char)(regparse);
853 	}
854     }
855 
856     return curchr;
857 }
858 
859 /*
860  * Eat one lexed character.  Do this in a way that we can undo it.
861  */
862     static void
skipchr(void)863 skipchr(void)
864 {
865     // peekchr() eats a backslash, do the same here
866     if (*regparse == '\\')
867 	prevchr_len = 1;
868     else
869 	prevchr_len = 0;
870     if (regparse[prevchr_len] != NUL)
871     {
872 	if (enc_utf8)
873 	    // exclude composing chars that mb_ptr2len does include
874 	    prevchr_len += utf_ptr2len(regparse + prevchr_len);
875 	else if (has_mbyte)
876 	    prevchr_len += (*mb_ptr2len)(regparse + prevchr_len);
877 	else
878 	    ++prevchr_len;
879     }
880     regparse += prevchr_len;
881     prev_at_start = at_start;
882     at_start = FALSE;
883     prevprevchr = prevchr;
884     prevchr = curchr;
885     curchr = nextchr;	    // use previously unget char, or -1
886     nextchr = -1;
887 }
888 
889 /*
890  * Skip a character while keeping the value of prev_at_start for at_start.
891  * prevchr and prevprevchr are also kept.
892  */
893     static void
skipchr_keepstart(void)894 skipchr_keepstart(void)
895 {
896     int as = prev_at_start;
897     int pr = prevchr;
898     int prpr = prevprevchr;
899 
900     skipchr();
901     at_start = as;
902     prevchr = pr;
903     prevprevchr = prpr;
904 }
905 
/*
 * Get the next character from the pattern and advance past it.  We know
 * about magic and such, so therefore we need a lexical analyzer: the
 * character is lexed by peekchr() and then eaten by skipchr().
 */
    static int
getchr(void)
{
    int	    lexed = peekchr();

    skipchr();
    return lexed;
}
918 
919 /*
920  * put character back.  Works only once!
921  */
922     static void
ungetchr(void)923 ungetchr(void)
924 {
925     nextchr = curchr;
926     curchr = prevchr;
927     prevchr = prevprevchr;
928     at_start = prev_at_start;
929     prev_at_start = FALSE;
930 
931     // Backup regparse, so that it's at the same position as before the
932     // getchr().
933     regparse -= prevchr_len;
934 }
935 
936 /*
937  * Get and return the value of the hex string at the current position.
938  * Return -1 if there is no valid hex number.
939  * The position is updated:
940  *     blahblah\%x20asdf
941  *	   before-^ ^-after
942  * The parameter controls the maximum number of input characters. This will be
943  * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
944  */
945     static long
gethexchrs(int maxinputlen)946 gethexchrs(int maxinputlen)
947 {
948     long_u	nr = 0;
949     int		c;
950     int		i;
951 
952     for (i = 0; i < maxinputlen; ++i)
953     {
954 	c = regparse[0];
955 	if (!vim_isxdigit(c))
956 	    break;
957 	nr <<= 4;
958 	nr |= hex2nr(c);
959 	++regparse;
960     }
961 
962     if (i == 0)
963 	return -1;
964     return (long)nr;
965 }
966 
967 /*
968  * Get and return the value of the decimal string immediately after the
969  * current position. Return -1 for invalid.  Consumes all digits.
970  */
971     static long
getdecchrs(void)972 getdecchrs(void)
973 {
974     long_u	nr = 0;
975     int		c;
976     int		i;
977 
978     for (i = 0; ; ++i)
979     {
980 	c = regparse[0];
981 	if (c < '0' || c > '9')
982 	    break;
983 	nr *= 10;
984 	nr += c - '0';
985 	++regparse;
986 	curchr = -1; // no longer valid
987     }
988 
989     if (i == 0)
990 	return -1;
991     return (long)nr;
992 }
993 
994 /*
995  * get and return the value of the octal string immediately after the current
996  * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
997  * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
998  * treat 8 or 9 as recognised characters. Position is updated:
999  *     blahblah\%o210asdf
1000  *	   before-^  ^-after
1001  */
1002     static long
getoctchrs(void)1003 getoctchrs(void)
1004 {
1005     long_u	nr = 0;
1006     int		c;
1007     int		i;
1008 
1009     for (i = 0; i < 3 && nr < 040; ++i)
1010     {
1011 	c = regparse[0];
1012 	if (c < '0' || c > '7')
1013 	    break;
1014 	nr <<= 3;
1015 	nr |= hex2nr(c);
1016 	++regparse;
1017     }
1018 
1019     if (i == 0)
1020 	return -1;
1021     return (long)nr;
1022 }
1023 
1024 /*
1025  * read_limits - Read two integers to be taken as a minimum and maximum.
1026  * If the first character is '-', then the range is reversed.
1027  * Should end with 'end'.  If minval is missing, zero is default, if maxval is
1028  * missing, a very big number is the default.
1029  */
1030     static int
read_limits(long * minval,long * maxval)1031 read_limits(long *minval, long *maxval)
1032 {
1033     int		reverse = FALSE;
1034     char_u	*first_char;
1035     long	tmp;
1036 
1037     if (*regparse == '-')
1038     {
1039 	// Starts with '-', so reverse the range later
1040 	regparse++;
1041 	reverse = TRUE;
1042     }
1043     first_char = regparse;
1044     *minval = getdigits(&regparse);
1045     if (*regparse == ',')	    // There is a comma
1046     {
1047 	if (vim_isdigit(*++regparse))
1048 	    *maxval = getdigits(&regparse);
1049 	else
1050 	    *maxval = MAX_LIMIT;
1051     }
1052     else if (VIM_ISDIGIT(*first_char))
1053 	*maxval = *minval;	    // It was \{n} or \{-n}
1054     else
1055 	*maxval = MAX_LIMIT;	    // It was \{} or \{-}
1056     if (*regparse == '\\')
1057 	regparse++;	// Allow either \{...} or \{...\}
1058     if (*regparse != '}')
1059 	EMSG2_RET_FAIL(_("E554: Syntax error in %s{...}"),
1060 						       reg_magic == MAGIC_ALL);
1061 
1062     /*
1063      * Reverse the range if there was a '-', or make sure it is in the right
1064      * order otherwise.
1065      */
1066     if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
1067     {
1068 	tmp = *minval;
1069 	*minval = *maxval;
1070 	*maxval = tmp;
1071     }
1072     skipchr();		// let's be friends with the lexer again
1073     return OK;
1074 }
1075 
1076 /*
1077  * vim_regexec and friends
1078  */
1079 
1080 /*
1081  * Global work variables for vim_regexec().
1082  */
1083 
1084 static void	cleanup_subexpr(void);
1085 #ifdef FEAT_SYN_HL
1086 static void	cleanup_zsubexpr(void);
1087 #endif
1088 static void	reg_nextline(void);
1089 static int	match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen);
1090 
1091 /*
1092  * Sometimes need to save a copy of a line.  Since alloc()/free() is very
1093  * slow, we keep one allocated piece of memory and only re-allocate it when
1094  * it's too small.  It's freed in bt_regexec_both() when finished.
1095  */
1096 static char_u	*reg_tofree = NULL;
1097 static unsigned	reg_tofreelen;
1098 
1099 /*
1100  * Structure used to store the execution state of the regex engine.
1101  * Which ones are set depends on whether a single-line or multi-line match is
1102  * done:
1103  *			single-line		multi-line
1104  * reg_match		&regmatch_T		NULL
1105  * reg_mmatch		NULL			&regmmatch_T
1106  * reg_startp		reg_match->startp	<invalid>
1107  * reg_endp		reg_match->endp		<invalid>
1108  * reg_startpos		<invalid>		reg_mmatch->startpos
1109  * reg_endpos		<invalid>		reg_mmatch->endpos
1110  * reg_win		NULL			window in which to search
1111  * reg_buf		curbuf			buffer in which to search
1112  * reg_firstlnum	<invalid>		first line in which to search
1113  * reg_maxline		0			last line nr
1114  * reg_line_lbr		FALSE or TRUE		FALSE
1115  */
1116 typedef struct {
1117     regmatch_T		*reg_match;
1118     regmmatch_T		*reg_mmatch;
1119     char_u		**reg_startp;
1120     char_u		**reg_endp;
1121     lpos_T		*reg_startpos;
1122     lpos_T		*reg_endpos;
1123     win_T		*reg_win;
1124     buf_T		*reg_buf;
1125     linenr_T		reg_firstlnum;
1126     linenr_T		reg_maxline;
1127     int			reg_line_lbr;	// "\n" in string is line break
1128 
1129     // The current match-position is stord in these variables:
1130     linenr_T	lnum;		// line number, relative to first line
1131     char_u	*line;		// start of current line
1132     char_u	*input;		// current input, points into "line"
1133 
1134     int	need_clear_subexpr;	// subexpressions still need to be cleared
1135 #ifdef FEAT_SYN_HL
1136     int	need_clear_zsubexpr;	// extmatch subexpressions still need to be
1137 				// cleared
1138 #endif
1139 
1140     // Internal copy of 'ignorecase'.  It is set at each call to vim_regexec().
1141     // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
1142     // contains '\c' or '\C' the value is overruled.
1143     int			reg_ic;
1144 
1145     // Similar to "reg_ic", but only for 'combining' characters.  Set with \Z
1146     // flag in the regexp.  Defaults to false, always.
1147     int			reg_icombine;
1148 
1149     // Copy of "rmm_maxcol": maximum column to search for a match.  Zero when
1150     // there is no maximum.
1151     colnr_T		reg_maxcol;
1152 
1153     // State for the NFA engine regexec.
1154     int nfa_has_zend;	    // NFA regexp \ze operator encountered.
1155     int nfa_has_backref;    // NFA regexp \1 .. \9 encountered.
1156     int nfa_nsubexpr;	    // Number of sub expressions actually being used
1157 			    // during execution. 1 if only the whole match
1158 			    // (subexpr 0) is used.
1159     // listid is global, so that it increases on recursive calls to
1160     // nfa_regmatch(), which means we don't have to clear the lastlist field of
1161     // all the states.
1162     int nfa_listid;
1163     int nfa_alt_listid;
1164 
1165 #ifdef FEAT_SYN_HL
1166     int nfa_has_zsubexpr;   // NFA regexp has \z( ), set zsubexpr.
1167 #endif
1168 } regexec_T;
1169 
1170 static regexec_T	rex;
1171 static int		rex_in_use = FALSE;
1172 
1173 /*
1174  * Return TRUE if character 'c' is included in 'iskeyword' option for
1175  * "reg_buf" buffer.
1176  */
1177     static int
reg_iswordc(int c)1178 reg_iswordc(int c)
1179 {
1180     return vim_iswordc_buf(c, rex.reg_buf);
1181 }
1182 
1183 /*
1184  * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
1185  */
1186     static char_u *
reg_getline(linenr_T lnum)1187 reg_getline(linenr_T lnum)
1188 {
1189     // when looking behind for a match/no-match lnum is negative.  But we
1190     // can't go before line 1
1191     if (rex.reg_firstlnum + lnum < 1)
1192 	return NULL;
1193     if (lnum > rex.reg_maxline)
1194 	// Must have matched the "\n" in the last line.
1195 	return (char_u *)"";
1196     return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, FALSE);
1197 }
1198 
#ifdef FEAT_SYN_HL
// Workspace for the \z(...\) matches.
static char_u	*reg_startzp[NSUBEXP];	// start of each match, as a pointer
static char_u	*reg_endzp[NSUBEXP];	// end of each match, as a pointer
static lpos_T	reg_startzpos[NSUBEXP];	// start of each match, as a position
static lpos_T	reg_endzpos[NSUBEXP];	// end of each match, as a position
#endif

// TRUE when a multi-line regexp is being used: "reg_match" is not set then.
#define REG_MULTI	(rex.reg_match == NULL)
1208 
1209 #ifdef FEAT_SYN_HL
1210 /*
1211  * Create a new extmatch and mark it as referenced once.
1212  */
1213     static reg_extmatch_T *
make_extmatch(void)1214 make_extmatch(void)
1215 {
1216     reg_extmatch_T	*em;
1217 
1218     em = ALLOC_CLEAR_ONE(reg_extmatch_T);
1219     if (em != NULL)
1220 	em->refcnt = 1;
1221     return em;
1222 }
1223 
1224 /*
1225  * Add a reference to an extmatch.
1226  */
1227     reg_extmatch_T *
ref_extmatch(reg_extmatch_T * em)1228 ref_extmatch(reg_extmatch_T *em)
1229 {
1230     if (em != NULL)
1231 	em->refcnt++;
1232     return em;
1233 }
1234 
1235 /*
1236  * Remove a reference to an extmatch.  If there are no references left, free
1237  * the info.
1238  */
1239     void
unref_extmatch(reg_extmatch_T * em)1240 unref_extmatch(reg_extmatch_T *em)
1241 {
1242     int i;
1243 
1244     if (em != NULL && --em->refcnt <= 0)
1245     {
1246 	for (i = 0; i < NSUBEXP; ++i)
1247 	    vim_free(em->matches[i]);
1248 	vim_free(em);
1249     }
1250 }
1251 #endif
1252 
1253 /*
1254  * Get class of previous character.
1255  */
1256     static int
reg_prev_class(void)1257 reg_prev_class(void)
1258 {
1259     if (rex.input > rex.line)
1260 	return mb_get_class_buf(rex.input - 1
1261 		       - (*mb_head_off)(rex.line, rex.input - 1), rex.reg_buf);
1262     return -1;
1263 }
1264 
1265 /*
1266  * Return TRUE if the current rex.input position matches the Visual area.
1267  */
1268     static int
reg_match_visual(void)1269 reg_match_visual(void)
1270 {
1271     pos_T	top, bot;
1272     linenr_T    lnum;
1273     colnr_T	col;
1274     win_T	*wp = rex.reg_win == NULL ? curwin : rex.reg_win;
1275     int		mode;
1276     colnr_T	start, end;
1277     colnr_T	start2, end2;
1278     colnr_T	cols;
1279     colnr_T	curswant;
1280 
1281     // Check if the buffer is the current buffer.
1282     if (rex.reg_buf != curbuf || VIsual.lnum == 0)
1283 	return FALSE;
1284 
1285     if (VIsual_active)
1286     {
1287 	if (LT_POS(VIsual, wp->w_cursor))
1288 	{
1289 	    top = VIsual;
1290 	    bot = wp->w_cursor;
1291 	}
1292 	else
1293 	{
1294 	    top = wp->w_cursor;
1295 	    bot = VIsual;
1296 	}
1297 	mode = VIsual_mode;
1298 	curswant = wp->w_curswant;
1299     }
1300     else
1301     {
1302 	if (LT_POS(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end))
1303 	{
1304 	    top = curbuf->b_visual.vi_start;
1305 	    bot = curbuf->b_visual.vi_end;
1306 	}
1307 	else
1308 	{
1309 	    top = curbuf->b_visual.vi_end;
1310 	    bot = curbuf->b_visual.vi_start;
1311 	}
1312 	mode = curbuf->b_visual.vi_mode;
1313 	curswant = curbuf->b_visual.vi_curswant;
1314     }
1315     lnum = rex.lnum + rex.reg_firstlnum;
1316     if (lnum < top.lnum || lnum > bot.lnum)
1317 	return FALSE;
1318 
1319     if (mode == 'v')
1320     {
1321 	col = (colnr_T)(rex.input - rex.line);
1322 	if ((lnum == top.lnum && col < top.col)
1323 		|| (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e')))
1324 	    return FALSE;
1325     }
1326     else if (mode == Ctrl_V)
1327     {
1328 	getvvcol(wp, &top, &start, NULL, &end);
1329 	getvvcol(wp, &bot, &start2, NULL, &end2);
1330 	if (start2 < start)
1331 	    start = start2;
1332 	if (end2 > end)
1333 	    end = end2;
1334 	if (top.col == MAXCOL || bot.col == MAXCOL || curswant == MAXCOL)
1335 	    end = MAXCOL;
1336 	cols = win_linetabsize(wp, rex.line, (colnr_T)(rex.input - rex.line));
1337 	if (cols < start || cols > end - (*p_sel == 'e'))
1338 	    return FALSE;
1339     }
1340     return TRUE;
1341 }
1342 
1343 /*
1344  * Check the regexp program for its magic number.
1345  * Return TRUE if it's wrong.
1346  */
1347     static int
prog_magic_wrong(void)1348 prog_magic_wrong(void)
1349 {
1350     regprog_T	*prog;
1351 
1352     prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
1353     if (prog->engine == &nfa_regengine)
1354 	// For NFA matcher we don't check the magic
1355 	return FALSE;
1356 
1357     if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
1358     {
1359 	emsg(_(e_corrupted_regexp_program));
1360 	return TRUE;
1361     }
1362     return FALSE;
1363 }
1364 
1365 /*
1366  * Cleanup the subexpressions, if this wasn't done yet.
1367  * This construction is used to clear the subexpressions only when they are
1368  * used (to increase speed).
1369  */
1370     static void
cleanup_subexpr(void)1371 cleanup_subexpr(void)
1372 {
1373     if (rex.need_clear_subexpr)
1374     {
1375 	if (REG_MULTI)
1376 	{
1377 	    // Use 0xff to set lnum to -1
1378 	    vim_memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1379 	    vim_memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1380 	}
1381 	else
1382 	{
1383 	    vim_memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
1384 	    vim_memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
1385 	}
1386 	rex.need_clear_subexpr = FALSE;
1387     }
1388 }
1389 
1390 #ifdef FEAT_SYN_HL
1391     static void
cleanup_zsubexpr(void)1392 cleanup_zsubexpr(void)
1393 {
1394     if (rex.need_clear_zsubexpr)
1395     {
1396 	if (REG_MULTI)
1397 	{
1398 	    // Use 0xff to set lnum to -1
1399 	    vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1400 	    vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1401 	}
1402 	else
1403 	{
1404 	    vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
1405 	    vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
1406 	}
1407 	rex.need_clear_zsubexpr = FALSE;
1408     }
1409 }
1410 #endif
1411 
1412 /*
1413  * Advance rex.lnum, rex.line and rex.input to the next line.
1414  */
1415     static void
reg_nextline(void)1416 reg_nextline(void)
1417 {
1418     rex.line = reg_getline(++rex.lnum);
1419     rex.input = rex.line;
1420     fast_breakcheck();
1421 }
1422 
1423 /*
1424  * Check whether a backreference matches.
1425  * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
1426  * If "bytelen" is not NULL, it is set to the byte length of the match in the
1427  * last line.
1428  */
1429     static int
match_with_backref(linenr_T start_lnum,colnr_T start_col,linenr_T end_lnum,colnr_T end_col,int * bytelen)1430 match_with_backref(
1431     linenr_T start_lnum,
1432     colnr_T  start_col,
1433     linenr_T end_lnum,
1434     colnr_T  end_col,
1435     int	     *bytelen)
1436 {
1437     linenr_T	clnum = start_lnum;
1438     colnr_T	ccol = start_col;
1439     int		len;
1440     char_u	*p;
1441 
1442     if (bytelen != NULL)
1443 	*bytelen = 0;
1444     for (;;)
1445     {
1446 	// Since getting one line may invalidate the other, need to make copy.
1447 	// Slow!
1448 	if (rex.line != reg_tofree)
1449 	{
1450 	    len = (int)STRLEN(rex.line);
1451 	    if (reg_tofree == NULL || len >= (int)reg_tofreelen)
1452 	    {
1453 		len += 50;	// get some extra
1454 		vim_free(reg_tofree);
1455 		reg_tofree = alloc(len);
1456 		if (reg_tofree == NULL)
1457 		    return RA_FAIL; // out of memory!
1458 		reg_tofreelen = len;
1459 	    }
1460 	    STRCPY(reg_tofree, rex.line);
1461 	    rex.input = reg_tofree + (rex.input - rex.line);
1462 	    rex.line = reg_tofree;
1463 	}
1464 
1465 	// Get the line to compare with.
1466 	p = reg_getline(clnum);
1467 	if (clnum == end_lnum)
1468 	    len = end_col - ccol;
1469 	else
1470 	    len = (int)STRLEN(p + ccol);
1471 
1472 	if (cstrncmp(p + ccol, rex.input, &len) != 0)
1473 	    return RA_NOMATCH;  // doesn't match
1474 	if (bytelen != NULL)
1475 	    *bytelen += len;
1476 	if (clnum == end_lnum)
1477 	    break;		// match and at end!
1478 	if (rex.lnum >= rex.reg_maxline)
1479 	    return RA_NOMATCH;  // text too short
1480 
1481 	// Advance to next line.
1482 	reg_nextline();
1483 	if (bytelen != NULL)
1484 	    *bytelen = 0;
1485 	++clnum;
1486 	ccol = 0;
1487 	if (got_int)
1488 	    return RA_FAIL;
1489     }
1490 
1491     // found a match!  Note that rex.line may now point to a copy of the line,
1492     // that should not matter.
1493     return RA_MATCH;
1494 }
1495 
1496 /*
1497  * Used in a place where no * or \+ can follow.
1498  */
1499     static int
re_mult_next(char * what)1500 re_mult_next(char *what)
1501 {
1502     if (re_multi_type(peekchr()) == MULTI_MULT)
1503     {
1504        semsg(_("E888: (NFA regexp) cannot repeat %s"), what);
1505        rc_did_emsg = TRUE;
1506        return FAIL;
1507     }
1508     return OK;
1509 }
1510 
// One entry of the decomposition table: up to three characters, unused ones
// are zero.
typedef struct
{
    int a, b, c;
} decomp_T;


// Decompositions for the characters 0xfb20 - 0xfb4f, indexed by the
// character minus 0xfb20.  An UNUSED entry holds the character itself.
static decomp_T decomp_table[0xfb4f - 0xfb20 + 1] =
{
    {0x5e2, 0, 0},		// 0xfb20	alt ayin
    {0x5d0, 0, 0},		// 0xfb21	alt alef
    {0x5d3, 0, 0},		// 0xfb22	alt dalet
    {0x5d4, 0, 0},		// 0xfb23	alt he
    {0x5db, 0, 0},		// 0xfb24	alt kaf
    {0x5dc, 0, 0},		// 0xfb25	alt lamed
    {0x5dd, 0, 0},		// 0xfb26	alt mem-sofit
    {0x5e8, 0, 0},		// 0xfb27	alt resh
    {0x5ea, 0, 0},		// 0xfb28	alt tav
    {'+', 0, 0},		// 0xfb29	alt plus
    {0x5e9, 0x5c1, 0},		// 0xfb2a	shin+shin-dot
    {0x5e9, 0x5c2, 0},		// 0xfb2b	shin+sin-dot
    {0x5e9, 0x5c1, 0x5bc},	// 0xfb2c	shin+shin-dot+dagesh
    {0x5e9, 0x5c2, 0x5bc},	// 0xfb2d	shin+sin-dot+dagesh
    {0x5d0, 0x5b7, 0},		// 0xfb2e	alef+patah
    {0x5d0, 0x5b8, 0},		// 0xfb2f	alef+qamats
    {0x5d0, 0x5b4, 0},		// 0xfb30	alef+hiriq
    {0x5d1, 0x5bc, 0},		// 0xfb31	bet+dagesh
    {0x5d2, 0x5bc, 0},		// 0xfb32	gimel+dagesh
    {0x5d3, 0x5bc, 0},		// 0xfb33	dalet+dagesh
    {0x5d4, 0x5bc, 0},		// 0xfb34	he+dagesh
    {0x5d5, 0x5bc, 0},		// 0xfb35	vav+dagesh
    {0x5d6, 0x5bc, 0},		// 0xfb36	zayin+dagesh
    {0xfb37, 0, 0},		// 0xfb37 -- UNUSED
    {0x5d8, 0x5bc, 0},		// 0xfb38	tet+dagesh
    {0x5d9, 0x5bc, 0},		// 0xfb39	yud+dagesh
    {0x5da, 0x5bc, 0},		// 0xfb3a	kaf sofit+dagesh
    {0x5db, 0x5bc, 0},		// 0xfb3b	kaf+dagesh
    {0x5dc, 0x5bc, 0},		// 0xfb3c	lamed+dagesh
    {0xfb3d, 0, 0},		// 0xfb3d -- UNUSED
    {0x5de, 0x5bc, 0},		// 0xfb3e	mem+dagesh
    {0xfb3f, 0, 0},		// 0xfb3f -- UNUSED
    {0x5e0, 0x5bc, 0},		// 0xfb40	nun+dagesh
    {0x5e1, 0x5bc, 0},		// 0xfb41	samech+dagesh
    {0xfb42, 0, 0},		// 0xfb42 -- UNUSED
    {0x5e3, 0x5bc, 0},		// 0xfb43	pe sofit+dagesh
    {0x5e4, 0x5bc, 0},		// 0xfb44	pe+dagesh
    {0xfb45, 0, 0},		// 0xfb45 -- UNUSED
    {0x5e6, 0x5bc, 0},		// 0xfb46	tsadi+dagesh
    {0x5e7, 0x5bc, 0},		// 0xfb47	qof+dagesh
    {0x5e8, 0x5bc, 0},		// 0xfb48	resh+dagesh
    {0x5e9, 0x5bc, 0},		// 0xfb49	shin+dagesh
    {0x5ea, 0x5bc, 0},		// 0xfb4a	tav+dagesh
    {0x5d5, 0x5b9, 0},		// 0xfb4b	vav+holam
    {0x5d1, 0x5bf, 0},		// 0xfb4c	bet+rafe
    {0x5db, 0x5bf, 0},		// 0xfb4d	kaf+rafe
    {0x5e4, 0x5bf, 0},		// 0xfb4e	pe+rafe
    {0x5d0, 0x5dc, 0}		// 0xfb4f	alef-lamed
};

/*
 * Decompose character "c" into up to three characters, which are stored in
 * "*c1", "*c2" and "*c3".
 * A character outside of the range 0xfb20 - 0xfb4f is not decomposed: "*c1"
 * is set to "c" and the other two to zero.
 */
    static void
mb_decompose(int c, int *c1, int *c2, int *c3)
{
    decomp_T	*entry;

    if (c < 0xfb20 || c > 0xfb4f)
    {
	*c1 = c;
	*c3 = 0;
	*c2 = 0;
	return;
    }

    entry = &decomp_table[c - 0xfb20];
    *c1 = entry->a;
    *c2 = entry->b;
    *c3 = entry->c;
}
1588 
1589 /*
1590  * Compare two strings, ignore case if rex.reg_ic set.
1591  * Return 0 if strings match, non-zero otherwise.
1592  * Correct the length "*n" when composing characters are ignored.
1593  */
1594     static int
cstrncmp(char_u * s1,char_u * s2,int * n)1595 cstrncmp(char_u *s1, char_u *s2, int *n)
1596 {
1597     int		result;
1598 
1599     if (!rex.reg_ic)
1600 	result = STRNCMP(s1, s2, *n);
1601     else
1602 	result = MB_STRNICMP(s1, s2, *n);
1603 
1604     // if it failed and it's utf8 and we want to combineignore:
1605     if (result != 0 && enc_utf8 && rex.reg_icombine)
1606     {
1607 	char_u	*str1, *str2;
1608 	int	c1, c2, c11, c12;
1609 	int	junk;
1610 
1611 	// we have to handle the strcmp ourselves, since it is necessary to
1612 	// deal with the composing characters by ignoring them:
1613 	str1 = s1;
1614 	str2 = s2;
1615 	c1 = c2 = 0;
1616 	while ((int)(str1 - s1) < *n)
1617 	{
1618 	    c1 = mb_ptr2char_adv(&str1);
1619 	    c2 = mb_ptr2char_adv(&str2);
1620 
1621 	    // Decompose the character if necessary, into 'base' characters.
1622 	    // Currently hard-coded for Hebrew, Arabic to be done...
1623 	    if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
1624 	    {
1625 		// decomposition necessary?
1626 		mb_decompose(c1, &c11, &junk, &junk);
1627 		mb_decompose(c2, &c12, &junk, &junk);
1628 		c1 = c11;
1629 		c2 = c12;
1630 		if (c11 != c12
1631 			    && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12)))
1632 		    break;
1633 	    }
1634 	}
1635 	result = c2 - c1;
1636 	if (result == 0)
1637 	    *n = (int)(str2 - s2);
1638     }
1639 
1640     return result;
1641 }
1642 
1643 /*
1644  * cstrchr: This function is used a lot for simple searches, keep it fast!
1645  */
1646     static char_u *
cstrchr(char_u * s,int c)1647 cstrchr(char_u *s, int c)
1648 {
1649     char_u	*p;
1650     int		cc;
1651 
1652     if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
1653 	return vim_strchr(s, c);
1654 
1655     // tolower() and toupper() can be slow, comparing twice should be a lot
1656     // faster (esp. when using MS Visual C++!).
1657     // For UTF-8 need to use folded case.
1658     if (enc_utf8 && c > 0x80)
1659 	cc = utf_fold(c);
1660     else
1661 	 if (MB_ISUPPER(c))
1662 	cc = MB_TOLOWER(c);
1663     else if (MB_ISLOWER(c))
1664 	cc = MB_TOUPPER(c);
1665     else
1666 	return vim_strchr(s, c);
1667 
1668     if (has_mbyte)
1669     {
1670 	for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
1671 	{
1672 	    if (enc_utf8 && c > 0x80)
1673 	    {
1674 		if (utf_fold(utf_ptr2char(p)) == cc)
1675 		    return p;
1676 	    }
1677 	    else if (*p == c || *p == cc)
1678 		return p;
1679 	}
1680     }
1681     else
1682 	// Faster version for when there are no multi-byte characters.
1683 	for (p = s; *p != NUL; ++p)
1684 	    if (*p == c || *p == cc)
1685 		return p;
1686 
1687     return NULL;
1688 }
1689 
1690 ////////////////////////////////////////////////////////////////
1691 //		      regsub stuff			      //
1692 ////////////////////////////////////////////////////////////////
1693 
1694 /*
1695  * We should define ftpr as a pointer to a function returning a pointer to
1696  * a function returning a pointer to a function ...
1697  * This is impossible, so we declare a pointer to a function returning a
1698  * void pointer. This should work for all compilers.
1699  */
1700 typedef void (*(*fptr_T)(int *, int));
1701 
1702 static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, int copy, int magic, int backslash);
1703 
1704     static fptr_T
do_upper(int * d,int c)1705 do_upper(int *d, int c)
1706 {
1707     *d = MB_TOUPPER(c);
1708 
1709     return (fptr_T)NULL;
1710 }
1711 
1712     static fptr_T
do_Upper(int * d,int c)1713 do_Upper(int *d, int c)
1714 {
1715     *d = MB_TOUPPER(c);
1716 
1717     return (fptr_T)do_Upper;
1718 }
1719 
1720     static fptr_T
do_lower(int * d,int c)1721 do_lower(int *d, int c)
1722 {
1723     *d = MB_TOLOWER(c);
1724 
1725     return (fptr_T)NULL;
1726 }
1727 
1728     static fptr_T
do_Lower(int * d,int c)1729 do_Lower(int *d, int c)
1730 {
1731     *d = MB_TOLOWER(c);
1732 
1733     return (fptr_T)do_Lower;
1734 }
1735 
1736 /*
1737  * regtilde(): Replace tildes in the pattern by the old pattern.
1738  *
1739  * Short explanation of the tilde: It stands for the previous replacement
1740  * pattern.  If that previous pattern also contains a ~ we should go back a
1741  * step further...  But we insert the previous pattern into the current one
1742  * and remember that.
1743  * This still does not handle the case where "magic" changes.  So require the
1744  * user to keep his hands off of "magic".
1745  *
1746  * The tildes are parsed once before the first call to vim_regsub().
1747  */
1748     char_u *
regtilde(char_u * source,int magic)1749 regtilde(char_u *source, int magic)
1750 {
1751     char_u	*newsub = source;
1752     char_u	*tmpsub;
1753     char_u	*p;
1754     int		len;
1755     int		prevlen;
1756 
1757     for (p = newsub; *p; ++p)
1758     {
1759 	if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
1760 	{
1761 	    if (reg_prev_sub != NULL)
1762 	    {
1763 		// length = len(newsub) - 1 + len(prev_sub) + 1
1764 		prevlen = (int)STRLEN(reg_prev_sub);
1765 		tmpsub = alloc(STRLEN(newsub) + prevlen);
1766 		if (tmpsub != NULL)
1767 		{
1768 		    // copy prefix
1769 		    len = (int)(p - newsub);	// not including ~
1770 		    mch_memmove(tmpsub, newsub, (size_t)len);
1771 		    // interpret tilde
1772 		    mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
1773 		    // copy postfix
1774 		    if (!magic)
1775 			++p;			// back off backslash
1776 		    STRCPY(tmpsub + len + prevlen, p + 1);
1777 
1778 		    if (newsub != source)	// already allocated newsub
1779 			vim_free(newsub);
1780 		    newsub = tmpsub;
1781 		    p = newsub + len + prevlen;
1782 		}
1783 	    }
1784 	    else if (magic)
1785 		STRMOVE(p, p + 1);	// remove '~'
1786 	    else
1787 		STRMOVE(p, p + 2);	// remove '\~'
1788 	    --p;
1789 	}
1790 	else
1791 	{
1792 	    if (*p == '\\' && p[1])		// skip escaped characters
1793 		++p;
1794 	    if (has_mbyte)
1795 		p += (*mb_ptr2len)(p) - 1;
1796 	}
1797     }
1798 
1799     vim_free(reg_prev_sub);
1800     if (newsub != source)	// newsub was allocated, just keep it
1801 	reg_prev_sub = newsub;
1802     else			// no ~ found, need to save newsub
1803 	reg_prev_sub = vim_strsave(newsub);
1804     return newsub;
1805 }
1806 
#ifdef FEAT_EVAL
static int can_f_submatch = FALSE;	// TRUE when submatch() can be used

// The match information used for reg_submatch().  A separate copy is needed
// for when the substitution string is an expression that contains a call to
// substitute() and submatch().
typedef struct {
    regmatch_T	*sm_match;	// copy of rex.reg_match
    regmmatch_T	*sm_mmatch;	// copy of rex.reg_mmatch
    linenr_T	sm_firstlnum;	// copy of rex.reg_firstlnum
    linenr_T	sm_maxline;	// copy of rex.reg_maxline
    int		sm_line_lbr;	// copy of rex.reg_line_lbr
} regsubmatch_T;

static regsubmatch_T rsm;  // can only be used when can_f_submatch is TRUE
#endif
1823 
1824 #ifdef FEAT_EVAL
1825 
1826 /*
1827  * Put the submatches in "argv[argskip]" which is a list passed into
1828  * call_func() by vim_regsub_both().
1829  */
1830     static int
fill_submatch_list(int argc UNUSED,typval_T * argv,int argskip,int argcount)1831 fill_submatch_list(int argc UNUSED, typval_T *argv, int argskip, int argcount)
1832 {
1833     listitem_T	*li;
1834     int		i;
1835     char_u	*s;
1836     typval_T	*listarg = argv + argskip;
1837 
1838     if (argcount == argskip)
1839 	// called function doesn't take a submatches argument
1840 	return argskip;
1841 
1842     // Relies on sl_list to be the first item in staticList10_T.
1843     init_static_list((staticList10_T *)(listarg->vval.v_list));
1844 
1845     // There are always 10 list items in staticList10_T.
1846     li = listarg->vval.v_list->lv_first;
1847     for (i = 0; i < 10; ++i)
1848     {
1849 	s = rsm.sm_match->startp[i];
1850 	if (s == NULL || rsm.sm_match->endp[i] == NULL)
1851 	    s = NULL;
1852 	else
1853 	    s = vim_strnsave(s, rsm.sm_match->endp[i] - s);
1854 	li->li_tv.v_type = VAR_STRING;
1855 	li->li_tv.vval.v_string = s;
1856 	li = li->li_next;
1857     }
1858     return argskip + 1;
1859 }
1860 
1861     static void
clear_submatch_list(staticList10_T * sl)1862 clear_submatch_list(staticList10_T *sl)
1863 {
1864     int i;
1865 
1866     for (i = 0; i < 10; ++i)
1867 	vim_free(sl->sl_items[i].li_tv.vval.v_string);
1868 }
1869 #endif
1870 
1871 /*
1872  * vim_regsub() - perform substitutions after a vim_regexec() or
1873  * vim_regexec_multi() match.
1874  *
1875  * If "copy" is TRUE really copy into "dest".
1876  * If "copy" is FALSE nothing is copied, this is just to find out the length
1877  * of the result.
1878  *
1879  * If "backslash" is TRUE, a backslash will be removed later, need to double
1880  * them to keep them, and insert a backslash before a CR to avoid it being
1881  * replaced with a line break later.
1882  *
1883  * Note: The matched text must not change between the call of
1884  * vim_regexec()/vim_regexec_multi() and vim_regsub()!  It would make the back
1885  * references invalid!
1886  *
1887  * Returns the size of the replacement, including terminating NUL.
1888  */
1889     int
vim_regsub(regmatch_T * rmp,char_u * source,typval_T * expr,char_u * dest,int copy,int magic,int backslash)1890 vim_regsub(
1891     regmatch_T	*rmp,
1892     char_u	*source,
1893     typval_T	*expr,
1894     char_u	*dest,
1895     int		copy,
1896     int		magic,
1897     int		backslash)
1898 {
1899     int		result;
1900     regexec_T	rex_save;
1901     int		rex_in_use_save = rex_in_use;
1902 
1903     if (rex_in_use)
1904 	// Being called recursively, save the state.
1905 	rex_save = rex;
1906     rex_in_use = TRUE;
1907 
1908     rex.reg_match = rmp;
1909     rex.reg_mmatch = NULL;
1910     rex.reg_maxline = 0;
1911     rex.reg_buf = curbuf;
1912     rex.reg_line_lbr = TRUE;
1913     result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
1914 
1915     rex_in_use = rex_in_use_save;
1916     if (rex_in_use)
1917 	rex = rex_save;
1918 
1919     return result;
1920 }
1921 
1922     int
vim_regsub_multi(regmmatch_T * rmp,linenr_T lnum,char_u * source,char_u * dest,int copy,int magic,int backslash)1923 vim_regsub_multi(
1924     regmmatch_T	*rmp,
1925     linenr_T	lnum,
1926     char_u	*source,
1927     char_u	*dest,
1928     int		copy,
1929     int		magic,
1930     int		backslash)
1931 {
1932     int		result;
1933     regexec_T	rex_save;
1934     int		rex_in_use_save = rex_in_use;
1935 
1936     if (rex_in_use)
1937 	// Being called recursively, save the state.
1938 	rex_save = rex;
1939     rex_in_use = TRUE;
1940 
1941     rex.reg_match = NULL;
1942     rex.reg_mmatch = rmp;
1943     rex.reg_buf = curbuf;	// always works on the current buffer!
1944     rex.reg_firstlnum = lnum;
1945     rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
1946     rex.reg_line_lbr = FALSE;
1947     result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
1948 
1949     rex_in_use = rex_in_use_save;
1950     if (rex_in_use)
1951 	rex = rex_save;
1952 
1953     return result;
1954 }
1955 
1956     static int
vim_regsub_both(char_u * source,typval_T * expr,char_u * dest,int copy,int magic,int backslash)1957 vim_regsub_both(
1958     char_u	*source,
1959     typval_T	*expr,
1960     char_u	*dest,
1961     int		copy,
1962     int		magic,
1963     int		backslash)
1964 {
1965     char_u	*src;
1966     char_u	*dst;
1967     char_u	*s;
1968     int		c;
1969     int		cc;
1970     int		no = -1;
1971     fptr_T	func_all = (fptr_T)NULL;
1972     fptr_T	func_one = (fptr_T)NULL;
1973     linenr_T	clnum = 0;	// init for GCC
1974     int		len = 0;	// init for GCC
1975 #ifdef FEAT_EVAL
1976     static char_u   *eval_result = NULL;
1977 #endif
1978 
1979     // Be paranoid...
1980     if ((source == NULL && expr == NULL) || dest == NULL)
1981     {
1982 	emsg(_(e_null_argument));
1983 	return 0;
1984     }
1985     if (prog_magic_wrong())
1986 	return 0;
1987     src = source;
1988     dst = dest;
1989 
1990     /*
1991      * When the substitute part starts with "\=" evaluate it as an expression.
1992      */
1993     if (expr != NULL || (source[0] == '\\' && source[1] == '='))
1994     {
1995 #ifdef FEAT_EVAL
1996 	// To make sure that the length doesn't change between checking the
1997 	// length and copying the string, and to speed up things, the
1998 	// resulting string is saved from the call with "copy" == FALSE to the
1999 	// call with "copy" == TRUE.
2000 	if (copy)
2001 	{
2002 	    if (eval_result != NULL)
2003 	    {
2004 		STRCPY(dest, eval_result);
2005 		dst += STRLEN(eval_result);
2006 		VIM_CLEAR(eval_result);
2007 	    }
2008 	}
2009 	else
2010 	{
2011 	    int		    prev_can_f_submatch = can_f_submatch;
2012 	    regsubmatch_T   rsm_save;
2013 
2014 	    vim_free(eval_result);
2015 
2016 	    // The expression may contain substitute(), which calls us
2017 	    // recursively.  Make sure submatch() gets the text from the first
2018 	    // level.
2019 	    if (can_f_submatch)
2020 		rsm_save = rsm;
2021 	    can_f_submatch = TRUE;
2022 	    rsm.sm_match = rex.reg_match;
2023 	    rsm.sm_mmatch = rex.reg_mmatch;
2024 	    rsm.sm_firstlnum = rex.reg_firstlnum;
2025 	    rsm.sm_maxline = rex.reg_maxline;
2026 	    rsm.sm_line_lbr = rex.reg_line_lbr;
2027 
2028 	    if (expr != NULL)
2029 	    {
2030 		typval_T	argv[2];
2031 		char_u		buf[NUMBUFLEN];
2032 		typval_T	rettv;
2033 		staticList10_T	matchList;
2034 		funcexe_T	funcexe;
2035 
2036 		rettv.v_type = VAR_STRING;
2037 		rettv.vval.v_string = NULL;
2038 		argv[0].v_type = VAR_LIST;
2039 		argv[0].vval.v_list = &matchList.sl_list;
2040 		matchList.sl_list.lv_len = 0;
2041 		CLEAR_FIELD(funcexe);
2042 		funcexe.argv_func = fill_submatch_list;
2043 		funcexe.evaluate = TRUE;
2044 		if (expr->v_type == VAR_FUNC)
2045 		{
2046 		    s = expr->vval.v_string;
2047 		    call_func(s, -1, &rettv, 1, argv, &funcexe);
2048 		}
2049 		else if (expr->v_type == VAR_PARTIAL)
2050 		{
2051 		    partial_T   *partial = expr->vval.v_partial;
2052 
2053 		    s = partial_name(partial);
2054 		    funcexe.partial = partial;
2055 		    call_func(s, -1, &rettv, 1, argv, &funcexe);
2056 		}
2057 		if (matchList.sl_list.lv_len > 0)
2058 		    // fill_submatch_list() was called
2059 		    clear_submatch_list(&matchList);
2060 
2061 		if (rettv.v_type == VAR_UNKNOWN)
2062 		    // something failed, no need to report another error
2063 		    eval_result = NULL;
2064 		else
2065 		{
2066 		    eval_result = tv_get_string_buf_chk(&rettv, buf);
2067 		    if (eval_result != NULL)
2068 			eval_result = vim_strsave(eval_result);
2069 		}
2070 		clear_tv(&rettv);
2071 	    }
2072 	    else if (substitute_instr != NULL)
2073 		// Execute instructions from ISN_SUBSTITUTE.
2074 		eval_result = exe_substitute_instr();
2075 	    else
2076 		eval_result = eval_to_string(source + 2, TRUE);
2077 
2078 	    if (eval_result != NULL)
2079 	    {
2080 		int had_backslash = FALSE;
2081 
2082 		for (s = eval_result; *s != NUL; MB_PTR_ADV(s))
2083 		{
2084 		    // Change NL to CR, so that it becomes a line break,
2085 		    // unless called from vim_regexec_nl().
2086 		    // Skip over a backslashed character.
2087 		    if (*s == NL && !rsm.sm_line_lbr)
2088 			*s = CAR;
2089 		    else if (*s == '\\' && s[1] != NUL)
2090 		    {
2091 			++s;
2092 			/* Change NL to CR here too, so that this works:
2093 			 * :s/abc\\\ndef/\="aaa\\\nbbb"/  on text:
2094 			 *   abc\
2095 			 *   def
2096 			 * Not when called from vim_regexec_nl().
2097 			 */
2098 			if (*s == NL && !rsm.sm_line_lbr)
2099 			    *s = CAR;
2100 			had_backslash = TRUE;
2101 		    }
2102 		}
2103 		if (had_backslash && backslash)
2104 		{
2105 		    // Backslashes will be consumed, need to double them.
2106 		    s = vim_strsave_escaped(eval_result, (char_u *)"\\");
2107 		    if (s != NULL)
2108 		    {
2109 			vim_free(eval_result);
2110 			eval_result = s;
2111 		    }
2112 		}
2113 
2114 		dst += STRLEN(eval_result);
2115 	    }
2116 
2117 	    can_f_submatch = prev_can_f_submatch;
2118 	    if (can_f_submatch)
2119 		rsm = rsm_save;
2120 	}
2121 #endif
2122     }
2123     else
2124       while ((c = *src++) != NUL)
2125       {
2126 	if (c == '&' && magic)
2127 	    no = 0;
2128 	else if (c == '\\' && *src != NUL)
2129 	{
2130 	    if (*src == '&' && !magic)
2131 	    {
2132 		++src;
2133 		no = 0;
2134 	    }
2135 	    else if ('0' <= *src && *src <= '9')
2136 	    {
2137 		no = *src++ - '0';
2138 	    }
2139 	    else if (vim_strchr((char_u *)"uUlLeE", *src))
2140 	    {
2141 		switch (*src++)
2142 		{
2143 		case 'u':   func_one = (fptr_T)do_upper;
2144 			    continue;
2145 		case 'U':   func_all = (fptr_T)do_Upper;
2146 			    continue;
2147 		case 'l':   func_one = (fptr_T)do_lower;
2148 			    continue;
2149 		case 'L':   func_all = (fptr_T)do_Lower;
2150 			    continue;
2151 		case 'e':
2152 		case 'E':   func_one = func_all = (fptr_T)NULL;
2153 			    continue;
2154 		}
2155 	    }
2156 	}
2157 	if (no < 0)	      // Ordinary character.
2158 	{
2159 	    if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL)
2160 	    {
2161 		// Copy a special key as-is.
2162 		if (copy)
2163 		{
2164 		    *dst++ = c;
2165 		    *dst++ = *src++;
2166 		    *dst++ = *src++;
2167 		}
2168 		else
2169 		{
2170 		    dst += 3;
2171 		    src += 2;
2172 		}
2173 		continue;
2174 	    }
2175 
2176 	    if (c == '\\' && *src != NUL)
2177 	    {
2178 		// Check for abbreviations -- webb
2179 		switch (*src)
2180 		{
2181 		    case 'r':	c = CAR;	++src;	break;
2182 		    case 'n':	c = NL;		++src;	break;
2183 		    case 't':	c = TAB;	++src;	break;
2184 		 // Oh no!  \e already has meaning in subst pat :-(
2185 		 // case 'e':   c = ESC;	++src;	break;
2186 		    case 'b':	c = Ctrl_H;	++src;	break;
2187 
2188 		    // If "backslash" is TRUE the backslash will be removed
2189 		    // later.  Used to insert a literal CR.
2190 		    default:	if (backslash)
2191 				{
2192 				    if (copy)
2193 					*dst = '\\';
2194 				    ++dst;
2195 				}
2196 				c = *src++;
2197 		}
2198 	    }
2199 	    else if (has_mbyte)
2200 		c = mb_ptr2char(src - 1);
2201 
2202 	    // Write to buffer, if copy is set.
2203 	    if (func_one != (fptr_T)NULL)
2204 		// Turbo C complains without the typecast
2205 		func_one = (fptr_T)(func_one(&cc, c));
2206 	    else if (func_all != (fptr_T)NULL)
2207 		// Turbo C complains without the typecast
2208 		func_all = (fptr_T)(func_all(&cc, c));
2209 	    else // just copy
2210 		cc = c;
2211 
2212 	    if (has_mbyte)
2213 	    {
2214 		int totlen = mb_ptr2len(src - 1);
2215 
2216 		if (copy)
2217 		    mb_char2bytes(cc, dst);
2218 		dst += mb_char2len(cc) - 1;
2219 		if (enc_utf8)
2220 		{
2221 		    int clen = utf_ptr2len(src - 1);
2222 
2223 		    // If the character length is shorter than "totlen", there
2224 		    // are composing characters; copy them as-is.
2225 		    if (clen < totlen)
2226 		    {
2227 			if (copy)
2228 			    mch_memmove(dst + 1, src - 1 + clen,
2229 						     (size_t)(totlen - clen));
2230 			dst += totlen - clen;
2231 		    }
2232 		}
2233 		src += totlen - 1;
2234 	    }
2235 	    else if (copy)
2236 		    *dst = cc;
2237 	    dst++;
2238 	}
2239 	else
2240 	{
2241 	    if (REG_MULTI)
2242 	    {
2243 		clnum = rex.reg_mmatch->startpos[no].lnum;
2244 		if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0)
2245 		    s = NULL;
2246 		else
2247 		{
2248 		    s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
2249 		    if (rex.reg_mmatch->endpos[no].lnum == clnum)
2250 			len = rex.reg_mmatch->endpos[no].col
2251 					    - rex.reg_mmatch->startpos[no].col;
2252 		    else
2253 			len = (int)STRLEN(s);
2254 		}
2255 	    }
2256 	    else
2257 	    {
2258 		s = rex.reg_match->startp[no];
2259 		if (rex.reg_match->endp[no] == NULL)
2260 		    s = NULL;
2261 		else
2262 		    len = (int)(rex.reg_match->endp[no] - s);
2263 	    }
2264 	    if (s != NULL)
2265 	    {
2266 		for (;;)
2267 		{
2268 		    if (len == 0)
2269 		    {
2270 			if (REG_MULTI)
2271 			{
2272 			    if (rex.reg_mmatch->endpos[no].lnum == clnum)
2273 				break;
2274 			    if (copy)
2275 				*dst = CAR;
2276 			    ++dst;
2277 			    s = reg_getline(++clnum);
2278 			    if (rex.reg_mmatch->endpos[no].lnum == clnum)
2279 				len = rex.reg_mmatch->endpos[no].col;
2280 			    else
2281 				len = (int)STRLEN(s);
2282 			}
2283 			else
2284 			    break;
2285 		    }
2286 		    else if (*s == NUL) // we hit NUL.
2287 		    {
2288 			if (copy)
2289 			    iemsg(_(e_damaged_match_string));
2290 			goto exit;
2291 		    }
2292 		    else
2293 		    {
2294 			if (backslash && (*s == CAR || *s == '\\'))
2295 			{
2296 			    /*
2297 			     * Insert a backslash in front of a CR, otherwise
2298 			     * it will be replaced by a line break.
2299 			     * Number of backslashes will be halved later,
2300 			     * double them here.
2301 			     */
2302 			    if (copy)
2303 			    {
2304 				dst[0] = '\\';
2305 				dst[1] = *s;
2306 			    }
2307 			    dst += 2;
2308 			}
2309 			else
2310 			{
2311 			    if (has_mbyte)
2312 				c = mb_ptr2char(s);
2313 			    else
2314 				c = *s;
2315 
2316 			    if (func_one != (fptr_T)NULL)
2317 				// Turbo C complains without the typecast
2318 				func_one = (fptr_T)(func_one(&cc, c));
2319 			    else if (func_all != (fptr_T)NULL)
2320 				// Turbo C complains without the typecast
2321 				func_all = (fptr_T)(func_all(&cc, c));
2322 			    else // just copy
2323 				cc = c;
2324 
2325 			    if (has_mbyte)
2326 			    {
2327 				int l;
2328 
2329 				// Copy composing characters separately, one
2330 				// at a time.
2331 				if (enc_utf8)
2332 				    l = utf_ptr2len(s) - 1;
2333 				else
2334 				    l = mb_ptr2len(s) - 1;
2335 
2336 				s += l;
2337 				len -= l;
2338 				if (copy)
2339 				    mb_char2bytes(cc, dst);
2340 				dst += mb_char2len(cc) - 1;
2341 			    }
2342 			    else if (copy)
2343 				    *dst = cc;
2344 			    dst++;
2345 			}
2346 
2347 			++s;
2348 			--len;
2349 		    }
2350 		}
2351 	    }
2352 	    no = -1;
2353 	}
2354       }
2355     if (copy)
2356 	*dst = NUL;
2357 
2358 exit:
2359     return (int)((dst - dest) + 1);
2360 }
2361 
2362 #ifdef FEAT_EVAL
2363 /*
2364  * Call reg_getline() with the line numbers from the submatch.  If a
2365  * substitute() was used the reg_maxline and other values have been
2366  * overwritten.
2367  */
2368     static char_u *
reg_getline_submatch(linenr_T lnum)2369 reg_getline_submatch(linenr_T lnum)
2370 {
2371     char_u *s;
2372     linenr_T save_first = rex.reg_firstlnum;
2373     linenr_T save_max = rex.reg_maxline;
2374 
2375     rex.reg_firstlnum = rsm.sm_firstlnum;
2376     rex.reg_maxline = rsm.sm_maxline;
2377 
2378     s = reg_getline(lnum);
2379 
2380     rex.reg_firstlnum = save_first;
2381     rex.reg_maxline = save_max;
2382     return s;
2383 }
2384 
2385 /*
2386  * Used for the submatch() function: get the string from the n'th submatch in
2387  * allocated memory.
2388  * Returns NULL when not in a ":s" command and for a non-existing submatch.
2389  */
2390     char_u *
reg_submatch(int no)2391 reg_submatch(int no)
2392 {
2393     char_u	*retval = NULL;
2394     char_u	*s;
2395     int		len;
2396     int		round;
2397     linenr_T	lnum;
2398 
2399     if (!can_f_submatch || no < 0)
2400 	return NULL;
2401 
2402     if (rsm.sm_match == NULL)
2403     {
2404 	/*
2405 	 * First round: compute the length and allocate memory.
2406 	 * Second round: copy the text.
2407 	 */
2408 	for (round = 1; round <= 2; ++round)
2409 	{
2410 	    lnum = rsm.sm_mmatch->startpos[no].lnum;
2411 	    if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0)
2412 		return NULL;
2413 
2414 	    s = reg_getline_submatch(lnum);
2415 	    if (s == NULL)  // anti-crash check, cannot happen?
2416 		break;
2417 	    s += rsm.sm_mmatch->startpos[no].col;
2418 	    if (rsm.sm_mmatch->endpos[no].lnum == lnum)
2419 	    {
2420 		// Within one line: take form start to end col.
2421 		len = rsm.sm_mmatch->endpos[no].col
2422 					  - rsm.sm_mmatch->startpos[no].col;
2423 		if (round == 2)
2424 		    vim_strncpy(retval, s, len);
2425 		++len;
2426 	    }
2427 	    else
2428 	    {
2429 		// Multiple lines: take start line from start col, middle
2430 		// lines completely and end line up to end col.
2431 		len = (int)STRLEN(s);
2432 		if (round == 2)
2433 		{
2434 		    STRCPY(retval, s);
2435 		    retval[len] = '\n';
2436 		}
2437 		++len;
2438 		++lnum;
2439 		while (lnum < rsm.sm_mmatch->endpos[no].lnum)
2440 		{
2441 		    s = reg_getline_submatch(lnum++);
2442 		    if (round == 2)
2443 			STRCPY(retval + len, s);
2444 		    len += (int)STRLEN(s);
2445 		    if (round == 2)
2446 			retval[len] = '\n';
2447 		    ++len;
2448 		}
2449 		if (round == 2)
2450 		    STRNCPY(retval + len, reg_getline_submatch(lnum),
2451 					     rsm.sm_mmatch->endpos[no].col);
2452 		len += rsm.sm_mmatch->endpos[no].col;
2453 		if (round == 2)
2454 		    retval[len] = NUL;
2455 		++len;
2456 	    }
2457 
2458 	    if (retval == NULL)
2459 	    {
2460 		retval = alloc(len);
2461 		if (retval == NULL)
2462 		    return NULL;
2463 	    }
2464 	}
2465     }
2466     else
2467     {
2468 	s = rsm.sm_match->startp[no];
2469 	if (s == NULL || rsm.sm_match->endp[no] == NULL)
2470 	    retval = NULL;
2471 	else
2472 	    retval = vim_strnsave(s, rsm.sm_match->endp[no] - s);
2473     }
2474 
2475     return retval;
2476 }
2477 
2478 /*
2479  * Used for the submatch() function with the optional non-zero argument: get
2480  * the list of strings from the n'th submatch in allocated memory with NULs
2481  * represented in NLs.
2482  * Returns a list of allocated strings.  Returns NULL when not in a ":s"
2483  * command, for a non-existing submatch and for any error.
2484  */
2485     list_T *
reg_submatch_list(int no)2486 reg_submatch_list(int no)
2487 {
2488     char_u	*s;
2489     linenr_T	slnum;
2490     linenr_T	elnum;
2491     colnr_T	scol;
2492     colnr_T	ecol;
2493     int		i;
2494     list_T	*list;
2495     int		error = FALSE;
2496 
2497     if (!can_f_submatch || no < 0)
2498 	return NULL;
2499 
2500     if (rsm.sm_match == NULL)
2501     {
2502 	slnum = rsm.sm_mmatch->startpos[no].lnum;
2503 	elnum = rsm.sm_mmatch->endpos[no].lnum;
2504 	if (slnum < 0 || elnum < 0)
2505 	    return NULL;
2506 
2507 	scol = rsm.sm_mmatch->startpos[no].col;
2508 	ecol = rsm.sm_mmatch->endpos[no].col;
2509 
2510 	list = list_alloc();
2511 	if (list == NULL)
2512 	    return NULL;
2513 
2514 	s = reg_getline_submatch(slnum) + scol;
2515 	if (slnum == elnum)
2516 	{
2517 	    if (list_append_string(list, s, ecol - scol) == FAIL)
2518 		error = TRUE;
2519 	}
2520 	else
2521 	{
2522 	    if (list_append_string(list, s, -1) == FAIL)
2523 		error = TRUE;
2524 	    for (i = 1; i < elnum - slnum; i++)
2525 	    {
2526 		s = reg_getline_submatch(slnum + i);
2527 		if (list_append_string(list, s, -1) == FAIL)
2528 		    error = TRUE;
2529 	    }
2530 	    s = reg_getline_submatch(elnum);
2531 	    if (list_append_string(list, s, ecol) == FAIL)
2532 		error = TRUE;
2533 	}
2534     }
2535     else
2536     {
2537 	s = rsm.sm_match->startp[no];
2538 	if (s == NULL || rsm.sm_match->endp[no] == NULL)
2539 	    return NULL;
2540 	list = list_alloc();
2541 	if (list == NULL)
2542 	    return NULL;
2543 	if (list_append_string(list, s,
2544 				 (int)(rsm.sm_match->endp[no] - s)) == FAIL)
2545 	    error = TRUE;
2546     }
2547 
2548     if (error)
2549     {
2550 	list_free(list);
2551 	return NULL;
2552     }
2553     ++list->lv_refcount;
2554     return list;
2555 }
2556 #endif
2557 
2558 /*
2559  * Initialize the values used for matching against multiple lines
2560  */
2561     static void
init_regexec_multi(regmmatch_T * rmp,win_T * win,buf_T * buf,linenr_T lnum)2562 init_regexec_multi(
2563 	regmmatch_T	*rmp,
2564 	win_T		*win,	// window in which to search or NULL
2565 	buf_T		*buf,	// buffer in which to search
2566 	linenr_T	lnum)	// nr of line to start looking for match
2567 {
2568     rex.reg_match = NULL;
2569     rex.reg_mmatch = rmp;
2570     rex.reg_buf = buf;
2571     rex.reg_win = win;
2572     rex.reg_firstlnum = lnum;
2573     rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
2574     rex.reg_line_lbr = FALSE;
2575     rex.reg_ic = rmp->rmm_ic;
2576     rex.reg_icombine = FALSE;
2577     rex.reg_maxcol = rmp->rmm_maxcol;
2578 }
2579 
2580 #include "regexp_bt.c"
2581 
// Table with the entry points of the backtracking engine.  vim_regcomp()
// calls the compile function through "bt_regengine.regcomp".
// NOTE(review): the last initializer is presumably the "expr" member, which
// vim_regcomp() points at the pattern when DEBUG is defined.
static regengine_T bt_regengine =
{
    bt_regcomp,
    bt_regfree,
    bt_regexec_nl,
    bt_regexec_multi,
    (char_u *)""
};
2590 
2591 #include "regexp_nfa.c"
2592 
// Table with the entry points of the NFA engine.  vim_regcomp() calls the
// compile function through "nfa_regengine.regcomp".
// NOTE(review): the last initializer is presumably the "expr" member, which
// vim_regcomp() points at the pattern when DEBUG is defined.
static regengine_T nfa_regengine =
{
    nfa_regcomp,
    nfa_regfree,
    nfa_regexec_nl,
    nfa_regexec_multi,
    (char_u *)""
};
2601 
// Which regexp engine to use? Needed for vim_regcomp().
// Must match with 'regexpengine'.
// vim_regcomp() sets this from "p_re" on every call and may change it when
// the pattern starts with "\%#=" or when it falls back to the backtracking
// engine.
static int regexp_engine = 0;
2605 
#ifdef DEBUG
// Names of the regexp engines, indexed by engine number.  Only used for the
// debugging message in vim_regcomp() when an engine is selected with "\%#=".
static char_u regname[][30] = {
		    "AUTOMATIC Regexp Engine",
		    "BACKTRACKING Regexp Engine",
		    "NFA Regexp Engine"
			    };
#endif
2613 
2614 /*
2615  * Compile a regular expression into internal code.
2616  * Returns the program in allocated memory.
2617  * Use vim_regfree() to free the memory.
2618  * Returns NULL for an error.
2619  */
2620     regprog_T *
vim_regcomp(char_u * expr_arg,int re_flags)2621 vim_regcomp(char_u *expr_arg, int re_flags)
2622 {
2623     regprog_T   *prog = NULL;
2624     char_u	*expr = expr_arg;
2625     int		called_emsg_before;
2626 
2627     regexp_engine = p_re;
2628 
2629     // Check for prefix "\%#=", that sets the regexp engine
2630     if (STRNCMP(expr, "\\%#=", 4) == 0)
2631     {
2632 	int newengine = expr[4] - '0';
2633 
2634 	if (newengine == AUTOMATIC_ENGINE
2635 	    || newengine == BACKTRACKING_ENGINE
2636 	    || newengine == NFA_ENGINE)
2637 	{
2638 	    regexp_engine = expr[4] - '0';
2639 	    expr += 5;
2640 #ifdef DEBUG
2641 	    smsg("New regexp mode selected (%d): %s",
2642 					   regexp_engine, regname[newengine]);
2643 #endif
2644 	}
2645 	else
2646 	{
2647 	    emsg(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
2648 	    regexp_engine = AUTOMATIC_ENGINE;
2649 	}
2650     }
2651 #ifdef DEBUG
2652     bt_regengine.expr = expr;
2653     nfa_regengine.expr = expr;
2654 #endif
2655     // reg_iswordc() uses rex.reg_buf
2656     rex.reg_buf = curbuf;
2657 
2658     /*
2659      * First try the NFA engine, unless backtracking was requested.
2660      */
2661     called_emsg_before = called_emsg;
2662     if (regexp_engine != BACKTRACKING_ENGINE)
2663 	prog = nfa_regengine.regcomp(expr,
2664 		re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
2665     else
2666 	prog = bt_regengine.regcomp(expr, re_flags);
2667 
2668     // Check for error compiling regexp with initial engine.
2669     if (prog == NULL)
2670     {
2671 #ifdef BT_REGEXP_DEBUG_LOG
2672 	if (regexp_engine == BACKTRACKING_ENGINE)   // debugging log for BT engine
2673 	{
2674 	    FILE *f;
2675 	    f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
2676 	    if (f)
2677 	    {
2678 		fprintf(f, "Syntax error in \"%s\"\n", expr);
2679 		fclose(f);
2680 	    }
2681 	    else
2682 		semsg("(NFA) Could not open \"%s\" to write !!!",
2683 			BT_REGEXP_DEBUG_LOG_NAME);
2684 	}
2685 #endif
2686 	/*
2687 	 * If the NFA engine failed, try the backtracking engine.
2688 	 * The NFA engine also fails for patterns that it can't handle well
2689 	 * but are still valid patterns, thus a retry should work.
2690 	 * But don't try if an error message was given.
2691 	 */
2692 	if (regexp_engine == AUTOMATIC_ENGINE
2693 					  && called_emsg == called_emsg_before)
2694 	{
2695 	    regexp_engine = BACKTRACKING_ENGINE;
2696 #ifdef FEAT_EVAL
2697 	    report_re_switch(expr);
2698 #endif
2699 	    prog = bt_regengine.regcomp(expr, re_flags);
2700 	}
2701     }
2702 
2703     if (prog != NULL)
2704     {
2705 	// Store the info needed to call regcomp() again when the engine turns
2706 	// out to be very slow when executing it.
2707 	prog->re_engine = regexp_engine;
2708 	prog->re_flags  = re_flags;
2709     }
2710 
2711     return prog;
2712 }
2713 
2714 /*
2715  * Free a compiled regexp program, returned by vim_regcomp().
2716  */
2717     void
vim_regfree(regprog_T * prog)2718 vim_regfree(regprog_T *prog)
2719 {
2720     if (prog != NULL)
2721 	prog->engine->regfree(prog);
2722 }
2723 
#if defined(EXITFREE) || defined(PROTO)
/*
 * Free the memory that the regexp code keeps in file-level variables: the
 * "reg_tofree" and "reg_prev_sub" strings and the "regstack" and "backpos"
 * growarrays.
 */
    void
free_regexp_stuff(void)
{
    // allocated strings
    vim_free(reg_prev_sub);
    vim_free(reg_tofree);

    // growarrays
    ga_clear(&backpos);
    ga_clear(&regstack);
}
#endif
2734 
#ifdef FEAT_EVAL
/*
 * When "p_verbose" is positive: give a message that the backtracking engine
 * is going to be used for pattern "pat".  Otherwise do nothing.
 */
    static void
report_re_switch(char_u *pat)
{
    if (p_verbose <= 0)
	return;

    verbose_enter();
    msg_puts(_("Switching to backtracking RE engine for pattern: "));
    msg_puts((char *)pat);
    verbose_leave();
}
#endif
2748 
#if (defined(FEAT_X11) && (defined(FEAT_TITLE) || defined(FEAT_XCLIPBOARD))) \
	|| defined(PROTO)
/*
 * Return the "re_in_use" flag of "prog": non-zero while the program is
 * being executed by one of the vim_regexec functions.
 * "prog" must not be NULL.
 */
    int
regprog_in_use(regprog_T *prog)
{
    int	    busy = prog->re_in_use;

    return busy;
}
#endif
2760 
2761 /*
2762  * Match a regexp against a string.
2763  * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2764  * Note: "rmp->regprog" may be freed and changed.
2765  * Uses curbuf for line count and 'iskeyword'.
2766  * When "nl" is TRUE consider a "\n" in "line" to be a line break.
2767  *
2768  * Return TRUE if there is a match, FALSE if not.
2769  */
2770     static int
vim_regexec_string(regmatch_T * rmp,char_u * line,colnr_T col,int nl)2771 vim_regexec_string(
2772     regmatch_T	*rmp,
2773     char_u	*line,  // string to match against
2774     colnr_T	col,    // column to start looking for match
2775     int		nl)
2776 {
2777     int		result;
2778     regexec_T	rex_save;
2779     int		rex_in_use_save = rex_in_use;
2780 
2781     // Cannot use the same prog recursively, it contains state.
2782     if (rmp->regprog->re_in_use)
2783     {
2784 	emsg(_(e_recursive));
2785 	return FALSE;
2786     }
2787     rmp->regprog->re_in_use = TRUE;
2788 
2789     if (rex_in_use)
2790 	// Being called recursively, save the state.
2791 	rex_save = rex;
2792     rex_in_use = TRUE;
2793 
2794     rex.reg_startp = NULL;
2795     rex.reg_endp = NULL;
2796     rex.reg_startpos = NULL;
2797     rex.reg_endpos = NULL;
2798 
2799     result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
2800     rmp->regprog->re_in_use = FALSE;
2801 
2802     // NFA engine aborted because it's very slow.
2803     if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2804 					       && result == NFA_TOO_EXPENSIVE)
2805     {
2806 	int    save_p_re = p_re;
2807 	int    re_flags = rmp->regprog->re_flags;
2808 	char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2809 
2810 	p_re = BACKTRACKING_ENGINE;
2811 	vim_regfree(rmp->regprog);
2812 	if (pat != NULL)
2813 	{
2814 #ifdef FEAT_EVAL
2815 	    report_re_switch(pat);
2816 #endif
2817 	    rmp->regprog = vim_regcomp(pat, re_flags);
2818 	    if (rmp->regprog != NULL)
2819 	    {
2820 		rmp->regprog->re_in_use = TRUE;
2821 		result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
2822 		rmp->regprog->re_in_use = FALSE;
2823 	    }
2824 	    vim_free(pat);
2825 	}
2826 
2827 	p_re = save_p_re;
2828     }
2829 
2830     rex_in_use = rex_in_use_save;
2831     if (rex_in_use)
2832 	rex = rex_save;
2833 
2834     return result > 0;
2835 }
2836 
2837 /*
2838  * Note: "*prog" may be freed and changed.
2839  * Return TRUE if there is a match, FALSE if not.
2840  */
2841     int
vim_regexec_prog(regprog_T ** prog,int ignore_case,char_u * line,colnr_T col)2842 vim_regexec_prog(
2843     regprog_T	**prog,
2844     int		ignore_case,
2845     char_u	*line,
2846     colnr_T	col)
2847 {
2848     int		r;
2849     regmatch_T	regmatch;
2850 
2851     regmatch.regprog = *prog;
2852     regmatch.rm_ic = ignore_case;
2853     r = vim_regexec_string(&regmatch, line, col, FALSE);
2854     *prog = regmatch.regprog;
2855     return r;
2856 }
2857 
2858 /*
2859  * Note: "rmp->regprog" may be freed and changed.
2860  * Return TRUE if there is a match, FALSE if not.
2861  */
2862     int
vim_regexec(regmatch_T * rmp,char_u * line,colnr_T col)2863 vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
2864 {
2865     return vim_regexec_string(rmp, line, col, FALSE);
2866 }
2867 
2868 /*
2869  * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
2870  * Note: "rmp->regprog" may be freed and changed.
2871  * Return TRUE if there is a match, FALSE if not.
2872  */
2873     int
vim_regexec_nl(regmatch_T * rmp,char_u * line,colnr_T col)2874 vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
2875 {
2876     return vim_regexec_string(rmp, line, col, TRUE);
2877 }
2878 
2879 /*
2880  * Match a regexp against multiple lines.
2881  * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
2882  * Note: "rmp->regprog" may be freed and changed, even set to NULL.
2883  * Uses curbuf for line count and 'iskeyword'.
2884  *
2885  * Return zero if there is no match.  Return number of lines contained in the
2886  * match otherwise.
2887  */
2888     long
vim_regexec_multi(regmmatch_T * rmp,win_T * win,buf_T * buf,linenr_T lnum,colnr_T col,proftime_T * tm,int * timed_out)2889 vim_regexec_multi(
2890     regmmatch_T *rmp,
2891     win_T       *win,		// window in which to search or NULL
2892     buf_T       *buf,		// buffer in which to search
2893     linenr_T	lnum,		// nr of line to start looking for match
2894     colnr_T	col,		// column to start looking for match
2895     proftime_T	*tm,		// timeout limit or NULL
2896     int		*timed_out)	// flag is set when timeout limit reached
2897 {
2898     int		result;
2899     regexec_T	rex_save;
2900     int		rex_in_use_save = rex_in_use;
2901 
2902     // Cannot use the same prog recursively, it contains state.
2903     if (rmp->regprog->re_in_use)
2904     {
2905 	emsg(_(e_recursive));
2906 	return FALSE;
2907     }
2908     rmp->regprog->re_in_use = TRUE;
2909 
2910     if (rex_in_use)
2911 	// Being called recursively, save the state.
2912 	rex_save = rex;
2913     rex_in_use = TRUE;
2914 
2915     result = rmp->regprog->engine->regexec_multi(
2916 				      rmp, win, buf, lnum, col, tm, timed_out);
2917     rmp->regprog->re_in_use = FALSE;
2918 
2919     // NFA engine aborted because it's very slow.
2920     if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2921 					       && result == NFA_TOO_EXPENSIVE)
2922     {
2923 	int    save_p_re = p_re;
2924 	int    re_flags = rmp->regprog->re_flags;
2925 	char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2926 
2927 	p_re = BACKTRACKING_ENGINE;
2928 	vim_regfree(rmp->regprog);
2929 	if (pat != NULL)
2930 	{
2931 #ifdef FEAT_EVAL
2932 	    report_re_switch(pat);
2933 #endif
2934 #ifdef FEAT_SYN_HL
2935 	    // checking for \z misuse was already done when compiling for NFA,
2936 	    // allow all here
2937 	    reg_do_extmatch = REX_ALL;
2938 #endif
2939 	    rmp->regprog = vim_regcomp(pat, re_flags);
2940 #ifdef FEAT_SYN_HL
2941 	    reg_do_extmatch = 0;
2942 #endif
2943 
2944 	    if (rmp->regprog != NULL)
2945 	    {
2946 		rmp->regprog->re_in_use = TRUE;
2947 		result = rmp->regprog->engine->regexec_multi(
2948 				      rmp, win, buf, lnum, col, tm, timed_out);
2949 		rmp->regprog->re_in_use = FALSE;
2950 	    }
2951 	    vim_free(pat);
2952 	}
2953 	p_re = save_p_re;
2954     }
2955 
2956     rex_in_use = rex_in_use_save;
2957     if (rex_in_use)
2958 	rex = rex_save;
2959 
2960     return result <= 0 ? 0 : result;
2961 }
2962