1 /* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
4 */
5
6 // By default: do not create debugging logs or files related to regular
7 // expressions, even when compiling with -DDEBUG.
8 // Uncomment the second line to get the regexp debugging.
9 #undef DEBUG
10 // #define DEBUG
11
12 #include "vim.h"
13
14 #ifdef DEBUG
15 // show/save debugging data when BT engine is used
16 # define BT_REGEXP_DUMP
17 // save the debugging data to a file instead of displaying it
18 # define BT_REGEXP_LOG
19 # define BT_REGEXP_DEBUG_LOG
20 # define BT_REGEXP_DEBUG_LOG_NAME "bt_regexp_debug.log"
21 #endif
22
/*
 * "Magic" characters are pattern items with a special meaning instead of a
 * literal one.  They are encoded as negative numbers (the character code
 * minus 256), which keeps them apart from literal characters, including
 * multi-byte ones.  Only ASCII characters can be made Magic.
 */
#define Magic(x)	((int)(x) - 256)
#define un_Magic(x)	((x) + 256)
#define is_Magic(x)	((x) < 0)

/*
 * Return the literal character for "x": a Magic value is converted back, any
 * other value is returned unchanged.
 */
static int
no_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : x;
}

/*
 * Flip the magicness of "x": a Magic value becomes the literal character and
 * a literal character becomes Magic.
 */
static int
toggle_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
47
48 /*
49 * The first byte of the BT regexp internal "program" is actually this magic
50 * number; the start node begins in the second byte. It's used to catch the
51 * most severe mutilation of the program by the caller.
52 */
53
54 #define REGMAGIC 0234
55
56 /*
57 * Utility definitions.
58 */
59 #define UCHARAT(p) ((int)*(char_u *)(p))
60
61 // Used for an error (down from) vim_regcomp(): give the error message, set
62 // rc_did_emsg and return NULL
63 #define EMSG_RET_NULL(m) return (emsg((m)), rc_did_emsg = TRUE, (void *)NULL)
64 #define IEMSG_RET_NULL(m) return (iemsg((m)), rc_did_emsg = TRUE, (void *)NULL)
65 #define EMSG_RET_FAIL(m) return (emsg((m)), rc_did_emsg = TRUE, FAIL)
66 #define EMSG2_RET_NULL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
67 #define EMSG3_RET_NULL(m, c, a) return (semsg((const char *)(m), (c) ? "" : "\\", (a)), rc_did_emsg = TRUE, (void *)NULL)
68 #define EMSG2_RET_FAIL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL)
69 #define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
70
71
72 #define MAX_LIMIT (32767L << 16L)
73
74 static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
75 static char_u e_reverse_range[] = N_("E944: Reverse range in character class");
76 static char_u e_large_class[] = N_("E945: Range too large in character class");
77 #ifdef FEAT_SYN_HL
78 static char_u e_z_not_allowed[] = N_("E66: \\z( not allowed here");
79 static char_u e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here");
80 #endif
81 static char_u e_missing_sb[] = N_("E69: Missing ] after %s%%[");
82 static char_u e_empty_sb[] = N_("E70: Empty %s%%[]");
83 static char_u e_recursive[] = N_("E956: Cannot use pattern recursively");
84
85 #define NOT_MULTI 0
86 #define MULTI_ONE 1
87 #define MULTI_MULT 2
88
89 // return values for regmatch()
90 #define RA_FAIL 1 // something failed, abort
91 #define RA_CONT 2 // continue in inner loop
92 #define RA_BREAK 3 // break inner loop
93 #define RA_MATCH 4 // successful match
94 #define RA_NOMATCH 5 // didn't match
95
96 /*
97 * Return NOT_MULTI if c is not a "multi" operator.
98 * Return MULTI_ONE if c is a single "multi" operator.
99 * Return MULTI_MULT if c is a multi "multi" operator.
100 */
101 static int
re_multi_type(int c)102 re_multi_type(int c)
103 {
104 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
105 return MULTI_ONE;
106 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
107 return MULTI_MULT;
108 return NOT_MULTI;
109 }
110
111 static char_u *reg_prev_sub = NULL;
112
113 /*
114 * REGEXP_INRANGE contains all characters which are always special in a []
115 * range after '\'.
116 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
117 * These are:
118 * \n - New line (NL).
119 * \r - Carriage Return (CR).
120 * \t - Tab (TAB).
121 * \e - Escape (ESC).
122 * \b - Backspace (Ctrl_H).
123 * \d - Character code in decimal, eg \d123
 *	\o	- Character code in octal, eg \o40
125 * \x - Character code in hex, eg \x4a
126 * \u - Multibyte character code, eg \u20ac
127 * \U - Long multibyte character code, eg \U12345678
128 */
129 static char_u REGEXP_INRANGE[] = "]^-n\\";
130 static char_u REGEXP_ABBR[] = "nrtebdoxuU";
131
132 /*
133 * Translate '\x' to its control character, except "\n", which is Magic.
134 */
135 static int
backslash_trans(int c)136 backslash_trans(int c)
137 {
138 switch (c)
139 {
140 case 'r': return CAR;
141 case 't': return TAB;
142 case 'e': return ESC;
143 case 'b': return BS;
144 }
145 return c;
146 }
147
148 /*
149 * Check for a character class name "[:name:]". "pp" points to the '['.
150 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
151 * recognized. Otherwise "pp" is advanced to after the item.
152 */
153 static int
get_char_class(char_u ** pp)154 get_char_class(char_u **pp)
155 {
156 static const char *(class_names[]) =
157 {
158 "alnum:]",
159 #define CLASS_ALNUM 0
160 "alpha:]",
161 #define CLASS_ALPHA 1
162 "blank:]",
163 #define CLASS_BLANK 2
164 "cntrl:]",
165 #define CLASS_CNTRL 3
166 "digit:]",
167 #define CLASS_DIGIT 4
168 "graph:]",
169 #define CLASS_GRAPH 5
170 "lower:]",
171 #define CLASS_LOWER 6
172 "print:]",
173 #define CLASS_PRINT 7
174 "punct:]",
175 #define CLASS_PUNCT 8
176 "space:]",
177 #define CLASS_SPACE 9
178 "upper:]",
179 #define CLASS_UPPER 10
180 "xdigit:]",
181 #define CLASS_XDIGIT 11
182 "tab:]",
183 #define CLASS_TAB 12
184 "return:]",
185 #define CLASS_RETURN 13
186 "backspace:]",
187 #define CLASS_BACKSPACE 14
188 "escape:]",
189 #define CLASS_ESCAPE 15
190 "ident:]",
191 #define CLASS_IDENT 16
192 "keyword:]",
193 #define CLASS_KEYWORD 17
194 "fname:]",
195 #define CLASS_FNAME 18
196 };
197 #define CLASS_NONE 99
198 int i;
199
200 if ((*pp)[1] == ':')
201 {
202 for (i = 0; i < (int)ARRAY_LENGTH(class_names); ++i)
203 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
204 {
205 *pp += STRLEN(class_names[i]) + 2;
206 return i;
207 }
208 }
209 return CLASS_NONE;
210 }
211
212 /*
213 * Specific version of character class functions.
214 * Using a table to keep this fast.
215 */
216 static short class_tab[256];
217
218 #define RI_DIGIT 0x01
219 #define RI_HEX 0x02
220 #define RI_OCTAL 0x04
221 #define RI_WORD 0x08
222 #define RI_HEAD 0x10
223 #define RI_ALPHA 0x20
224 #define RI_LOWER 0x40
225 #define RI_UPPER 0x80
226 #define RI_WHITE 0x100
227
228 static void
init_class_tab(void)229 init_class_tab(void)
230 {
231 int i;
232 static int done = FALSE;
233
234 if (done)
235 return;
236
237 for (i = 0; i < 256; ++i)
238 {
239 if (i >= '0' && i <= '7')
240 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
241 else if (i >= '8' && i <= '9')
242 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
243 else if (i >= 'a' && i <= 'f')
244 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
245 #ifdef EBCDIC
246 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
247 || (i >= 's' && i <= 'z'))
248 #else
249 else if (i >= 'g' && i <= 'z')
250 #endif
251 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
252 else if (i >= 'A' && i <= 'F')
253 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
254 #ifdef EBCDIC
255 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
256 || (i >= 'S' && i <= 'Z'))
257 #else
258 else if (i >= 'G' && i <= 'Z')
259 #endif
260 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
261 else if (i == '_')
262 class_tab[i] = RI_WORD + RI_HEAD;
263 else
264 class_tab[i] = 0;
265 }
266 class_tab[' '] |= RI_WHITE;
267 class_tab['\t'] |= RI_WHITE;
268 done = TRUE;
269 }
270
// Class tests using class_tab[].  Values of 0x100 and above are never in a
// class.  "c" is evaluated twice, thus it must not have side effects.  It is
// put in parentheses so that an argument containing an operator, such as
// "a | b", is tested and used as an index as a whole.
#define ri_digit(c)	((c) < 0x100 && (class_tab[(c)] & RI_DIGIT))
#define ri_hex(c)	((c) < 0x100 && (class_tab[(c)] & RI_HEX))
#define ri_octal(c)	((c) < 0x100 && (class_tab[(c)] & RI_OCTAL))
#define ri_word(c)	((c) < 0x100 && (class_tab[(c)] & RI_WORD))
#define ri_head(c)	((c) < 0x100 && (class_tab[(c)] & RI_HEAD))
#define ri_alpha(c)	((c) < 0x100 && (class_tab[(c)] & RI_ALPHA))
#define ri_lower(c)	((c) < 0x100 && (class_tab[(c)] & RI_LOWER))
#define ri_upper(c)	((c) < 0x100 && (class_tab[(c)] & RI_UPPER))
#define ri_white(c)	((c) < 0x100 && (class_tab[(c)] & RI_WHITE))
280
281 // flags for regflags
282 #define RF_ICASE 1 // ignore case
283 #define RF_NOICASE 2 // don't ignore case
284 #define RF_HASNL 4 // can match a NL
285 #define RF_ICOMBINE 8 // ignore combining characters
286 #define RF_LOOKBH 16 // uses "\@<=" or "\@<!"
287
288 /*
289 * Global work variables for vim_regcomp().
290 */
291
292 static char_u *regparse; // Input-scan pointer.
293 static int regnpar; // () count.
294 static int wants_nfa; // regex should use NFA engine
295 #ifdef FEAT_SYN_HL
296 static int regnzpar; // \z() count.
297 static int re_has_z; // \z item detected
298 #endif
299 static unsigned regflags; // RF_ flags for prog
300 #if defined(FEAT_SYN_HL) || defined(PROTO)
301 static int had_eol; // TRUE when EOL found by vim_regcomp()
302 #endif
303
304 static magic_T reg_magic; // magicness of the pattern
305
306 static int reg_string; // matching with a string instead of a buffer
307 // line
308 static int reg_strict; // "[abc" is illegal
309
/*
 * META contains all characters that may be magic, except '^' and '$'.
 */

#ifdef EBCDIC
static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
#else
// META[] is used often enough to justify turning it into a table.
// The table is indexed by character value and has entries for 0 up to and
// including '~'.  Each row holds 16 entries; the comment above a row names
// the characters whose entry is 1.
static char_u META_flags[] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
//                 %  &     (  )  *  +        .
    0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
//     1  2  3  4  5  6  7  8  9        <  =  >  ?
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
//  @  A     C  D     F     H  I     K  L  M     O
    1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
//  P        S     U  V  W  X     Z  [           _
    1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
//     a     c  d     f     h  i     k  l  m  n  o
    0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
//  p        s     u  v  w  x     z  {  |     ~
    1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
};
#endif
335
336 static int curchr; // currently parsed character
337 // Previous character. Note: prevchr is sometimes -1 when we are not at the
338 // start, eg in /[ ^I]^ the pattern was never found even if it existed,
339 // because ^ was taken to be magic -- webb
340 static int prevchr;
341 static int prevprevchr; // previous-previous character
342 static int nextchr; // used for ungetchr()
343
344 // arguments for reg()
345 #define REG_NOPAREN 0 // toplevel reg()
346 #define REG_PAREN 1 // \(\)
347 #define REG_ZPAREN 2 // \z(\)
348 #define REG_NPAREN 3 // \%(\)
349
// Snapshot of the pattern parser state, filled by save_parse_state() and
// put back by restore_parse_state().  Every member holds the value of the
// file-level variable with the same name.
typedef struct
{
    char_u	*regparse;	// input-scan pointer
    int		prevchr_len;	// byte length of previous char
    int		curchr;		// currently parsed character
    int		prevchr;	// previous character
    int		prevprevchr;	// previous-previous character
    int		nextchr;	// used for ungetchr()
    int		at_start;	// on the first character
    int		prev_at_start;	// on the second character
    int		regnpar;	// () count
} parse_state_T;
362
363 static void initchr(char_u *);
364 static int getchr(void);
365 static void skipchr_keepstart(void);
366 static int peekchr(void);
367 static void skipchr(void);
368 static void ungetchr(void);
369 static long gethexchrs(int maxinputlen);
370 static long getoctchrs(void);
371 static long getdecchrs(void);
372 static int coll_get_char(void);
373 static int prog_magic_wrong(void);
374 static int cstrncmp(char_u *s1, char_u *s2, int *n);
375 static char_u *cstrchr(char_u *, int);
376 static int re_mult_next(char *what);
377 static int reg_iswordc(int);
378 #ifdef FEAT_EVAL
379 static void report_re_switch(char_u *pat);
380 #endif
381
382 static regengine_T bt_regengine;
383 static regengine_T nfa_regengine;
384
385 /*
386 * Return TRUE if compiled regular expression "prog" can match a line break.
387 */
388 int
re_multiline(regprog_T * prog)389 re_multiline(regprog_T *prog)
390 {
391 return (prog->regflags & RF_HASNL);
392 }
393
394 /*
395 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
396 * Returns a character representing the class. Zero means that no item was
397 * recognized. Otherwise "pp" is advanced to after the item.
398 */
399 static int
get_equi_class(char_u ** pp)400 get_equi_class(char_u **pp)
401 {
402 int c;
403 int l = 1;
404 char_u *p = *pp;
405
406 if (p[1] == '=' && p[2] != NUL)
407 {
408 if (has_mbyte)
409 l = (*mb_ptr2len)(p + 2);
410 if (p[l + 2] == '=' && p[l + 3] == ']')
411 {
412 if (has_mbyte)
413 c = mb_ptr2char(p + 2);
414 else
415 c = p[2];
416 *pp += l + 4;
417 return c;
418 }
419 }
420 return 0;
421 }
422
423 #ifdef EBCDIC
424 /*
425 * Table for equivalence class "c". (IBM-1047)
426 */
427 static char *EQUIVAL_CLASS_C[16] = {
428 "A\x62\x63\x64\x65\x66\x67",
429 "C\x68",
430 "E\x71\x72\x73\x74",
431 "I\x75\x76\x77\x78",
432 "N\x69",
433 "O\xEB\xEC\xED\xEE\xEF\x80",
434 "U\xFB\xFC\xFD\xFE",
435 "Y\xBA",
436 "a\x42\x43\x44\x45\x46\x47",
437 "c\x48",
438 "e\x51\x52\x53\x54",
439 "i\x55\x56\x57\x58",
440 "n\x49",
441 "o\xCB\xCC\xCD\xCE\xCF\x70",
442 "u\xDB\xDC\xDD\xDE",
443 "y\x8D\xDF",
444 };
445 #endif
446
447 /*
448 * Check for a collating element "[.a.]". "pp" points to the '['.
449 * Returns a character. Zero means that no item was recognized. Otherwise
450 * "pp" is advanced to after the item.
451 * Currently only single characters are recognized!
452 */
453 static int
get_coll_element(char_u ** pp)454 get_coll_element(char_u **pp)
455 {
456 int c;
457 int l = 1;
458 char_u *p = *pp;
459
460 if (p[0] != NUL && p[1] == '.' && p[2] != NUL)
461 {
462 if (has_mbyte)
463 l = (*mb_ptr2len)(p + 2);
464 if (p[l + 2] == '.' && p[l + 3] == ']')
465 {
466 if (has_mbyte)
467 c = mb_ptr2char(p + 2);
468 else
469 c = p[2];
470 *pp += l + 4;
471 return c;
472 }
473 }
474 return 0;
475 }
476
477 static int reg_cpo_lit; // 'cpoptions' contains 'l' flag
478 static int reg_cpo_bsl; // 'cpoptions' contains '\' flag
479
480 static void
get_cpo_flags(void)481 get_cpo_flags(void)
482 {
483 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
484 reg_cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
485 }
486
487 /*
488 * Skip over a "[]" range.
489 * "p" must point to the character after the '['.
490 * The returned pointer is on the matching ']', or the terminating NUL.
491 */
492 static char_u *
skip_anyof(char_u * p)493 skip_anyof(char_u *p)
494 {
495 int l;
496
497 if (*p == '^') // Complement of range.
498 ++p;
499 if (*p == ']' || *p == '-')
500 ++p;
501 while (*p != NUL && *p != ']')
502 {
503 if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1)
504 p += l;
505 else
506 if (*p == '-')
507 {
508 ++p;
509 if (*p != ']' && *p != NUL)
510 MB_PTR_ADV(p);
511 }
512 else if (*p == '\\'
513 && !reg_cpo_bsl
514 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
515 || (!reg_cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
516 p += 2;
517 else if (*p == '[')
518 {
519 if (get_char_class(&p) == CLASS_NONE
520 && get_equi_class(&p) == 0
521 && get_coll_element(&p) == 0
522 && *p != NUL)
523 ++p; // it is not a class name and not NUL
524 }
525 else
526 ++p;
527 }
528
529 return p;
530 }
531
532 /*
533 * Skip past regular expression.
534 * Stop at end of "startp" or where "delim" is found ('/', '?', etc).
535 * Take care of characters with a backslash in front of it.
536 * Skip strings inside [ and ].
537 */
538 char_u *
skip_regexp(char_u * startp,int delim,int magic)539 skip_regexp(
540 char_u *startp,
541 int delim,
542 int magic)
543 {
544 return skip_regexp_ex(startp, delim, magic, NULL, NULL, NULL);
545 }
546
547 /*
548 * Call skip_regexp() and when the delimiter does not match give an error and
549 * return NULL.
550 */
551 char_u *
skip_regexp_err(char_u * startp,int delim,int magic)552 skip_regexp_err(
553 char_u *startp,
554 int delim,
555 int magic)
556 {
557 char_u *p = skip_regexp(startp, delim, magic);
558
559 if (*p != delim)
560 {
561 semsg(_("E654: missing delimiter after search pattern: %s"), startp);
562 return NULL;
563 }
564 return p;
565 }
566
567 /*
568 * skip_regexp() with extra arguments:
569 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
570 * expression and change "\?" to "?". If "*newp" is not NULL the expression
571 * is changed in-place.
572 * If a "\?" is changed to "?" then "dropped" is incremented, unless NULL.
573 * If "magic_val" is not NULL, returns the effective magicness of the pattern
574 */
575 char_u *
skip_regexp_ex(char_u * startp,int dirc,int magic,char_u ** newp,int * dropped,magic_T * magic_val)576 skip_regexp_ex(
577 char_u *startp,
578 int dirc,
579 int magic,
580 char_u **newp,
581 int *dropped,
582 magic_T *magic_val)
583 {
584 magic_T mymagic;
585 char_u *p = startp;
586
587 if (magic)
588 mymagic = MAGIC_ON;
589 else
590 mymagic = MAGIC_OFF;
591 get_cpo_flags();
592
593 for (; p[0] != NUL; MB_PTR_ADV(p))
594 {
595 if (p[0] == dirc) // found end of regexp
596 break;
597 if ((p[0] == '[' && mymagic >= MAGIC_ON)
598 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
599 {
600 p = skip_anyof(p + 1);
601 if (p[0] == NUL)
602 break;
603 }
604 else if (p[0] == '\\' && p[1] != NUL)
605 {
606 if (dirc == '?' && newp != NULL && p[1] == '?')
607 {
608 // change "\?" to "?", make a copy first.
609 if (*newp == NULL)
610 {
611 *newp = vim_strsave(startp);
612 if (*newp != NULL)
613 p = *newp + (p - startp);
614 }
615 if (dropped != NULL)
616 ++*dropped;
617 if (*newp != NULL)
618 STRMOVE(p, p + 1);
619 else
620 ++p;
621 }
622 else
623 ++p; // skip next character
624 if (*p == 'v')
625 mymagic = MAGIC_ALL;
626 else if (*p == 'V')
627 mymagic = MAGIC_NONE;
628 }
629 }
630 if (magic_val != NULL)
631 *magic_val = mymagic;
632 return p;
633 }
634
635 /*
636 * Functions for getting characters from the regexp input.
637 */
638 static int prevchr_len; // byte length of previous char
639 static int at_start; // True when on the first character
640 static int prev_at_start; // True when on the second character
641
642 /*
643 * Start parsing at "str".
644 */
645 static void
initchr(char_u * str)646 initchr(char_u *str)
647 {
648 regparse = str;
649 prevchr_len = 0;
650 curchr = prevprevchr = prevchr = nextchr = -1;
651 at_start = TRUE;
652 prev_at_start = FALSE;
653 }
654
655 /*
656 * Save the current parse state, so that it can be restored and parsing
657 * starts in the same state again.
658 */
659 static void
save_parse_state(parse_state_T * ps)660 save_parse_state(parse_state_T *ps)
661 {
662 ps->regparse = regparse;
663 ps->prevchr_len = prevchr_len;
664 ps->curchr = curchr;
665 ps->prevchr = prevchr;
666 ps->prevprevchr = prevprevchr;
667 ps->nextchr = nextchr;
668 ps->at_start = at_start;
669 ps->prev_at_start = prev_at_start;
670 ps->regnpar = regnpar;
671 }
672
673 /*
674 * Restore a previously saved parse state.
675 */
676 static void
restore_parse_state(parse_state_T * ps)677 restore_parse_state(parse_state_T *ps)
678 {
679 regparse = ps->regparse;
680 prevchr_len = ps->prevchr_len;
681 curchr = ps->curchr;
682 prevchr = ps->prevchr;
683 prevprevchr = ps->prevprevchr;
684 nextchr = ps->nextchr;
685 at_start = ps->at_start;
686 prev_at_start = ps->prev_at_start;
687 regnpar = ps->regnpar;
688 }
689
690
691 /*
692 * Get the next character without advancing.
693 */
694 static int
peekchr(void)695 peekchr(void)
696 {
697 static int after_slash = FALSE;
698
699 if (curchr == -1)
700 {
701 switch (curchr = regparse[0])
702 {
703 case '.':
704 case '[':
705 case '~':
706 // magic when 'magic' is on
707 if (reg_magic >= MAGIC_ON)
708 curchr = Magic(curchr);
709 break;
710 case '(':
711 case ')':
712 case '{':
713 case '%':
714 case '+':
715 case '=':
716 case '?':
717 case '@':
718 case '!':
719 case '&':
720 case '|':
721 case '<':
722 case '>':
723 case '#': // future ext.
724 case '"': // future ext.
725 case '\'': // future ext.
726 case ',': // future ext.
727 case '-': // future ext.
728 case ':': // future ext.
729 case ';': // future ext.
730 case '`': // future ext.
731 case '/': // Can't be used in / command
732 // magic only after "\v"
733 if (reg_magic == MAGIC_ALL)
734 curchr = Magic(curchr);
735 break;
736 case '*':
737 // * is not magic as the very first character, eg "?*ptr", when
738 // after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
739 // "\(\*" is not magic, thus must be magic if "after_slash"
740 if (reg_magic >= MAGIC_ON
741 && !at_start
742 && !(prev_at_start && prevchr == Magic('^'))
743 && (after_slash
744 || (prevchr != Magic('(')
745 && prevchr != Magic('&')
746 && prevchr != Magic('|'))))
747 curchr = Magic('*');
748 break;
749 case '^':
750 // '^' is only magic as the very first character and if it's after
751 // "\(", "\|", "\&' or "\n"
752 if (reg_magic >= MAGIC_OFF
753 && (at_start
754 || reg_magic == MAGIC_ALL
755 || prevchr == Magic('(')
756 || prevchr == Magic('|')
757 || prevchr == Magic('&')
758 || prevchr == Magic('n')
759 || (no_Magic(prevchr) == '('
760 && prevprevchr == Magic('%'))))
761 {
762 curchr = Magic('^');
763 at_start = TRUE;
764 prev_at_start = FALSE;
765 }
766 break;
767 case '$':
768 // '$' is only magic as the very last char and if it's in front of
769 // either "\|", "\)", "\&", or "\n"
770 if (reg_magic >= MAGIC_OFF)
771 {
772 char_u *p = regparse + 1;
773 int is_magic_all = (reg_magic == MAGIC_ALL);
774
775 // ignore \c \C \m \M \v \V and \Z after '$'
776 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
777 || p[1] == 'm' || p[1] == 'M'
778 || p[1] == 'v' || p[1] == 'V' || p[1] == 'Z'))
779 {
780 if (p[1] == 'v')
781 is_magic_all = TRUE;
782 else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V')
783 is_magic_all = FALSE;
784 p += 2;
785 }
786 if (p[0] == NUL
787 || (p[0] == '\\'
788 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
789 || p[1] == 'n'))
790 || (is_magic_all
791 && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
792 || reg_magic == MAGIC_ALL)
793 curchr = Magic('$');
794 }
795 break;
796 case '\\':
797 {
798 int c = regparse[1];
799
800 if (c == NUL)
801 curchr = '\\'; // trailing '\'
802 else if (
803 #ifdef EBCDIC
804 vim_strchr(META, c)
805 #else
806 c <= '~' && META_flags[c]
807 #endif
808 )
809 {
810 /*
811 * META contains everything that may be magic sometimes,
812 * except ^ and $ ("\^" and "\$" are only magic after
813 * "\V"). We now fetch the next character and toggle its
814 * magicness. Therefore, \ is so meta-magic that it is
815 * not in META.
816 */
817 curchr = -1;
818 prev_at_start = at_start;
819 at_start = FALSE; // be able to say "/\*ptr"
820 ++regparse;
821 ++after_slash;
822 peekchr();
823 --regparse;
824 --after_slash;
825 curchr = toggle_Magic(curchr);
826 }
827 else if (vim_strchr(REGEXP_ABBR, c))
828 {
829 /*
830 * Handle abbreviations, like "\t" for TAB -- webb
831 */
832 curchr = backslash_trans(c);
833 }
834 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
835 curchr = toggle_Magic(c);
836 else
837 {
838 /*
839 * Next character can never be (made) magic?
840 * Then backslashing it won't do anything.
841 */
842 if (has_mbyte)
843 curchr = (*mb_ptr2char)(regparse + 1);
844 else
845 curchr = c;
846 }
847 break;
848 }
849
850 default:
851 if (has_mbyte)
852 curchr = (*mb_ptr2char)(regparse);
853 }
854 }
855
856 return curchr;
857 }
858
859 /*
860 * Eat one lexed character. Do this in a way that we can undo it.
861 */
862 static void
skipchr(void)863 skipchr(void)
864 {
865 // peekchr() eats a backslash, do the same here
866 if (*regparse == '\\')
867 prevchr_len = 1;
868 else
869 prevchr_len = 0;
870 if (regparse[prevchr_len] != NUL)
871 {
872 if (enc_utf8)
873 // exclude composing chars that mb_ptr2len does include
874 prevchr_len += utf_ptr2len(regparse + prevchr_len);
875 else if (has_mbyte)
876 prevchr_len += (*mb_ptr2len)(regparse + prevchr_len);
877 else
878 ++prevchr_len;
879 }
880 regparse += prevchr_len;
881 prev_at_start = at_start;
882 at_start = FALSE;
883 prevprevchr = prevchr;
884 prevchr = curchr;
885 curchr = nextchr; // use previously unget char, or -1
886 nextchr = -1;
887 }
888
889 /*
890 * Skip a character while keeping the value of prev_at_start for at_start.
891 * prevchr and prevprevchr are also kept.
892 */
893 static void
skipchr_keepstart(void)894 skipchr_keepstart(void)
895 {
896 int as = prev_at_start;
897 int pr = prevchr;
898 int prpr = prevprevchr;
899
900 skipchr();
901 at_start = as;
902 prevchr = pr;
903 prevprevchr = prpr;
904 }
905
/*
 * Return the next character of the pattern and consume it.  Characters are
 * classified as magic or literal by peekchr(), which makes this the lexical
 * analyzer of the pattern parser.
 */
static int
getchr(void)
{
    int	    c;

    c = peekchr();
    skipchr();
    return c;
}
918
919 /*
920 * put character back. Works only once!
921 */
922 static void
ungetchr(void)923 ungetchr(void)
924 {
925 nextchr = curchr;
926 curchr = prevchr;
927 prevchr = prevprevchr;
928 at_start = prev_at_start;
929 prev_at_start = FALSE;
930
931 // Backup regparse, so that it's at the same position as before the
932 // getchr().
933 regparse -= prevchr_len;
934 }
935
936 /*
937 * Get and return the value of the hex string at the current position.
938 * Return -1 if there is no valid hex number.
939 * The position is updated:
940 * blahblah\%x20asdf
941 * before-^ ^-after
942 * The parameter controls the maximum number of input characters. This will be
943 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
944 */
945 static long
gethexchrs(int maxinputlen)946 gethexchrs(int maxinputlen)
947 {
948 long_u nr = 0;
949 int c;
950 int i;
951
952 for (i = 0; i < maxinputlen; ++i)
953 {
954 c = regparse[0];
955 if (!vim_isxdigit(c))
956 break;
957 nr <<= 4;
958 nr |= hex2nr(c);
959 ++regparse;
960 }
961
962 if (i == 0)
963 return -1;
964 return (long)nr;
965 }
966
967 /*
968 * Get and return the value of the decimal string immediately after the
969 * current position. Return -1 for invalid. Consumes all digits.
970 */
971 static long
getdecchrs(void)972 getdecchrs(void)
973 {
974 long_u nr = 0;
975 int c;
976 int i;
977
978 for (i = 0; ; ++i)
979 {
980 c = regparse[0];
981 if (c < '0' || c > '9')
982 break;
983 nr *= 10;
984 nr += c - '0';
985 ++regparse;
986 curchr = -1; // no longer valid
987 }
988
989 if (i == 0)
990 return -1;
991 return (long)nr;
992 }
993
994 /*
995 * get and return the value of the octal string immediately after the current
996 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
997 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
998 * treat 8 or 9 as recognised characters. Position is updated:
999 * blahblah\%o210asdf
1000 * before-^ ^-after
1001 */
1002 static long
getoctchrs(void)1003 getoctchrs(void)
1004 {
1005 long_u nr = 0;
1006 int c;
1007 int i;
1008
1009 for (i = 0; i < 3 && nr < 040; ++i)
1010 {
1011 c = regparse[0];
1012 if (c < '0' || c > '7')
1013 break;
1014 nr <<= 3;
1015 nr |= hex2nr(c);
1016 ++regparse;
1017 }
1018
1019 if (i == 0)
1020 return -1;
1021 return (long)nr;
1022 }
1023
1024 /*
1025 * read_limits - Read two integers to be taken as a minimum and maximum.
1026 * If the first character is '-', then the range is reversed.
1027 * Should end with 'end'. If minval is missing, zero is default, if maxval is
1028 * missing, a very big number is the default.
1029 */
1030 static int
read_limits(long * minval,long * maxval)1031 read_limits(long *minval, long *maxval)
1032 {
1033 int reverse = FALSE;
1034 char_u *first_char;
1035 long tmp;
1036
1037 if (*regparse == '-')
1038 {
1039 // Starts with '-', so reverse the range later
1040 regparse++;
1041 reverse = TRUE;
1042 }
1043 first_char = regparse;
1044 *minval = getdigits(®parse);
1045 if (*regparse == ',') // There is a comma
1046 {
1047 if (vim_isdigit(*++regparse))
1048 *maxval = getdigits(®parse);
1049 else
1050 *maxval = MAX_LIMIT;
1051 }
1052 else if (VIM_ISDIGIT(*first_char))
1053 *maxval = *minval; // It was \{n} or \{-n}
1054 else
1055 *maxval = MAX_LIMIT; // It was \{} or \{-}
1056 if (*regparse == '\\')
1057 regparse++; // Allow either \{...} or \{...\}
1058 if (*regparse != '}')
1059 EMSG2_RET_FAIL(_("E554: Syntax error in %s{...}"),
1060 reg_magic == MAGIC_ALL);
1061
1062 /*
1063 * Reverse the range if there was a '-', or make sure it is in the right
1064 * order otherwise.
1065 */
1066 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
1067 {
1068 tmp = *minval;
1069 *minval = *maxval;
1070 *maxval = tmp;
1071 }
1072 skipchr(); // let's be friends with the lexer again
1073 return OK;
1074 }
1075
1076 /*
1077 * vim_regexec and friends
1078 */
1079
1080 /*
1081 * Global work variables for vim_regexec().
1082 */
1083
1084 static void cleanup_subexpr(void);
1085 #ifdef FEAT_SYN_HL
1086 static void cleanup_zsubexpr(void);
1087 #endif
1088 static void reg_nextline(void);
1089 static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen);
1090
1091 /*
1092 * Sometimes need to save a copy of a line. Since alloc()/free() is very
1093 * slow, we keep one allocated piece of memory and only re-allocate it when
1094 * it's too small. It's freed in bt_regexec_both() when finished.
1095 */
1096 static char_u *reg_tofree = NULL;
1097 static unsigned reg_tofreelen;
1098
/*
 * Structure used to store the execution state of the regex engine.
 * Which ones are set depends on whether a single-line or multi-line match is
 * done:
 *			single-line		multi-line
 * reg_match		&regmatch_T		NULL
 * reg_mmatch		NULL			&regmmatch_T
 * reg_startp		reg_match->startp	<invalid>
 * reg_endp		reg_match->endp		<invalid>
 * reg_startpos		<invalid>		reg_mmatch->startpos
 * reg_endpos		<invalid>		reg_mmatch->endpos
 * reg_win		NULL			window in which to search
 * reg_buf		curbuf			buffer in which to search
 * reg_firstlnum	<invalid>		first line in which to search
 * reg_maxline		0			last line nr
 * reg_line_lbr		FALSE or TRUE		FALSE
 */
typedef struct {
    regmatch_T		*reg_match;
    regmmatch_T		*reg_mmatch;
    char_u		**reg_startp;
    char_u		**reg_endp;
    lpos_T		*reg_startpos;
    lpos_T		*reg_endpos;
    win_T		*reg_win;
    buf_T		*reg_buf;
    linenr_T		reg_firstlnum;
    linenr_T		reg_maxline;
    int			reg_line_lbr;	// "\n" in string is line break

    // The current match-position is stored in these variables:
    linenr_T	lnum;		// line number, relative to first line
    char_u	*line;		// start of current line
    char_u	*input;		// current input, points into "line"

    int	need_clear_subexpr;	// subexpressions still need to be cleared
#ifdef FEAT_SYN_HL
    int	need_clear_zsubexpr;	// extmatch subexpressions still need to be
				// cleared
#endif

    // Internal copy of 'ignorecase'.  It is set at each call to vim_regexec().
    // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
    // contains '\c' or '\C' the value is overruled.
    int			reg_ic;

    // Similar to "reg_ic", but only for 'combining' characters.  Set with \Z
    // flag in the regexp.  Defaults to false, always.
    int			reg_icombine;

    // Copy of "rmm_maxcol": maximum column to search for a match.  Zero when
    // there is no maximum.
    colnr_T		reg_maxcol;

    // State for the NFA engine regexec.
    int nfa_has_zend;	    // NFA regexp \ze operator encountered.
    int nfa_has_backref;    // NFA regexp \1 .. \9 encountered.
    int nfa_nsubexpr;	    // Number of sub expressions actually being used
			    // during execution. 1 if only the whole match
			    // (subexpr 0) is used.
    // listid is global, so that it increases on recursive calls to
    // nfa_regmatch(), which means we don't have to clear the lastlist field of
    // all the states.
    int nfa_listid;
    int nfa_alt_listid;

#ifdef FEAT_SYN_HL
    int nfa_has_zsubexpr;   // NFA regexp has \z( ), set zsubexpr.
#endif
} regexec_T;
1169
// The one and only regexp execution state.
static regexec_T	rex;
// TRUE while "rex" is being used.  vim_regsub() and vim_regsub_multi() check
// this to save and restore "rex" around a nested call.
static int		rex_in_use = FALSE;
1172
1173 /*
1174 * Return TRUE if character 'c' is included in 'iskeyword' option for
1175 * "reg_buf" buffer.
1176 */
1177 static int
reg_iswordc(int c)1178 reg_iswordc(int c)
1179 {
1180 return vim_iswordc_buf(c, rex.reg_buf);
1181 }
1182
1183 /*
1184 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
1185 */
1186 static char_u *
reg_getline(linenr_T lnum)1187 reg_getline(linenr_T lnum)
1188 {
1189 // when looking behind for a match/no-match lnum is negative. But we
1190 // can't go before line 1
1191 if (rex.reg_firstlnum + lnum < 1)
1192 return NULL;
1193 if (lnum > rex.reg_maxline)
1194 // Must have matched the "\n" in the last line.
1195 return (char_u *)"";
1196 return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, FALSE);
1197 }
1198
1199 #ifdef FEAT_SYN_HL
// Workspace for \z(...\) matches.  cleanup_zsubexpr() resets the pointer
// arrays to NULL when matching a string and fills the position arrays with
// 0xff bytes (lnum becomes -1) when matching buffer lines.
static char_u	*reg_startzp[NSUBEXP];	// Workspace to mark beginning
static char_u	*reg_endzp[NSUBEXP];	//   and end of \z(...\) matches
static lpos_T	reg_startzpos[NSUBEXP];	// idem, beginning pos
static lpos_T	reg_endzpos[NSUBEXP];	// idem, end pos
1204 #endif
1205
// TRUE if using multi-line regexp: "rex.reg_match" is NULL and
// "rex.reg_mmatch" is used instead (this is how vim_regsub_multi() sets up
// "rex", vim_regsub() does the opposite).
#define REG_MULTI	(rex.reg_match == NULL)
1208
1209 #ifdef FEAT_SYN_HL
1210 /*
1211 * Create a new extmatch and mark it as referenced once.
1212 */
1213 static reg_extmatch_T *
make_extmatch(void)1214 make_extmatch(void)
1215 {
1216 reg_extmatch_T *em;
1217
1218 em = ALLOC_CLEAR_ONE(reg_extmatch_T);
1219 if (em != NULL)
1220 em->refcnt = 1;
1221 return em;
1222 }
1223
1224 /*
1225 * Add a reference to an extmatch.
1226 */
1227 reg_extmatch_T *
ref_extmatch(reg_extmatch_T * em)1228 ref_extmatch(reg_extmatch_T *em)
1229 {
1230 if (em != NULL)
1231 em->refcnt++;
1232 return em;
1233 }
1234
1235 /*
1236 * Remove a reference to an extmatch. If there are no references left, free
1237 * the info.
1238 */
1239 void
unref_extmatch(reg_extmatch_T * em)1240 unref_extmatch(reg_extmatch_T *em)
1241 {
1242 int i;
1243
1244 if (em != NULL && --em->refcnt <= 0)
1245 {
1246 for (i = 0; i < NSUBEXP; ++i)
1247 vim_free(em->matches[i]);
1248 vim_free(em);
1249 }
1250 }
1251 #endif
1252
1253 /*
1254 * Get class of previous character.
1255 */
1256 static int
reg_prev_class(void)1257 reg_prev_class(void)
1258 {
1259 if (rex.input > rex.line)
1260 return mb_get_class_buf(rex.input - 1
1261 - (*mb_head_off)(rex.line, rex.input - 1), rex.reg_buf);
1262 return -1;
1263 }
1264
1265 /*
1266 * Return TRUE if the current rex.input position matches the Visual area.
1267 */
1268 static int
reg_match_visual(void)1269 reg_match_visual(void)
1270 {
1271 pos_T top, bot;
1272 linenr_T lnum;
1273 colnr_T col;
1274 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
1275 int mode;
1276 colnr_T start, end;
1277 colnr_T start2, end2;
1278 colnr_T cols;
1279 colnr_T curswant;
1280
1281 // Check if the buffer is the current buffer.
1282 if (rex.reg_buf != curbuf || VIsual.lnum == 0)
1283 return FALSE;
1284
1285 if (VIsual_active)
1286 {
1287 if (LT_POS(VIsual, wp->w_cursor))
1288 {
1289 top = VIsual;
1290 bot = wp->w_cursor;
1291 }
1292 else
1293 {
1294 top = wp->w_cursor;
1295 bot = VIsual;
1296 }
1297 mode = VIsual_mode;
1298 curswant = wp->w_curswant;
1299 }
1300 else
1301 {
1302 if (LT_POS(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end))
1303 {
1304 top = curbuf->b_visual.vi_start;
1305 bot = curbuf->b_visual.vi_end;
1306 }
1307 else
1308 {
1309 top = curbuf->b_visual.vi_end;
1310 bot = curbuf->b_visual.vi_start;
1311 }
1312 mode = curbuf->b_visual.vi_mode;
1313 curswant = curbuf->b_visual.vi_curswant;
1314 }
1315 lnum = rex.lnum + rex.reg_firstlnum;
1316 if (lnum < top.lnum || lnum > bot.lnum)
1317 return FALSE;
1318
1319 if (mode == 'v')
1320 {
1321 col = (colnr_T)(rex.input - rex.line);
1322 if ((lnum == top.lnum && col < top.col)
1323 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e')))
1324 return FALSE;
1325 }
1326 else if (mode == Ctrl_V)
1327 {
1328 getvvcol(wp, &top, &start, NULL, &end);
1329 getvvcol(wp, &bot, &start2, NULL, &end2);
1330 if (start2 < start)
1331 start = start2;
1332 if (end2 > end)
1333 end = end2;
1334 if (top.col == MAXCOL || bot.col == MAXCOL || curswant == MAXCOL)
1335 end = MAXCOL;
1336 cols = win_linetabsize(wp, rex.line, (colnr_T)(rex.input - rex.line));
1337 if (cols < start || cols > end - (*p_sel == 'e'))
1338 return FALSE;
1339 }
1340 return TRUE;
1341 }
1342
1343 /*
1344 * Check the regexp program for its magic number.
1345 * Return TRUE if it's wrong.
1346 */
1347 static int
prog_magic_wrong(void)1348 prog_magic_wrong(void)
1349 {
1350 regprog_T *prog;
1351
1352 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
1353 if (prog->engine == &nfa_regengine)
1354 // For NFA matcher we don't check the magic
1355 return FALSE;
1356
1357 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
1358 {
1359 emsg(_(e_corrupted_regexp_program));
1360 return TRUE;
1361 }
1362 return FALSE;
1363 }
1364
1365 /*
1366 * Cleanup the subexpressions, if this wasn't done yet.
1367 * This construction is used to clear the subexpressions only when they are
1368 * used (to increase speed).
1369 */
1370 static void
cleanup_subexpr(void)1371 cleanup_subexpr(void)
1372 {
1373 if (rex.need_clear_subexpr)
1374 {
1375 if (REG_MULTI)
1376 {
1377 // Use 0xff to set lnum to -1
1378 vim_memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1379 vim_memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1380 }
1381 else
1382 {
1383 vim_memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
1384 vim_memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
1385 }
1386 rex.need_clear_subexpr = FALSE;
1387 }
1388 }
1389
1390 #ifdef FEAT_SYN_HL
1391 static void
cleanup_zsubexpr(void)1392 cleanup_zsubexpr(void)
1393 {
1394 if (rex.need_clear_zsubexpr)
1395 {
1396 if (REG_MULTI)
1397 {
1398 // Use 0xff to set lnum to -1
1399 vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1400 vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1401 }
1402 else
1403 {
1404 vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
1405 vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
1406 }
1407 rex.need_clear_zsubexpr = FALSE;
1408 }
1409 }
1410 #endif
1411
1412 /*
1413 * Advance rex.lnum, rex.line and rex.input to the next line.
1414 */
1415 static void
reg_nextline(void)1416 reg_nextline(void)
1417 {
1418 rex.line = reg_getline(++rex.lnum);
1419 rex.input = rex.line;
1420 fast_breakcheck();
1421 }
1422
1423 /*
1424 * Check whether a backreference matches.
1425 * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
1426 * If "bytelen" is not NULL, it is set to the byte length of the match in the
1427 * last line.
1428 */
1429 static int
match_with_backref(linenr_T start_lnum,colnr_T start_col,linenr_T end_lnum,colnr_T end_col,int * bytelen)1430 match_with_backref(
1431 linenr_T start_lnum,
1432 colnr_T start_col,
1433 linenr_T end_lnum,
1434 colnr_T end_col,
1435 int *bytelen)
1436 {
1437 linenr_T clnum = start_lnum;
1438 colnr_T ccol = start_col;
1439 int len;
1440 char_u *p;
1441
1442 if (bytelen != NULL)
1443 *bytelen = 0;
1444 for (;;)
1445 {
1446 // Since getting one line may invalidate the other, need to make copy.
1447 // Slow!
1448 if (rex.line != reg_tofree)
1449 {
1450 len = (int)STRLEN(rex.line);
1451 if (reg_tofree == NULL || len >= (int)reg_tofreelen)
1452 {
1453 len += 50; // get some extra
1454 vim_free(reg_tofree);
1455 reg_tofree = alloc(len);
1456 if (reg_tofree == NULL)
1457 return RA_FAIL; // out of memory!
1458 reg_tofreelen = len;
1459 }
1460 STRCPY(reg_tofree, rex.line);
1461 rex.input = reg_tofree + (rex.input - rex.line);
1462 rex.line = reg_tofree;
1463 }
1464
1465 // Get the line to compare with.
1466 p = reg_getline(clnum);
1467 if (clnum == end_lnum)
1468 len = end_col - ccol;
1469 else
1470 len = (int)STRLEN(p + ccol);
1471
1472 if (cstrncmp(p + ccol, rex.input, &len) != 0)
1473 return RA_NOMATCH; // doesn't match
1474 if (bytelen != NULL)
1475 *bytelen += len;
1476 if (clnum == end_lnum)
1477 break; // match and at end!
1478 if (rex.lnum >= rex.reg_maxline)
1479 return RA_NOMATCH; // text too short
1480
1481 // Advance to next line.
1482 reg_nextline();
1483 if (bytelen != NULL)
1484 *bytelen = 0;
1485 ++clnum;
1486 ccol = 0;
1487 if (got_int)
1488 return RA_FAIL;
1489 }
1490
1491 // found a match! Note that rex.line may now point to a copy of the line,
1492 // that should not matter.
1493 return RA_MATCH;
1494 }
1495
1496 /*
1497 * Used in a place where no * or \+ can follow.
1498 */
1499 static int
re_mult_next(char * what)1500 re_mult_next(char *what)
1501 {
1502 if (re_multi_type(peekchr()) == MULTI_MULT)
1503 {
1504 semsg(_("E888: (NFA regexp) cannot repeat %s"), what);
1505 rc_did_emsg = TRUE;
1506 return FAIL;
1507 }
1508 return OK;
1509 }
1510
// One decomposition: up to three characters, unused positions are zero.
typedef struct
{
    int a, b, c;
} decomp_T;


// Decompositions for the characters 0xfb20 - 0xfb4f, indexed by the character
// minus 0xfb20.  Entries marked UNUSED map the character to itself.
static decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2,  0,     0},		// 0xfb20	alt ayin
    {0x5d0,  0,     0},		// 0xfb21	alt alef
    {0x5d3,  0,     0},		// 0xfb22	alt dalet
    {0x5d4,  0,     0},		// 0xfb23	alt he
    {0x5db,  0,     0},		// 0xfb24	alt kaf
    {0x5dc,  0,     0},		// 0xfb25	alt lamed
    {0x5dd,  0,     0},		// 0xfb26	alt mem-sofit
    {0x5e8,  0,     0},		// 0xfb27	alt resh
    {0x5ea,  0,     0},		// 0xfb28	alt tav
    {'+',    0,     0},		// 0xfb29	alt plus
    {0x5e9,  0x5c1, 0},		// 0xfb2a	shin+shin-dot
    {0x5e9,  0x5c2, 0},		// 0xfb2b	shin+sin-dot
    {0x5e9,  0x5c1, 0x5bc},	// 0xfb2c	shin+shin-dot+dagesh
    {0x5e9,  0x5c2, 0x5bc},	// 0xfb2d	shin+sin-dot+dagesh
    {0x5d0,  0x5b7, 0},		// 0xfb2e	alef+patah
    {0x5d0,  0x5b8, 0},		// 0xfb2f	alef+qamats
    {0x5d0,  0x5b4, 0},		// 0xfb30	alef+hiriq
    {0x5d1,  0x5bc, 0},		// 0xfb31	bet+dagesh
    {0x5d2,  0x5bc, 0},		// 0xfb32	gimel+dagesh
    {0x5d3,  0x5bc, 0},		// 0xfb33	dalet+dagesh
    {0x5d4,  0x5bc, 0},		// 0xfb34	he+dagesh
    {0x5d5,  0x5bc, 0},		// 0xfb35	vav+dagesh
    {0x5d6,  0x5bc, 0},		// 0xfb36	zayin+dagesh
    {0xfb37, 0,     0},		// 0xfb37 -- UNUSED
    {0x5d8,  0x5bc, 0},		// 0xfb38	tet+dagesh
    {0x5d9,  0x5bc, 0},		// 0xfb39	yud+dagesh
    {0x5da,  0x5bc, 0},		// 0xfb3a	kaf sofit+dagesh
    {0x5db,  0x5bc, 0},		// 0xfb3b	kaf+dagesh
    {0x5dc,  0x5bc, 0},		// 0xfb3c	lamed+dagesh
    {0xfb3d, 0,     0},		// 0xfb3d -- UNUSED
    {0x5de,  0x5bc, 0},		// 0xfb3e	mem+dagesh
    {0xfb3f, 0,     0},		// 0xfb3f -- UNUSED
    {0x5e0,  0x5bc, 0},		// 0xfb40	nun+dagesh
    {0x5e1,  0x5bc, 0},		// 0xfb41	samech+dagesh
    {0xfb42, 0,     0},		// 0xfb42 -- UNUSED
    {0x5e3,  0x5bc, 0},		// 0xfb43	pe sofit+dagesh
    {0x5e4,  0x5bc, 0},		// 0xfb44	pe+dagesh
    {0xfb45, 0,     0},		// 0xfb45 -- UNUSED
    {0x5e6,  0x5bc, 0},		// 0xfb46	tsadi+dagesh
    {0x5e7,  0x5bc, 0},		// 0xfb47	qof+dagesh
    {0x5e8,  0x5bc, 0},		// 0xfb48	resh+dagesh
    {0x5e9,  0x5bc, 0},		// 0xfb49	shin+dagesh
    {0x5ea,  0x5bc, 0},		// 0xfb4a	tav+dagesh
    {0x5d5,  0x5b9, 0},		// 0xfb4b	vav+holam
    {0x5d1,  0x5bf, 0},		// 0xfb4c	bet+rafe
    {0x5db,  0x5bf, 0},		// 0xfb4d	kaf+rafe
    {0x5e4,  0x5bf, 0},		// 0xfb4e	pe+rafe
    {0x5d0,  0x5dc, 0}		// 0xfb4f	alef-lamed
};

/*
 * Decompose character "c" into up to three characters, stored in "*c1",
 * "*c2" and "*c3".  Only characters 0xfb20 - 0xfb4f have a table entry; any
 * other character is stored in "*c1" unchanged with "*c2" and "*c3" set to
 * zero.
 */
static void
mb_decompose(int c, int *c1, int *c2, int *c3)
{
    decomp_T	*entry;

    if (c < 0xfb20 || c > 0xfb4f)
    {
	// Not in the table: the character stands for itself.
	*c1 = c;
	*c2 = 0;
	*c3 = 0;
	return;
    }

    entry = &decomp_table[c - 0xfb20];
    *c1 = entry->a;
    *c2 = entry->b;
    *c3 = entry->c;
}
1588
1589 /*
1590 * Compare two strings, ignore case if rex.reg_ic set.
1591 * Return 0 if strings match, non-zero otherwise.
1592 * Correct the length "*n" when composing characters are ignored.
1593 */
1594 static int
cstrncmp(char_u * s1,char_u * s2,int * n)1595 cstrncmp(char_u *s1, char_u *s2, int *n)
1596 {
1597 int result;
1598
1599 if (!rex.reg_ic)
1600 result = STRNCMP(s1, s2, *n);
1601 else
1602 result = MB_STRNICMP(s1, s2, *n);
1603
1604 // if it failed and it's utf8 and we want to combineignore:
1605 if (result != 0 && enc_utf8 && rex.reg_icombine)
1606 {
1607 char_u *str1, *str2;
1608 int c1, c2, c11, c12;
1609 int junk;
1610
1611 // we have to handle the strcmp ourselves, since it is necessary to
1612 // deal with the composing characters by ignoring them:
1613 str1 = s1;
1614 str2 = s2;
1615 c1 = c2 = 0;
1616 while ((int)(str1 - s1) < *n)
1617 {
1618 c1 = mb_ptr2char_adv(&str1);
1619 c2 = mb_ptr2char_adv(&str2);
1620
1621 // Decompose the character if necessary, into 'base' characters.
1622 // Currently hard-coded for Hebrew, Arabic to be done...
1623 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
1624 {
1625 // decomposition necessary?
1626 mb_decompose(c1, &c11, &junk, &junk);
1627 mb_decompose(c2, &c12, &junk, &junk);
1628 c1 = c11;
1629 c2 = c12;
1630 if (c11 != c12
1631 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12)))
1632 break;
1633 }
1634 }
1635 result = c2 - c1;
1636 if (result == 0)
1637 *n = (int)(str2 - s2);
1638 }
1639
1640 return result;
1641 }
1642
1643 /*
1644 * cstrchr: This function is used a lot for simple searches, keep it fast!
1645 */
1646 static char_u *
cstrchr(char_u * s,int c)1647 cstrchr(char_u *s, int c)
1648 {
1649 char_u *p;
1650 int cc;
1651
1652 if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
1653 return vim_strchr(s, c);
1654
1655 // tolower() and toupper() can be slow, comparing twice should be a lot
1656 // faster (esp. when using MS Visual C++!).
1657 // For UTF-8 need to use folded case.
1658 if (enc_utf8 && c > 0x80)
1659 cc = utf_fold(c);
1660 else
1661 if (MB_ISUPPER(c))
1662 cc = MB_TOLOWER(c);
1663 else if (MB_ISLOWER(c))
1664 cc = MB_TOUPPER(c);
1665 else
1666 return vim_strchr(s, c);
1667
1668 if (has_mbyte)
1669 {
1670 for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
1671 {
1672 if (enc_utf8 && c > 0x80)
1673 {
1674 if (utf_fold(utf_ptr2char(p)) == cc)
1675 return p;
1676 }
1677 else if (*p == c || *p == cc)
1678 return p;
1679 }
1680 }
1681 else
1682 // Faster version for when there are no multi-byte characters.
1683 for (p = s; *p != NUL; ++p)
1684 if (*p == c || *p == cc)
1685 return p;
1686
1687 return NULL;
1688 }
1689
1690 ////////////////////////////////////////////////////////////////
1691 // regsub stuff //
1692 ////////////////////////////////////////////////////////////////
1693
/*
 * We should define fptr_T as a pointer to a function returning a pointer to
 * a function returning a pointer to a function ...
 * This is impossible, so we declare a pointer to a function returning a
 * void pointer.  This should work for all compilers.
 * The do_upper()/do_Upper()/do_lower()/do_Lower() functions have this
 * signature: they store the converted character in the first argument and
 * return the function to use for the next character (or NULL).
 */
typedef void (*(*fptr_T)(int *, int));
1701
1702 static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, int copy, int magic, int backslash);
1703
1704 static fptr_T
do_upper(int * d,int c)1705 do_upper(int *d, int c)
1706 {
1707 *d = MB_TOUPPER(c);
1708
1709 return (fptr_T)NULL;
1710 }
1711
1712 static fptr_T
do_Upper(int * d,int c)1713 do_Upper(int *d, int c)
1714 {
1715 *d = MB_TOUPPER(c);
1716
1717 return (fptr_T)do_Upper;
1718 }
1719
1720 static fptr_T
do_lower(int * d,int c)1721 do_lower(int *d, int c)
1722 {
1723 *d = MB_TOLOWER(c);
1724
1725 return (fptr_T)NULL;
1726 }
1727
1728 static fptr_T
do_Lower(int * d,int c)1729 do_Lower(int *d, int c)
1730 {
1731 *d = MB_TOLOWER(c);
1732
1733 return (fptr_T)do_Lower;
1734 }
1735
1736 /*
1737 * regtilde(): Replace tildes in the pattern by the old pattern.
1738 *
1739 * Short explanation of the tilde: It stands for the previous replacement
1740 * pattern. If that previous pattern also contains a ~ we should go back a
1741 * step further... But we insert the previous pattern into the current one
1742 * and remember that.
1743 * This still does not handle the case where "magic" changes. So require the
1744 * user to keep his hands off of "magic".
1745 *
1746 * The tildes are parsed once before the first call to vim_regsub().
1747 */
1748 char_u *
regtilde(char_u * source,int magic)1749 regtilde(char_u *source, int magic)
1750 {
1751 char_u *newsub = source;
1752 char_u *tmpsub;
1753 char_u *p;
1754 int len;
1755 int prevlen;
1756
1757 for (p = newsub; *p; ++p)
1758 {
1759 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
1760 {
1761 if (reg_prev_sub != NULL)
1762 {
1763 // length = len(newsub) - 1 + len(prev_sub) + 1
1764 prevlen = (int)STRLEN(reg_prev_sub);
1765 tmpsub = alloc(STRLEN(newsub) + prevlen);
1766 if (tmpsub != NULL)
1767 {
1768 // copy prefix
1769 len = (int)(p - newsub); // not including ~
1770 mch_memmove(tmpsub, newsub, (size_t)len);
1771 // interpret tilde
1772 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
1773 // copy postfix
1774 if (!magic)
1775 ++p; // back off backslash
1776 STRCPY(tmpsub + len + prevlen, p + 1);
1777
1778 if (newsub != source) // already allocated newsub
1779 vim_free(newsub);
1780 newsub = tmpsub;
1781 p = newsub + len + prevlen;
1782 }
1783 }
1784 else if (magic)
1785 STRMOVE(p, p + 1); // remove '~'
1786 else
1787 STRMOVE(p, p + 2); // remove '\~'
1788 --p;
1789 }
1790 else
1791 {
1792 if (*p == '\\' && p[1]) // skip escaped characters
1793 ++p;
1794 if (has_mbyte)
1795 p += (*mb_ptr2len)(p) - 1;
1796 }
1797 }
1798
1799 vim_free(reg_prev_sub);
1800 if (newsub != source) // newsub was allocated, just keep it
1801 reg_prev_sub = newsub;
1802 else // no ~ found, need to save newsub
1803 reg_prev_sub = vim_strsave(newsub);
1804 return newsub;
1805 }
1806
1807 #ifdef FEAT_EVAL
// TRUE when submatch() can be used.  Set by vim_regsub_both() while it
// evaluates an expression and restored to the previous value afterwards.
static int can_f_submatch = FALSE;

// These pointers are used for reg_submatch().  Needed for when the
// substitution string is an expression that contains a call to substitute()
// and submatch().
// vim_regsub_both() copies the fields from the corresponding "rex" fields
// before evaluating the expression.
typedef struct {
    regmatch_T	*sm_match;	// copy of rex.reg_match
    regmmatch_T	*sm_mmatch;	// copy of rex.reg_mmatch
    linenr_T	sm_firstlnum;	// copy of rex.reg_firstlnum
    linenr_T	sm_maxline;	// copy of rex.reg_maxline
    int		sm_line_lbr;	// copy of rex.reg_line_lbr
} regsubmatch_T;

static regsubmatch_T rsm;  // can only be used when can_f_submatch is TRUE
1822 #endif
1823
1824 #ifdef FEAT_EVAL
1825
1826 /*
1827 * Put the submatches in "argv[argskip]" which is a list passed into
1828 * call_func() by vim_regsub_both().
1829 */
1830 static int
fill_submatch_list(int argc UNUSED,typval_T * argv,int argskip,int argcount)1831 fill_submatch_list(int argc UNUSED, typval_T *argv, int argskip, int argcount)
1832 {
1833 listitem_T *li;
1834 int i;
1835 char_u *s;
1836 typval_T *listarg = argv + argskip;
1837
1838 if (argcount == argskip)
1839 // called function doesn't take a submatches argument
1840 return argskip;
1841
1842 // Relies on sl_list to be the first item in staticList10_T.
1843 init_static_list((staticList10_T *)(listarg->vval.v_list));
1844
1845 // There are always 10 list items in staticList10_T.
1846 li = listarg->vval.v_list->lv_first;
1847 for (i = 0; i < 10; ++i)
1848 {
1849 s = rsm.sm_match->startp[i];
1850 if (s == NULL || rsm.sm_match->endp[i] == NULL)
1851 s = NULL;
1852 else
1853 s = vim_strnsave(s, rsm.sm_match->endp[i] - s);
1854 li->li_tv.v_type = VAR_STRING;
1855 li->li_tv.vval.v_string = s;
1856 li = li->li_next;
1857 }
1858 return argskip + 1;
1859 }
1860
1861 static void
clear_submatch_list(staticList10_T * sl)1862 clear_submatch_list(staticList10_T *sl)
1863 {
1864 int i;
1865
1866 for (i = 0; i < 10; ++i)
1867 vim_free(sl->sl_items[i].li_tv.vval.v_string);
1868 }
1869 #endif
1870
1871 /*
1872 * vim_regsub() - perform substitutions after a vim_regexec() or
1873 * vim_regexec_multi() match.
1874 *
1875 * If "copy" is TRUE really copy into "dest".
1876 * If "copy" is FALSE nothing is copied, this is just to find out the length
1877 * of the result.
1878 *
1879 * If "backslash" is TRUE, a backslash will be removed later, need to double
1880 * them to keep them, and insert a backslash before a CR to avoid it being
1881 * replaced with a line break later.
1882 *
1883 * Note: The matched text must not change between the call of
1884 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
1885 * references invalid!
1886 *
1887 * Returns the size of the replacement, including terminating NUL.
1888 */
1889 int
vim_regsub(regmatch_T * rmp,char_u * source,typval_T * expr,char_u * dest,int copy,int magic,int backslash)1890 vim_regsub(
1891 regmatch_T *rmp,
1892 char_u *source,
1893 typval_T *expr,
1894 char_u *dest,
1895 int copy,
1896 int magic,
1897 int backslash)
1898 {
1899 int result;
1900 regexec_T rex_save;
1901 int rex_in_use_save = rex_in_use;
1902
1903 if (rex_in_use)
1904 // Being called recursively, save the state.
1905 rex_save = rex;
1906 rex_in_use = TRUE;
1907
1908 rex.reg_match = rmp;
1909 rex.reg_mmatch = NULL;
1910 rex.reg_maxline = 0;
1911 rex.reg_buf = curbuf;
1912 rex.reg_line_lbr = TRUE;
1913 result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
1914
1915 rex_in_use = rex_in_use_save;
1916 if (rex_in_use)
1917 rex = rex_save;
1918
1919 return result;
1920 }
1921
1922 int
vim_regsub_multi(regmmatch_T * rmp,linenr_T lnum,char_u * source,char_u * dest,int copy,int magic,int backslash)1923 vim_regsub_multi(
1924 regmmatch_T *rmp,
1925 linenr_T lnum,
1926 char_u *source,
1927 char_u *dest,
1928 int copy,
1929 int magic,
1930 int backslash)
1931 {
1932 int result;
1933 regexec_T rex_save;
1934 int rex_in_use_save = rex_in_use;
1935
1936 if (rex_in_use)
1937 // Being called recursively, save the state.
1938 rex_save = rex;
1939 rex_in_use = TRUE;
1940
1941 rex.reg_match = NULL;
1942 rex.reg_mmatch = rmp;
1943 rex.reg_buf = curbuf; // always works on the current buffer!
1944 rex.reg_firstlnum = lnum;
1945 rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
1946 rex.reg_line_lbr = FALSE;
1947 result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
1948
1949 rex_in_use = rex_in_use_save;
1950 if (rex_in_use)
1951 rex = rex_save;
1952
1953 return result;
1954 }
1955
/*
 * Common part of vim_regsub() and vim_regsub_multi(): build the replacement
 * text for the match stored in "rex" (rex.reg_match for a string,
 * rex.reg_mmatch for buffer lines).
 *
 * "source" is the substitute string.  When "expr" is not NULL, or "source"
 * starts with "\=", the replacement is the result of evaluating an
 * expression (only with FEAT_EVAL).  Otherwise "source" is scanned for "&",
 * "\0" .. "\9", "\u", "\U", "\l", "\L", "\e", "\E" and backslash-escaped
 * characters.
 *
 * If "copy" is FALSE nothing is stored in "dest", only the size is computed.
 * For an expression the result of that call is kept in the static
 * "eval_result" and used by the following call with "copy" TRUE.
 * "dest" must not be NULL, also when "copy" is FALSE.
 *
 * "magic" selects whether "&" or "\&" stands for the whole match.
 * If "backslash" is TRUE backslashes are doubled and a backslash is inserted
 * before a CR, because a backslash will be removed later.
 *
 * Returns the size of the replacement, including the terminating NUL.
 * Returns zero for a NULL argument or a corrupted regexp program.
 */
static int
vim_regsub_both(
    char_u	*source,
    typval_T	*expr,
    char_u	*dest,
    int		copy,
    int		magic,
    int		backslash)
{
    char_u	*src;
    char_u	*dst;
    char_u	*s;
    int		c;
    int		cc;
    int		no = -1;    // submatch number to insert, -1 for none
    fptr_T	func_all = (fptr_T)NULL;    // case conversion that stays in
					    // effect (\U, \L)
    fptr_T	func_one = (fptr_T)NULL;    // case conversion for the next
					    // character only (\u, \l)
    linenr_T	clnum = 0;	// init for GCC
    int		len = 0;	// init for GCC
#ifdef FEAT_EVAL
    // Result of the expression from the call with "copy" FALSE, used by the
    // call with "copy" TRUE.
    static char_u   *eval_result = NULL;
#endif

    // Be paranoid...
    if ((source == NULL && expr == NULL) || dest == NULL)
    {
	emsg(_(e_null_argument));
	return 0;
    }
    if (prog_magic_wrong())
	return 0;
    src = source;
    dst = dest;

    /*
     * When the substitute part starts with "\=" evaluate it as an expression.
     */
    if (expr != NULL || (source[0] == '\\' && source[1] == '='))
    {
#ifdef FEAT_EVAL
	// To make sure that the length doesn't change between checking the
	// length and copying the string, and to speed up things, the
	// resulting string is saved from the call with "copy" == FALSE to the
	// call with "copy" == TRUE.
	if (copy)
	{
	    if (eval_result != NULL)
	    {
		STRCPY(dest, eval_result);
		dst += STRLEN(eval_result);
		VIM_CLEAR(eval_result);
	    }
	}
	else
	{
	    int		    prev_can_f_submatch = can_f_submatch;
	    regsubmatch_T   rsm_save;

	    vim_free(eval_result);

	    // The expression may contain substitute(), which calls us
	    // recursively.  Make sure submatch() gets the text from the first
	    // level.
	    if (can_f_submatch)
		rsm_save = rsm;
	    can_f_submatch = TRUE;
	    rsm.sm_match = rex.reg_match;
	    rsm.sm_mmatch = rex.reg_mmatch;
	    rsm.sm_firstlnum = rex.reg_firstlnum;
	    rsm.sm_maxline = rex.reg_maxline;
	    rsm.sm_line_lbr = rex.reg_line_lbr;

	    if (expr != NULL)
	    {
		// "expr" is a function name or partial: call it with a list
		// of the submatches as the argument; the list is filled in by
		// fill_submatch_list() only when needed.
		typval_T	argv[2];
		char_u		buf[NUMBUFLEN];
		typval_T	rettv;
		staticList10_T	matchList;
		funcexe_T	funcexe;

		rettv.v_type = VAR_STRING;
		rettv.vval.v_string = NULL;
		argv[0].v_type = VAR_LIST;
		argv[0].vval.v_list = &matchList.sl_list;
		matchList.sl_list.lv_len = 0;
		CLEAR_FIELD(funcexe);
		funcexe.argv_func = fill_submatch_list;
		funcexe.evaluate = TRUE;
		if (expr->v_type == VAR_FUNC)
		{
		    s = expr->vval.v_string;
		    call_func(s, -1, &rettv, 1, argv, &funcexe);
		}
		else if (expr->v_type == VAR_PARTIAL)
		{
		    partial_T   *partial = expr->vval.v_partial;

		    s = partial_name(partial);
		    funcexe.partial = partial;
		    call_func(s, -1, &rettv, 1, argv, &funcexe);
		}
		if (matchList.sl_list.lv_len > 0)
		    // fill_submatch_list() was called
		    clear_submatch_list(&matchList);

		if (rettv.v_type == VAR_UNKNOWN)
		    // something failed, no need to report another error
		    eval_result = NULL;
		else
		{
		    eval_result = tv_get_string_buf_chk(&rettv, buf);
		    if (eval_result != NULL)
			eval_result = vim_strsave(eval_result);
		}
		clear_tv(&rettv);
	    }
	    else if (substitute_instr != NULL)
		// Execute instructions from ISN_SUBSTITUTE.
		eval_result = exe_substitute_instr();
	    else
		// Skip over the "\=" that "source" starts with.
		eval_result = eval_to_string(source + 2, TRUE);

	    if (eval_result != NULL)
	    {
		int had_backslash = FALSE;

		for (s = eval_result; *s != NUL; MB_PTR_ADV(s))
		{
		    // Change NL to CR, so that it becomes a line break,
		    // unless called from vim_regexec_nl().
		    // Skip over a backslashed character.
		    if (*s == NL && !rsm.sm_line_lbr)
			*s = CAR;
		    else if (*s == '\\' && s[1] != NUL)
		    {
			++s;
			/* Change NL to CR here too, so that this works:
			 * :s/abc\\\ndef/\="aaa\\\nbbb"/  on text:
			 *   abc\
			 *   def
			 * Not when called from vim_regexec_nl().
			 */
			if (*s == NL && !rsm.sm_line_lbr)
			    *s = CAR;
			had_backslash = TRUE;
		    }
		}
		if (had_backslash && backslash)
		{
		    // Backslashes will be consumed, need to double them.
		    s = vim_strsave_escaped(eval_result, (char_u *)"\\");
		    if (s != NULL)
		    {
			vim_free(eval_result);
			eval_result = s;
		    }
		}

		// Only the length is needed here, the text is copied in the
		// call with "copy" TRUE.
		dst += STRLEN(eval_result);
	    }

	    can_f_submatch = prev_can_f_submatch;
	    if (can_f_submatch)
		rsm = rsm_save;
	}
#endif
    }
    else
      while ((c = *src++) != NUL)
      {
	// First find out whether this is a reference to a submatch ("no" is
	// set) or a case conversion item.
	if (c == '&' && magic)
	    no = 0;
	else if (c == '\\' && *src != NUL)
	{
	    if (*src == '&' && !magic)
	    {
		++src;
		no = 0;
	    }
	    else if ('0' <= *src && *src <= '9')
	    {
		no = *src++ - '0';
	    }
	    else if (vim_strchr((char_u *)"uUlLeE", *src))
	    {
		switch (*src++)
		{
		case 'u':   func_one = (fptr_T)do_upper;
			    continue;
		case 'U':   func_all = (fptr_T)do_Upper;
			    continue;
		case 'l':   func_one = (fptr_T)do_lower;
			    continue;
		case 'L':   func_all = (fptr_T)do_Lower;
			    continue;
		case 'e':
		case 'E':   func_one = func_all = (fptr_T)NULL;
			    continue;
		}
	    }
	}
	if (no < 0)	      // Ordinary character.
	{
	    if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL)
	    {
		// Copy a special key as-is.
		if (copy)
		{
		    *dst++ = c;
		    *dst++ = *src++;
		    *dst++ = *src++;
		}
		else
		{
		    dst += 3;
		    src += 2;
		}
		continue;
	    }

	    if (c == '\\' && *src != NUL)
	    {
		// Check for abbreviations -- webb
		switch (*src)
		{
		    case 'r':	c = CAR;	++src;	break;
		    case 'n':	c = NL;		++src;	break;
		    case 't':	c = TAB;	++src;	break;
		 // Oh no!  \e already has meaning in subst pat :-(
		 // case 'e':   c = ESC;	++src;	break;
		    case 'b':	c = Ctrl_H;	++src;	break;

		    // If "backslash" is TRUE the backslash will be removed
		    // later.  Used to insert a literal CR.
		    default:	if (backslash)
				{
				    if (copy)
					*dst = '\\';
				    ++dst;
				}
				c = *src++;
		}
	    }
	    else if (has_mbyte)
		c = mb_ptr2char(src - 1);

	    // Apply the case conversion, if any.  The function returns the
	    // function to use for the next character.
	    if (func_one != (fptr_T)NULL)
		// Turbo C complains without the typecast
		func_one = (fptr_T)(func_one(&cc, c));
	    else if (func_all != (fptr_T)NULL)
		// Turbo C complains without the typecast
		func_all = (fptr_T)(func_all(&cc, c));
	    else // just copy
		cc = c;

	    // Write to buffer, if copy is set.
	    if (has_mbyte)
	    {
		int totlen = mb_ptr2len(src - 1);

		if (copy)
		    mb_char2bytes(cc, dst);
		// One less than the length, "dst" is incremented below.
		dst += mb_char2len(cc) - 1;
		if (enc_utf8)
		{
		    int clen = utf_ptr2len(src - 1);

		    // If the character length is shorter than "totlen", there
		    // are composing characters; copy them as-is.
		    if (clen < totlen)
		    {
			if (copy)
			    mch_memmove(dst + 1, src - 1 + clen,
						     (size_t)(totlen - clen));
			dst += totlen - clen;
		    }
		}
		src += totlen - 1;
	    }
	    else if (copy)
		*dst = cc;
	    dst++;
	}
	else
	{
	    // Insert the text of submatch "no".  Find where it starts ("s")
	    // and how many bytes of the first line belong to it ("len").
	    if (REG_MULTI)
	    {
		clnum = rex.reg_mmatch->startpos[no].lnum;
		if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0)
		    s = NULL;
		else
		{
		    s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
		    if (rex.reg_mmatch->endpos[no].lnum == clnum)
			len = rex.reg_mmatch->endpos[no].col
					    - rex.reg_mmatch->startpos[no].col;
		    else
			len = (int)STRLEN(s);
		}
	    }
	    else
	    {
		s = rex.reg_match->startp[no];
		if (rex.reg_match->endp[no] == NULL)
		    s = NULL;
		else
		    len = (int)(rex.reg_match->endp[no] - s);
	    }
	    if (s != NULL)
	    {
		for (;;)
		{
		    if (len == 0)
		    {
			// At the end of this line of the submatch.
			if (REG_MULTI)
			{
			    if (rex.reg_mmatch->endpos[no].lnum == clnum)
				break;
			    // The line break is inserted as a CR.
			    if (copy)
				*dst = CAR;
			    ++dst;
			    s = reg_getline(++clnum);
			    if (rex.reg_mmatch->endpos[no].lnum == clnum)
				len = rex.reg_mmatch->endpos[no].col;
			    else
				len = (int)STRLEN(s);
			}
			else
			    break;
		    }
		    else if (*s == NUL) // we hit NUL.
		    {
			if (copy)
			    iemsg(_(e_damaged_match_string));
			goto exit;
		    }
		    else
		    {
			if (backslash && (*s == CAR || *s == '\\'))
			{
			    /*
			     * Insert a backslash in front of a CR, otherwise
			     * it will be replaced by a line break.
			     * Number of backslashes will be halved later,
			     * double them here.
			     */
			    if (copy)
			    {
				dst[0] = '\\';
				dst[1] = *s;
			    }
			    dst += 2;
			}
			else
			{
			    if (has_mbyte)
				c = mb_ptr2char(s);
			    else
				c = *s;

			    if (func_one != (fptr_T)NULL)
				// Turbo C complains without the typecast
				func_one = (fptr_T)(func_one(&cc, c));
			    else if (func_all != (fptr_T)NULL)
				// Turbo C complains without the typecast
				func_all = (fptr_T)(func_all(&cc, c));
			    else // just copy
				cc = c;

			    if (has_mbyte)
			    {
				int l;

				// Copy composing characters separately, one
				// at a time.
				if (enc_utf8)
				    l = utf_ptr2len(s) - 1;
				else
				    l = mb_ptr2len(s) - 1;

				s += l;
				len -= l;
				if (copy)
				    mb_char2bytes(cc, dst);
				dst += mb_char2len(cc) - 1;
			    }
			    else if (copy)
				*dst = cc;
			    dst++;
			}

			++s;
			--len;
		    }
		}
	    }
	    no = -1;
	}
      }
    if (copy)
	*dst = NUL;

exit:
    return (int)((dst - dest) + 1);
}
2361
2362 #ifdef FEAT_EVAL
2363 /*
2364 * Call reg_getline() with the line numbers from the submatch. If a
2365 * substitute() was used the reg_maxline and other values have been
2366 * overwritten.
2367 */
2368 static char_u *
reg_getline_submatch(linenr_T lnum)2369 reg_getline_submatch(linenr_T lnum)
2370 {
2371 char_u *s;
2372 linenr_T save_first = rex.reg_firstlnum;
2373 linenr_T save_max = rex.reg_maxline;
2374
2375 rex.reg_firstlnum = rsm.sm_firstlnum;
2376 rex.reg_maxline = rsm.sm_maxline;
2377
2378 s = reg_getline(lnum);
2379
2380 rex.reg_firstlnum = save_first;
2381 rex.reg_maxline = save_max;
2382 return s;
2383 }
2384
2385 /*
2386 * Used for the submatch() function: get the string from the n'th submatch in
2387 * allocated memory.
2388 * Returns NULL when not in a ":s" command and for a non-existing submatch.
2389 */
2390 char_u *
reg_submatch(int no)2391 reg_submatch(int no)
2392 {
2393 char_u *retval = NULL;
2394 char_u *s;
2395 int len;
2396 int round;
2397 linenr_T lnum;
2398
2399 if (!can_f_submatch || no < 0)
2400 return NULL;
2401
2402 if (rsm.sm_match == NULL)
2403 {
2404 /*
2405 * First round: compute the length and allocate memory.
2406 * Second round: copy the text.
2407 */
2408 for (round = 1; round <= 2; ++round)
2409 {
2410 lnum = rsm.sm_mmatch->startpos[no].lnum;
2411 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0)
2412 return NULL;
2413
2414 s = reg_getline_submatch(lnum);
2415 if (s == NULL) // anti-crash check, cannot happen?
2416 break;
2417 s += rsm.sm_mmatch->startpos[no].col;
2418 if (rsm.sm_mmatch->endpos[no].lnum == lnum)
2419 {
2420 // Within one line: take form start to end col.
2421 len = rsm.sm_mmatch->endpos[no].col
2422 - rsm.sm_mmatch->startpos[no].col;
2423 if (round == 2)
2424 vim_strncpy(retval, s, len);
2425 ++len;
2426 }
2427 else
2428 {
2429 // Multiple lines: take start line from start col, middle
2430 // lines completely and end line up to end col.
2431 len = (int)STRLEN(s);
2432 if (round == 2)
2433 {
2434 STRCPY(retval, s);
2435 retval[len] = '\n';
2436 }
2437 ++len;
2438 ++lnum;
2439 while (lnum < rsm.sm_mmatch->endpos[no].lnum)
2440 {
2441 s = reg_getline_submatch(lnum++);
2442 if (round == 2)
2443 STRCPY(retval + len, s);
2444 len += (int)STRLEN(s);
2445 if (round == 2)
2446 retval[len] = '\n';
2447 ++len;
2448 }
2449 if (round == 2)
2450 STRNCPY(retval + len, reg_getline_submatch(lnum),
2451 rsm.sm_mmatch->endpos[no].col);
2452 len += rsm.sm_mmatch->endpos[no].col;
2453 if (round == 2)
2454 retval[len] = NUL;
2455 ++len;
2456 }
2457
2458 if (retval == NULL)
2459 {
2460 retval = alloc(len);
2461 if (retval == NULL)
2462 return NULL;
2463 }
2464 }
2465 }
2466 else
2467 {
2468 s = rsm.sm_match->startp[no];
2469 if (s == NULL || rsm.sm_match->endp[no] == NULL)
2470 retval = NULL;
2471 else
2472 retval = vim_strnsave(s, rsm.sm_match->endp[no] - s);
2473 }
2474
2475 return retval;
2476 }
2477
2478 /*
2479 * Used for the submatch() function with the optional non-zero argument: get
2480 * the list of strings from the n'th submatch in allocated memory with NULs
2481 * represented in NLs.
2482 * Returns a list of allocated strings. Returns NULL when not in a ":s"
2483 * command, for a non-existing submatch and for any error.
2484 */
2485 list_T *
reg_submatch_list(int no)2486 reg_submatch_list(int no)
2487 {
2488 char_u *s;
2489 linenr_T slnum;
2490 linenr_T elnum;
2491 colnr_T scol;
2492 colnr_T ecol;
2493 int i;
2494 list_T *list;
2495 int error = FALSE;
2496
2497 if (!can_f_submatch || no < 0)
2498 return NULL;
2499
2500 if (rsm.sm_match == NULL)
2501 {
2502 slnum = rsm.sm_mmatch->startpos[no].lnum;
2503 elnum = rsm.sm_mmatch->endpos[no].lnum;
2504 if (slnum < 0 || elnum < 0)
2505 return NULL;
2506
2507 scol = rsm.sm_mmatch->startpos[no].col;
2508 ecol = rsm.sm_mmatch->endpos[no].col;
2509
2510 list = list_alloc();
2511 if (list == NULL)
2512 return NULL;
2513
2514 s = reg_getline_submatch(slnum) + scol;
2515 if (slnum == elnum)
2516 {
2517 if (list_append_string(list, s, ecol - scol) == FAIL)
2518 error = TRUE;
2519 }
2520 else
2521 {
2522 if (list_append_string(list, s, -1) == FAIL)
2523 error = TRUE;
2524 for (i = 1; i < elnum - slnum; i++)
2525 {
2526 s = reg_getline_submatch(slnum + i);
2527 if (list_append_string(list, s, -1) == FAIL)
2528 error = TRUE;
2529 }
2530 s = reg_getline_submatch(elnum);
2531 if (list_append_string(list, s, ecol) == FAIL)
2532 error = TRUE;
2533 }
2534 }
2535 else
2536 {
2537 s = rsm.sm_match->startp[no];
2538 if (s == NULL || rsm.sm_match->endp[no] == NULL)
2539 return NULL;
2540 list = list_alloc();
2541 if (list == NULL)
2542 return NULL;
2543 if (list_append_string(list, s,
2544 (int)(rsm.sm_match->endp[no] - s)) == FAIL)
2545 error = TRUE;
2546 }
2547
2548 if (error)
2549 {
2550 list_free(list);
2551 return NULL;
2552 }
2553 ++list->lv_refcount;
2554 return list;
2555 }
2556 #endif
2557
2558 /*
2559 * Initialize the values used for matching against multiple lines
2560 */
2561 static void
init_regexec_multi(regmmatch_T * rmp,win_T * win,buf_T * buf,linenr_T lnum)2562 init_regexec_multi(
2563 regmmatch_T *rmp,
2564 win_T *win, // window in which to search or NULL
2565 buf_T *buf, // buffer in which to search
2566 linenr_T lnum) // nr of line to start looking for match
2567 {
2568 rex.reg_match = NULL;
2569 rex.reg_mmatch = rmp;
2570 rex.reg_buf = buf;
2571 rex.reg_win = win;
2572 rex.reg_firstlnum = lnum;
2573 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
2574 rex.reg_line_lbr = FALSE;
2575 rex.reg_ic = rmp->rmm_ic;
2576 rex.reg_icombine = FALSE;
2577 rex.reg_maxcol = rmp->rmm_maxcol;
2578 }
2579
2580 #include "regexp_bt.c"
2581
// Entry points of the backtracking engine (presumably defined in the
// regexp_bt.c included just above - confirm).  vim_regcomp() calls
// bt_regengine.regcomp() when the backtracking engine is selected or as
// fallback for the automatic engine.
static regengine_T bt_regengine =
{
    bt_regcomp,
    bt_regfree,
    bt_regexec_nl,
    bt_regexec_multi,
    // NOTE(review): presumably the "expr" member that vim_regcomp() assigns
    // under DEBUG - confirm against the regengine_T declaration.
    (char_u *)""
};
2590
2591 #include "regexp_nfa.c"
2592
// Entry points of the NFA engine (presumably defined in the regexp_nfa.c
// included just above - confirm).  vim_regcomp() calls
// nfa_regengine.regcomp() first unless the backtracking engine was requested.
static regengine_T nfa_regengine =
{
    nfa_regcomp,
    nfa_regfree,
    nfa_regexec_nl,
    nfa_regexec_multi,
    // NOTE(review): presumably the "expr" member that vim_regcomp() assigns
    // under DEBUG - confirm against the regengine_T declaration.
    (char_u *)""
};
2601
// Which regexp engine to use?  Needed for vim_regcomp().
// Must match with 'regexpengine'.
// vim_regcomp() sets it from "p_re" on every call, overrules it when the
// pattern starts with "\%#=" and switches it to BACKTRACKING_ENGINE when the
// automatic engine falls back to backtracking.
static int regexp_engine = 0;
2605
#ifdef DEBUG
// Descriptions of the regexp engines, indexed by the engine number given
// after "\%#=".  Only used for the debugging message in vim_regcomp().
static char_u regname[][30] = {
    "AUTOMATIC Regexp Engine",
    "BACKTRACKING Regexp Engine",
    "NFA Regexp Engine"
};
#endif
2613
2614 /*
2615 * Compile a regular expression into internal code.
2616 * Returns the program in allocated memory.
2617 * Use vim_regfree() to free the memory.
2618 * Returns NULL for an error.
2619 */
2620 regprog_T *
vim_regcomp(char_u * expr_arg,int re_flags)2621 vim_regcomp(char_u *expr_arg, int re_flags)
2622 {
2623 regprog_T *prog = NULL;
2624 char_u *expr = expr_arg;
2625 int called_emsg_before;
2626
2627 regexp_engine = p_re;
2628
2629 // Check for prefix "\%#=", that sets the regexp engine
2630 if (STRNCMP(expr, "\\%#=", 4) == 0)
2631 {
2632 int newengine = expr[4] - '0';
2633
2634 if (newengine == AUTOMATIC_ENGINE
2635 || newengine == BACKTRACKING_ENGINE
2636 || newengine == NFA_ENGINE)
2637 {
2638 regexp_engine = expr[4] - '0';
2639 expr += 5;
2640 #ifdef DEBUG
2641 smsg("New regexp mode selected (%d): %s",
2642 regexp_engine, regname[newengine]);
2643 #endif
2644 }
2645 else
2646 {
2647 emsg(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
2648 regexp_engine = AUTOMATIC_ENGINE;
2649 }
2650 }
2651 #ifdef DEBUG
2652 bt_regengine.expr = expr;
2653 nfa_regengine.expr = expr;
2654 #endif
2655 // reg_iswordc() uses rex.reg_buf
2656 rex.reg_buf = curbuf;
2657
2658 /*
2659 * First try the NFA engine, unless backtracking was requested.
2660 */
2661 called_emsg_before = called_emsg;
2662 if (regexp_engine != BACKTRACKING_ENGINE)
2663 prog = nfa_regengine.regcomp(expr,
2664 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
2665 else
2666 prog = bt_regengine.regcomp(expr, re_flags);
2667
2668 // Check for error compiling regexp with initial engine.
2669 if (prog == NULL)
2670 {
2671 #ifdef BT_REGEXP_DEBUG_LOG
2672 if (regexp_engine == BACKTRACKING_ENGINE) // debugging log for BT engine
2673 {
2674 FILE *f;
2675 f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
2676 if (f)
2677 {
2678 fprintf(f, "Syntax error in \"%s\"\n", expr);
2679 fclose(f);
2680 }
2681 else
2682 semsg("(NFA) Could not open \"%s\" to write !!!",
2683 BT_REGEXP_DEBUG_LOG_NAME);
2684 }
2685 #endif
2686 /*
2687 * If the NFA engine failed, try the backtracking engine.
2688 * The NFA engine also fails for patterns that it can't handle well
2689 * but are still valid patterns, thus a retry should work.
2690 * But don't try if an error message was given.
2691 */
2692 if (regexp_engine == AUTOMATIC_ENGINE
2693 && called_emsg == called_emsg_before)
2694 {
2695 regexp_engine = BACKTRACKING_ENGINE;
2696 #ifdef FEAT_EVAL
2697 report_re_switch(expr);
2698 #endif
2699 prog = bt_regengine.regcomp(expr, re_flags);
2700 }
2701 }
2702
2703 if (prog != NULL)
2704 {
2705 // Store the info needed to call regcomp() again when the engine turns
2706 // out to be very slow when executing it.
2707 prog->re_engine = regexp_engine;
2708 prog->re_flags = re_flags;
2709 }
2710
2711 return prog;
2712 }
2713
2714 /*
2715 * Free a compiled regexp program, returned by vim_regcomp().
2716 */
2717 void
vim_regfree(regprog_T * prog)2718 vim_regfree(regprog_T *prog)
2719 {
2720 if (prog != NULL)
2721 prog->engine->regfree(prog);
2722 }
2723
#if defined(EXITFREE) || defined(PROTO)
/*
 * Free the memory that the regexp code keeps around between calls: the
 * "regstack" and "backpos" growarrays and the "reg_tofree" and
 * "reg_prev_sub" strings.
 */
void
free_regexp_stuff(void)
{
    // ga_clear() takes the address of the growarray.
    ga_clear(&regstack);
    ga_clear(&backpos);
    vim_free(reg_tofree);
    vim_free(reg_prev_sub);
}
#endif
2734
#ifdef FEAT_EVAL
/*
 * When "p_verbose" is above zero, give a message that the backtracking
 * engine is now used for pattern "pat".
 */
static void
report_re_switch(char_u *pat)
{
    if (p_verbose <= 0)
        return;

    verbose_enter();
    msg_puts(_("Switching to backtracking RE engine for pattern: "));
    msg_puts((char *)pat);
    verbose_leave();
}
#endif
2748
#if (defined(FEAT_X11) && (defined(FEAT_TITLE) || defined(FEAT_XCLIPBOARD))) \
        || defined(PROTO)
/*
 * Return the "re_in_use" flag of "prog": non-zero while the program is being
 * executed.
 */
int
regprog_in_use(regprog_T *prog)
{
    int busy = prog->re_in_use;

    return busy;
}
#endif
2760
2761 /*
2762 * Match a regexp against a string.
2763 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2764 * Note: "rmp->regprog" may be freed and changed.
2765 * Uses curbuf for line count and 'iskeyword'.
2766 * When "nl" is TRUE consider a "\n" in "line" to be a line break.
2767 *
2768 * Return TRUE if there is a match, FALSE if not.
2769 */
2770 static int
vim_regexec_string(regmatch_T * rmp,char_u * line,colnr_T col,int nl)2771 vim_regexec_string(
2772 regmatch_T *rmp,
2773 char_u *line, // string to match against
2774 colnr_T col, // column to start looking for match
2775 int nl)
2776 {
2777 int result;
2778 regexec_T rex_save;
2779 int rex_in_use_save = rex_in_use;
2780
2781 // Cannot use the same prog recursively, it contains state.
2782 if (rmp->regprog->re_in_use)
2783 {
2784 emsg(_(e_recursive));
2785 return FALSE;
2786 }
2787 rmp->regprog->re_in_use = TRUE;
2788
2789 if (rex_in_use)
2790 // Being called recursively, save the state.
2791 rex_save = rex;
2792 rex_in_use = TRUE;
2793
2794 rex.reg_startp = NULL;
2795 rex.reg_endp = NULL;
2796 rex.reg_startpos = NULL;
2797 rex.reg_endpos = NULL;
2798
2799 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
2800 rmp->regprog->re_in_use = FALSE;
2801
2802 // NFA engine aborted because it's very slow.
2803 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2804 && result == NFA_TOO_EXPENSIVE)
2805 {
2806 int save_p_re = p_re;
2807 int re_flags = rmp->regprog->re_flags;
2808 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2809
2810 p_re = BACKTRACKING_ENGINE;
2811 vim_regfree(rmp->regprog);
2812 if (pat != NULL)
2813 {
2814 #ifdef FEAT_EVAL
2815 report_re_switch(pat);
2816 #endif
2817 rmp->regprog = vim_regcomp(pat, re_flags);
2818 if (rmp->regprog != NULL)
2819 {
2820 rmp->regprog->re_in_use = TRUE;
2821 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
2822 rmp->regprog->re_in_use = FALSE;
2823 }
2824 vim_free(pat);
2825 }
2826
2827 p_re = save_p_re;
2828 }
2829
2830 rex_in_use = rex_in_use_save;
2831 if (rex_in_use)
2832 rex = rex_save;
2833
2834 return result > 0;
2835 }
2836
2837 /*
2838 * Note: "*prog" may be freed and changed.
2839 * Return TRUE if there is a match, FALSE if not.
2840 */
2841 int
vim_regexec_prog(regprog_T ** prog,int ignore_case,char_u * line,colnr_T col)2842 vim_regexec_prog(
2843 regprog_T **prog,
2844 int ignore_case,
2845 char_u *line,
2846 colnr_T col)
2847 {
2848 int r;
2849 regmatch_T regmatch;
2850
2851 regmatch.regprog = *prog;
2852 regmatch.rm_ic = ignore_case;
2853 r = vim_regexec_string(®match, line, col, FALSE);
2854 *prog = regmatch.regprog;
2855 return r;
2856 }
2857
2858 /*
2859 * Note: "rmp->regprog" may be freed and changed.
2860 * Return TRUE if there is a match, FALSE if not.
2861 */
2862 int
vim_regexec(regmatch_T * rmp,char_u * line,colnr_T col)2863 vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
2864 {
2865 return vim_regexec_string(rmp, line, col, FALSE);
2866 }
2867
2868 /*
2869 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
2870 * Note: "rmp->regprog" may be freed and changed.
2871 * Return TRUE if there is a match, FALSE if not.
2872 */
2873 int
vim_regexec_nl(regmatch_T * rmp,char_u * line,colnr_T col)2874 vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
2875 {
2876 return vim_regexec_string(rmp, line, col, TRUE);
2877 }
2878
2879 /*
2880 * Match a regexp against multiple lines.
2881 * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
2882 * Note: "rmp->regprog" may be freed and changed, even set to NULL.
2883 * Uses curbuf for line count and 'iskeyword'.
2884 *
2885 * Return zero if there is no match. Return number of lines contained in the
2886 * match otherwise.
2887 */
2888 long
vim_regexec_multi(regmmatch_T * rmp,win_T * win,buf_T * buf,linenr_T lnum,colnr_T col,proftime_T * tm,int * timed_out)2889 vim_regexec_multi(
2890 regmmatch_T *rmp,
2891 win_T *win, // window in which to search or NULL
2892 buf_T *buf, // buffer in which to search
2893 linenr_T lnum, // nr of line to start looking for match
2894 colnr_T col, // column to start looking for match
2895 proftime_T *tm, // timeout limit or NULL
2896 int *timed_out) // flag is set when timeout limit reached
2897 {
2898 int result;
2899 regexec_T rex_save;
2900 int rex_in_use_save = rex_in_use;
2901
2902 // Cannot use the same prog recursively, it contains state.
2903 if (rmp->regprog->re_in_use)
2904 {
2905 emsg(_(e_recursive));
2906 return FALSE;
2907 }
2908 rmp->regprog->re_in_use = TRUE;
2909
2910 if (rex_in_use)
2911 // Being called recursively, save the state.
2912 rex_save = rex;
2913 rex_in_use = TRUE;
2914
2915 result = rmp->regprog->engine->regexec_multi(
2916 rmp, win, buf, lnum, col, tm, timed_out);
2917 rmp->regprog->re_in_use = FALSE;
2918
2919 // NFA engine aborted because it's very slow.
2920 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2921 && result == NFA_TOO_EXPENSIVE)
2922 {
2923 int save_p_re = p_re;
2924 int re_flags = rmp->regprog->re_flags;
2925 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2926
2927 p_re = BACKTRACKING_ENGINE;
2928 vim_regfree(rmp->regprog);
2929 if (pat != NULL)
2930 {
2931 #ifdef FEAT_EVAL
2932 report_re_switch(pat);
2933 #endif
2934 #ifdef FEAT_SYN_HL
2935 // checking for \z misuse was already done when compiling for NFA,
2936 // allow all here
2937 reg_do_extmatch = REX_ALL;
2938 #endif
2939 rmp->regprog = vim_regcomp(pat, re_flags);
2940 #ifdef FEAT_SYN_HL
2941 reg_do_extmatch = 0;
2942 #endif
2943
2944 if (rmp->regprog != NULL)
2945 {
2946 rmp->regprog->re_in_use = TRUE;
2947 result = rmp->regprog->engine->regexec_multi(
2948 rmp, win, buf, lnum, col, tm, timed_out);
2949 rmp->regprog->re_in_use = FALSE;
2950 }
2951 vim_free(pat);
2952 }
2953 p_re = save_p_re;
2954 }
2955
2956 rex_in_use = rex_in_use_save;
2957 if (rex_in_use)
2958 rex = rex_save;
2959
2960 return result <= 0 ? 0 : result;
2961 }
2962