xref: /vim-8.2.3635/src/regexp_nfa.c (revision 64066b9a)
1 /* vi:set ts=8 sts=4 sw=4 noet:
2  *
3  * NFA regular expression implementation.
4  *
5  * This file is included in "regexp.c".
6  */
7 
8 /*
9  * Logging of NFA engine.
10  *
11  * The NFA engine can write four log files:
12  * - Error log: Contains NFA engine's fatal errors.
13  * - Dump log: Contains compiled NFA state machine's information.
14  * - Run log: Contains information of matching procedure.
15  * - Debug log: Contains detailed information of matching procedure. Can be
16  *   disabled by undefining NFA_REGEXP_DEBUG_LOG.
17  * The first one can also be used without debug mode.
18  * The last three are enabled when compiled as debug mode and individually
19  * disabled by commenting them out.
20  * The log files can get quite big!
21  * Do disable all of this when compiling Vim for debugging, undefine DEBUG in
22  * regexp.c
23  */
24 #ifdef DEBUG
25 # define NFA_REGEXP_ERROR_LOG	"nfa_regexp_error.log"
26 # define ENABLE_LOG
27 # define NFA_REGEXP_DUMP_LOG	"nfa_regexp_dump.log"
28 # define NFA_REGEXP_RUN_LOG	"nfa_regexp_run.log"
29 # define NFA_REGEXP_DEBUG_LOG	"nfa_regexp_debug.log"
30 #endif
31 
32 // Added to NFA_ANY - NFA_NUPPER_IC to include a NL.
33 #define NFA_ADD_NL		31
34 
35 enum
36 {
37     NFA_SPLIT = -1024,
38     NFA_MATCH,
39     NFA_EMPTY,			    // matches 0-length
40 
41     NFA_START_COLL,		    // [abc] start
42     NFA_END_COLL,		    // [abc] end
43     NFA_START_NEG_COLL,		    // [^abc] start
44     NFA_END_NEG_COLL,		    // [^abc] end (postfix only)
45     NFA_RANGE,			    // range of the two previous items
46 				    // (postfix only)
47     NFA_RANGE_MIN,		    // low end of a range
48     NFA_RANGE_MAX,		    // high end of a range
49 
50     NFA_CONCAT,			    // concatenate two previous items (postfix
51 				    // only)
52     NFA_OR,			    // \| (postfix only)
53     NFA_STAR,			    // greedy * (postfix only)
54     NFA_STAR_NONGREEDY,		    // non-greedy * (postfix only)
55     NFA_QUEST,			    // greedy \? (postfix only)
56     NFA_QUEST_NONGREEDY,	    // non-greedy \? (postfix only)
57 
58     NFA_BOL,			    // ^    Begin line
59     NFA_EOL,			    // $    End line
60     NFA_BOW,			    // \<   Begin word
61     NFA_EOW,			    // \>   End word
62     NFA_BOF,			    // \%^  Begin file
63     NFA_EOF,			    // \%$  End file
64     NFA_NEWL,
65     NFA_ZSTART,			    // Used for \zs
66     NFA_ZEND,			    // Used for \ze
67     NFA_NOPEN,			    // Start of subexpression marked with \%(
68     NFA_NCLOSE,			    // End of subexpr. marked with \%( ... \)
69     NFA_START_INVISIBLE,
70     NFA_START_INVISIBLE_FIRST,
71     NFA_START_INVISIBLE_NEG,
72     NFA_START_INVISIBLE_NEG_FIRST,
73     NFA_START_INVISIBLE_BEFORE,
74     NFA_START_INVISIBLE_BEFORE_FIRST,
75     NFA_START_INVISIBLE_BEFORE_NEG,
76     NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
77     NFA_START_PATTERN,
78     NFA_END_INVISIBLE,
79     NFA_END_INVISIBLE_NEG,
80     NFA_END_PATTERN,
81     NFA_COMPOSING,		    // Next nodes in NFA are part of the
82 				    // composing multibyte char
83     NFA_END_COMPOSING,		    // End of a composing char in the NFA
84     NFA_ANY_COMPOSING,		    // \%C: Any composing characters.
85     NFA_OPT_CHARS,		    // \%[abc]
86 
87     // The following are used only in the postfix form, not in the NFA
88     NFA_PREV_ATOM_NO_WIDTH,	    // Used for \@=
89     NFA_PREV_ATOM_NO_WIDTH_NEG,	    // Used for \@!
90     NFA_PREV_ATOM_JUST_BEFORE,	    // Used for \@<=
91     NFA_PREV_ATOM_JUST_BEFORE_NEG,  // Used for \@<!
92     NFA_PREV_ATOM_LIKE_PATTERN,	    // Used for \@>
93 
94     NFA_BACKREF1,		    // \1
95     NFA_BACKREF2,		    // \2
96     NFA_BACKREF3,		    // \3
97     NFA_BACKREF4,		    // \4
98     NFA_BACKREF5,		    // \5
99     NFA_BACKREF6,		    // \6
100     NFA_BACKREF7,		    // \7
101     NFA_BACKREF8,		    // \8
102     NFA_BACKREF9,		    // \9
103 #ifdef FEAT_SYN_HL
104     NFA_ZREF1,			    // \z1
105     NFA_ZREF2,			    // \z2
106     NFA_ZREF3,			    // \z3
107     NFA_ZREF4,			    // \z4
108     NFA_ZREF5,			    // \z5
109     NFA_ZREF6,			    // \z6
110     NFA_ZREF7,			    // \z7
111     NFA_ZREF8,			    // \z8
112     NFA_ZREF9,			    // \z9
113 #endif
114     NFA_SKIP,			    // Skip characters
115 
116     NFA_MOPEN,
117     NFA_MOPEN1,
118     NFA_MOPEN2,
119     NFA_MOPEN3,
120     NFA_MOPEN4,
121     NFA_MOPEN5,
122     NFA_MOPEN6,
123     NFA_MOPEN7,
124     NFA_MOPEN8,
125     NFA_MOPEN9,
126 
127     NFA_MCLOSE,
128     NFA_MCLOSE1,
129     NFA_MCLOSE2,
130     NFA_MCLOSE3,
131     NFA_MCLOSE4,
132     NFA_MCLOSE5,
133     NFA_MCLOSE6,
134     NFA_MCLOSE7,
135     NFA_MCLOSE8,
136     NFA_MCLOSE9,
137 
138 #ifdef FEAT_SYN_HL
139     NFA_ZOPEN,
140     NFA_ZOPEN1,
141     NFA_ZOPEN2,
142     NFA_ZOPEN3,
143     NFA_ZOPEN4,
144     NFA_ZOPEN5,
145     NFA_ZOPEN6,
146     NFA_ZOPEN7,
147     NFA_ZOPEN8,
148     NFA_ZOPEN9,
149 
150     NFA_ZCLOSE,
151     NFA_ZCLOSE1,
152     NFA_ZCLOSE2,
153     NFA_ZCLOSE3,
154     NFA_ZCLOSE4,
155     NFA_ZCLOSE5,
156     NFA_ZCLOSE6,
157     NFA_ZCLOSE7,
158     NFA_ZCLOSE8,
159     NFA_ZCLOSE9,
160 #endif
161 
162     // NFA_FIRST_NL
163     NFA_ANY,		//	Match any one character.
164     NFA_IDENT,		//	Match identifier char
165     NFA_SIDENT,		//	Match identifier char but no digit
166     NFA_KWORD,		//	Match keyword char
167     NFA_SKWORD,		//	Match word char but no digit
168     NFA_FNAME,		//	Match file name char
169     NFA_SFNAME,		//	Match file name char but no digit
170     NFA_PRINT,		//	Match printable char
171     NFA_SPRINT,		//	Match printable char but no digit
172     NFA_WHITE,		//	Match whitespace char
173     NFA_NWHITE,		//	Match non-whitespace char
174     NFA_DIGIT,		//	Match digit char
175     NFA_NDIGIT,		//	Match non-digit char
176     NFA_HEX,		//	Match hex char
177     NFA_NHEX,		//	Match non-hex char
178     NFA_OCTAL,		//	Match octal char
179     NFA_NOCTAL,		//	Match non-octal char
180     NFA_WORD,		//	Match word char
181     NFA_NWORD,		//	Match non-word char
182     NFA_HEAD,		//	Match head char
183     NFA_NHEAD,		//	Match non-head char
184     NFA_ALPHA,		//	Match alpha char
185     NFA_NALPHA,		//	Match non-alpha char
186     NFA_LOWER,		//	Match lowercase char
187     NFA_NLOWER,		//	Match non-lowercase char
188     NFA_UPPER,		//	Match uppercase char
189     NFA_NUPPER,		//	Match non-uppercase char
190     NFA_LOWER_IC,	//	Match [a-z]
191     NFA_NLOWER_IC,	//	Match [^a-z]
192     NFA_UPPER_IC,	//	Match [A-Z]
193     NFA_NUPPER_IC,	//	Match [^A-Z]
194 
195     NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
196     NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,
197 
198     NFA_CURSOR,		//	Match cursor pos
199     NFA_LNUM,		//	Match line number
200     NFA_LNUM_GT,	//	Match > line number
201     NFA_LNUM_LT,	//	Match < line number
202     NFA_COL,		//	Match cursor column
203     NFA_COL_GT,		//	Match > cursor column
204     NFA_COL_LT,		//	Match < cursor column
205     NFA_VCOL,		//	Match cursor virtual column
206     NFA_VCOL_GT,	//	Match > cursor virtual column
207     NFA_VCOL_LT,	//	Match < cursor virtual column
208     NFA_MARK,		//	Match mark
209     NFA_MARK_GT,	//	Match > mark
210     NFA_MARK_LT,	//	Match < mark
211     NFA_VISUAL,		//	Match Visual area
212 
213     // Character classes [:alnum:] etc
214     NFA_CLASS_ALNUM,
215     NFA_CLASS_ALPHA,
216     NFA_CLASS_BLANK,
217     NFA_CLASS_CNTRL,
218     NFA_CLASS_DIGIT,
219     NFA_CLASS_GRAPH,
220     NFA_CLASS_LOWER,
221     NFA_CLASS_PRINT,
222     NFA_CLASS_PUNCT,
223     NFA_CLASS_SPACE,
224     NFA_CLASS_UPPER,
225     NFA_CLASS_XDIGIT,
226     NFA_CLASS_TAB,
227     NFA_CLASS_RETURN,
228     NFA_CLASS_BACKSPACE,
229     NFA_CLASS_ESCAPE,
230     NFA_CLASS_IDENT,
231     NFA_CLASS_KEYWORD,
232     NFA_CLASS_FNAME
233 };
234 
235 // Keep in sync with classchars.
236 static int nfa_classcodes[] = {
237     NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
238     NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
239     NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
240     NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
241     NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
242     NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
243     NFA_UPPER, NFA_NUPPER
244 };
245 
246 static char_u e_nul_found[] = N_("E865: (NFA) Regexp end encountered prematurely");
247 static char_u e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c");
248 static char_u e_ill_char_class[] = N_("E877: (NFA regexp) Invalid character class: %d");
249 static char_u e_value_too_large[] = N_("E951: \\% value too large");
250 
251 // Variables only used in nfa_regcomp() and descendants.
252 static int nfa_re_flags; // re_flags passed to nfa_regcomp()
253 static int *post_start;  // holds the postfix form of r.e.
254 static int *post_end;
255 static int *post_ptr;
256 
257 // Set when the pattern should use the NFA engine.
258 // E.g. [[:upper:]] only allows 8bit characters for BT engine,
259 // while NFA engine handles multibyte characters correctly.
260 static int wants_nfa;
261 
262 static int nstate;	// Number of states in the NFA.
263 static int istate;	// Index in the state vector, used in alloc_state()
264 
265 // If not NULL match must end at this position
266 static save_se_T *nfa_endp = NULL;
267 
268 // 0 for first call to nfa_regmatch(), 1 for recursive call.
269 static int nfa_ll_index = 0;
270 
271 static int realloc_post_list(void);
272 static int nfa_reg(int paren);
273 #ifdef DEBUG
274 static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent);
275 #endif
276 static int match_follows(nfa_state_T *startstate, int depth);
277 static int failure_chance(nfa_state_T *state, int depth);
278 
279 // helper functions used when doing re2post() ... regatom() parsing
280 #define EMIT(c)	do {				\
281 		    if (post_ptr >= post_end && realloc_post_list() == FAIL) \
282 			return FAIL;		\
283 		    *post_ptr++ = c;		\
284 		} while (0)
285 
286 /*
287  * Initialize internal variables before NFA compilation.
288  * Return OK on success, FAIL otherwise.
289  */
290     static int
nfa_regcomp_start(char_u * expr,int re_flags)291 nfa_regcomp_start(
292     char_u	*expr,
293     int		re_flags)	    // see vim_regcomp()
294 {
295     size_t	postfix_size;
296     int		nstate_max;
297 
298     nstate = 0;
299     istate = 0;
300     // A reasonable estimation for maximum size
301     nstate_max = (int)(STRLEN(expr) + 1) * 25;
302 
303     // Some items blow up in size, such as [A-z].  Add more space for that.
304     // When it is still not enough realloc_post_list() will be used.
305     nstate_max += 1000;
306 
307     // Size for postfix representation of expr.
308     postfix_size = sizeof(int) * nstate_max;
309 
310     post_start = alloc(postfix_size);
311     if (post_start == NULL)
312 	return FAIL;
313     post_ptr = post_start;
314     post_end = post_start + nstate_max;
315     wants_nfa = FALSE;
316     rex.nfa_has_zend = FALSE;
317     rex.nfa_has_backref = FALSE;
318 
319     // shared with BT engine
320     regcomp_start(expr, re_flags);
321 
322     return OK;
323 }
324 
325 /*
326  * Figure out if the NFA state list starts with an anchor, must match at start
327  * of the line.
328  */
329     static int
nfa_get_reganch(nfa_state_T * start,int depth)330 nfa_get_reganch(nfa_state_T *start, int depth)
331 {
332     nfa_state_T *p = start;
333 
334     if (depth > 4)
335 	return 0;
336 
337     while (p != NULL)
338     {
339 	switch (p->c)
340 	{
341 	    case NFA_BOL:
342 	    case NFA_BOF:
343 		return 1; // yes!
344 
345 	    case NFA_ZSTART:
346 	    case NFA_ZEND:
347 	    case NFA_CURSOR:
348 	    case NFA_VISUAL:
349 
350 	    case NFA_MOPEN:
351 	    case NFA_MOPEN1:
352 	    case NFA_MOPEN2:
353 	    case NFA_MOPEN3:
354 	    case NFA_MOPEN4:
355 	    case NFA_MOPEN5:
356 	    case NFA_MOPEN6:
357 	    case NFA_MOPEN7:
358 	    case NFA_MOPEN8:
359 	    case NFA_MOPEN9:
360 	    case NFA_NOPEN:
361 #ifdef FEAT_SYN_HL
362 	    case NFA_ZOPEN:
363 	    case NFA_ZOPEN1:
364 	    case NFA_ZOPEN2:
365 	    case NFA_ZOPEN3:
366 	    case NFA_ZOPEN4:
367 	    case NFA_ZOPEN5:
368 	    case NFA_ZOPEN6:
369 	    case NFA_ZOPEN7:
370 	    case NFA_ZOPEN8:
371 	    case NFA_ZOPEN9:
372 #endif
373 		p = p->out;
374 		break;
375 
376 	    case NFA_SPLIT:
377 		return nfa_get_reganch(p->out, depth + 1)
378 				       && nfa_get_reganch(p->out1, depth + 1);
379 
380 	    default:
381 		return 0; // noooo
382 	}
383     }
384     return 0;
385 }
386 
387 /*
388  * Figure out if the NFA state list starts with a character which must match
389  * at start of the match.
390  */
391     static int
nfa_get_regstart(nfa_state_T * start,int depth)392 nfa_get_regstart(nfa_state_T *start, int depth)
393 {
394     nfa_state_T *p = start;
395 
396     if (depth > 4)
397 	return 0;
398 
399     while (p != NULL)
400     {
401 	switch (p->c)
402 	{
403 	    // all kinds of zero-width matches
404 	    case NFA_BOL:
405 	    case NFA_BOF:
406 	    case NFA_BOW:
407 	    case NFA_EOW:
408 	    case NFA_ZSTART:
409 	    case NFA_ZEND:
410 	    case NFA_CURSOR:
411 	    case NFA_VISUAL:
412 	    case NFA_LNUM:
413 	    case NFA_LNUM_GT:
414 	    case NFA_LNUM_LT:
415 	    case NFA_COL:
416 	    case NFA_COL_GT:
417 	    case NFA_COL_LT:
418 	    case NFA_VCOL:
419 	    case NFA_VCOL_GT:
420 	    case NFA_VCOL_LT:
421 	    case NFA_MARK:
422 	    case NFA_MARK_GT:
423 	    case NFA_MARK_LT:
424 
425 	    case NFA_MOPEN:
426 	    case NFA_MOPEN1:
427 	    case NFA_MOPEN2:
428 	    case NFA_MOPEN3:
429 	    case NFA_MOPEN4:
430 	    case NFA_MOPEN5:
431 	    case NFA_MOPEN6:
432 	    case NFA_MOPEN7:
433 	    case NFA_MOPEN8:
434 	    case NFA_MOPEN9:
435 	    case NFA_NOPEN:
436 #ifdef FEAT_SYN_HL
437 	    case NFA_ZOPEN:
438 	    case NFA_ZOPEN1:
439 	    case NFA_ZOPEN2:
440 	    case NFA_ZOPEN3:
441 	    case NFA_ZOPEN4:
442 	    case NFA_ZOPEN5:
443 	    case NFA_ZOPEN6:
444 	    case NFA_ZOPEN7:
445 	    case NFA_ZOPEN8:
446 	    case NFA_ZOPEN9:
447 #endif
448 		p = p->out;
449 		break;
450 
451 	    case NFA_SPLIT:
452 	    {
453 		int c1 = nfa_get_regstart(p->out, depth + 1);
454 		int c2 = nfa_get_regstart(p->out1, depth + 1);
455 
456 		if (c1 == c2)
457 		    return c1; // yes!
458 		return 0;
459 	    }
460 
461 	    default:
462 		if (p->c > 0)
463 		    return p->c; // yes!
464 		return 0;
465 	}
466     }
467     return 0;
468 }
469 
470 /*
471  * Figure out if the NFA state list contains just literal text and nothing
472  * else.  If so return a string in allocated memory with what must match after
473  * regstart.  Otherwise return NULL.
474  */
475     static char_u *
nfa_get_match_text(nfa_state_T * start)476 nfa_get_match_text(nfa_state_T *start)
477 {
478     nfa_state_T *p = start;
479     int		len = 0;
480     char_u	*ret;
481     char_u	*s;
482 
483     if (p->c != NFA_MOPEN)
484 	return NULL; // just in case
485     p = p->out;
486     while (p->c > 0)
487     {
488 	len += MB_CHAR2LEN(p->c);
489 	p = p->out;
490     }
491     if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH)
492 	return NULL;
493 
494     ret = alloc(len);
495     if (ret != NULL)
496     {
497 	p = start->out->out; // skip first char, it goes into regstart
498 	s = ret;
499 	while (p->c > 0)
500 	{
501 	    if (has_mbyte)
502 		s += (*mb_char2bytes)(p->c, s);
503 	    else
504 		*s++ = p->c;
505 	    p = p->out;
506 	}
507 	*s = NUL;
508     }
509     return ret;
510 }
511 
512 /*
513  * Allocate more space for post_start.  Called when
514  * running above the estimated number of states.
515  */
516     static int
realloc_post_list(void)517 realloc_post_list(void)
518 {
519     int   nstate_max = (int)(post_end - post_start);
520     int   new_max;
521     int   *new_start;
522     int	  *old_start;
523 
524     // For weird patterns the number of states can be very high. Increasing by
525     // 50% seems a reasonable compromise between memory use and speed.
526     new_max = nstate_max * 3 / 2;
527     new_start = ALLOC_MULT(int, new_max);
528     if (new_start == NULL)
529 	return FAIL;
530     mch_memmove(new_start, post_start, nstate_max * sizeof(int));
531     old_start = post_start;
532     post_start = new_start;
533     post_ptr = new_start + (post_ptr - old_start);
534     post_end = post_start + new_max;
535     vim_free(old_start);
536     return OK;
537 }
538 
539 /*
540  * Search between "start" and "end" and try to recognize a
541  * character class in expanded form. For example [0-9].
542  * On success, return the id the character class to be emitted.
543  * On failure, return 0 (=FAIL)
544  * Start points to the first char of the range, while end should point
545  * to the closing brace.
546  * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
547  * need to be interpreted as [a-zA-Z].
548  */
549     static int
nfa_recognize_char_class(char_u * start,char_u * end,int extra_newl)550 nfa_recognize_char_class(char_u *start, char_u *end, int extra_newl)
551 {
552 #   define CLASS_not		0x80
553 #   define CLASS_af		0x40
554 #   define CLASS_AF		0x20
555 #   define CLASS_az		0x10
556 #   define CLASS_AZ		0x08
557 #   define CLASS_o7		0x04
558 #   define CLASS_o9		0x02
559 #   define CLASS_underscore	0x01
560 
561     int		newl = FALSE;
562     char_u	*p;
563     int		config = 0;
564 
565     if (extra_newl == TRUE)
566 	newl = TRUE;
567 
568     if (*end != ']')
569 	return FAIL;
570     p = start;
571     if (*p == '^')
572     {
573 	config |= CLASS_not;
574 	p++;
575     }
576 
577     while (p < end)
578     {
579 	if (p + 2 < end && *(p + 1) == '-')
580 	{
581 	    switch (*p)
582 	    {
583 		case '0':
584 		    if (*(p + 2) == '9')
585 		    {
586 			config |= CLASS_o9;
587 			break;
588 		    }
589 		    if (*(p + 2) == '7')
590 		    {
591 			config |= CLASS_o7;
592 			break;
593 		    }
594 		    return FAIL;
595 
596 		case 'a':
597 		    if (*(p + 2) == 'z')
598 		    {
599 			config |= CLASS_az;
600 			break;
601 		    }
602 		    if (*(p + 2) == 'f')
603 		    {
604 			config |= CLASS_af;
605 			break;
606 		    }
607 		    return FAIL;
608 
609 		case 'A':
610 		    if (*(p + 2) == 'Z')
611 		    {
612 			config |= CLASS_AZ;
613 			break;
614 		    }
615 		    if (*(p + 2) == 'F')
616 		    {
617 			config |= CLASS_AF;
618 			break;
619 		    }
620 		    return FAIL;
621 
622 		default:
623 		    return FAIL;
624 	    }
625 	    p += 3;
626 	}
627 	else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n')
628 	{
629 	    newl = TRUE;
630 	    p += 2;
631 	}
632 	else if (*p == '_')
633 	{
634 	    config |= CLASS_underscore;
635 	    p ++;
636 	}
637 	else if (*p == '\n')
638 	{
639 	    newl = TRUE;
640 	    p ++;
641 	}
642 	else
643 	    return FAIL;
644     } // while (p < end)
645 
646     if (p != end)
647 	return FAIL;
648 
649     if (newl == TRUE)
650 	extra_newl = NFA_ADD_NL;
651 
652     switch (config)
653     {
654 	case CLASS_o9:
655 	    return extra_newl + NFA_DIGIT;
656 	case CLASS_not |  CLASS_o9:
657 	    return extra_newl + NFA_NDIGIT;
658 	case CLASS_af | CLASS_AF | CLASS_o9:
659 	    return extra_newl + NFA_HEX;
660 	case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9:
661 	    return extra_newl + NFA_NHEX;
662 	case CLASS_o7:
663 	    return extra_newl + NFA_OCTAL;
664 	case CLASS_not | CLASS_o7:
665 	    return extra_newl + NFA_NOCTAL;
666 	case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
667 	    return extra_newl + NFA_WORD;
668 	case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
669 	    return extra_newl + NFA_NWORD;
670 	case CLASS_az | CLASS_AZ | CLASS_underscore:
671 	    return extra_newl + NFA_HEAD;
672 	case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore:
673 	    return extra_newl + NFA_NHEAD;
674 	case CLASS_az | CLASS_AZ:
675 	    return extra_newl + NFA_ALPHA;
676 	case CLASS_not | CLASS_az | CLASS_AZ:
677 	    return extra_newl + NFA_NALPHA;
678 	case CLASS_az:
679 	   return extra_newl + NFA_LOWER_IC;
680 	case CLASS_not | CLASS_az:
681 	    return extra_newl + NFA_NLOWER_IC;
682 	case CLASS_AZ:
683 	    return extra_newl + NFA_UPPER_IC;
684 	case CLASS_not | CLASS_AZ:
685 	    return extra_newl + NFA_NUPPER_IC;
686     }
687     return FAIL;
688 }
689 
690 /*
691  * Produce the bytes for equivalence class "c".
692  * Currently only handles latin1, latin9 and utf-8.
693  * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
694  * equivalent to 'a OR b OR c'
695  *
696  * NOTE! When changing this function, also update reg_equi_class()
697  */
698     static int
nfa_emit_equi_class(int c)699 nfa_emit_equi_class(int c)
700 {
701 #define EMIT2(c)    EMIT(c); EMIT(NFA_CONCAT);
702 
703     if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
704 					 || STRCMP(p_enc, "iso-8859-15") == 0)
705     {
706 #ifdef EBCDIC
707 # define A_circumflex 0x62
708 # define A_diaeresis 0x63
709 # define A_grave 0x64
710 # define A_acute 0x65
711 # define A_virguilla 0x66
712 # define A_ring 0x67
713 # define C_cedilla 0x68
714 # define E_acute 0x71
715 # define E_circumflex 0x72
716 # define E_diaeresis 0x73
717 # define E_grave 0x74
718 # define I_acute 0x75
719 # define I_circumflex 0x76
720 # define I_diaeresis 0x77
721 # define I_grave 0x78
722 # define N_virguilla 0x69
723 # define O_circumflex 0xeb
724 # define O_diaeresis 0xec
725 # define O_grave 0xed
726 # define O_acute 0xee
727 # define O_virguilla 0xef
728 # define O_slash 0x80
729 # define U_circumflex 0xfb
730 # define U_diaeresis 0xfc
731 # define U_grave 0xfd
732 # define U_acute 0xfe
733 # define Y_acute 0xba
734 # define a_grave 0x42
735 # define a_acute 0x43
736 # define a_circumflex 0x44
737 # define a_virguilla 0x45
738 # define a_diaeresis 0x46
739 # define a_ring 0x47
740 # define c_cedilla 0x48
741 # define e_grave 0x51
742 # define e_acute 0x52
743 # define e_circumflex 0x53
744 # define e_diaeresis 0x54
745 # define i_grave 0x55
746 # define i_acute 0x56
747 # define i_circumflex 0x57
748 # define i_diaeresis 0x58
749 # define n_virguilla 0x49
750 # define o_grave 0xcb
751 # define o_acute 0xcc
752 # define o_circumflex 0xcd
753 # define o_virguilla 0xce
754 # define o_diaeresis 0xcf
755 # define o_slash 0x70
756 # define u_grave 0xdb
757 # define u_acute 0xdc
758 # define u_circumflex 0xdd
759 # define u_diaeresis 0xde
760 # define y_acute 0x8d
761 # define y_diaeresis 0xdf
762 #else
763 # define A_grave 0xc0
764 # define A_acute 0xc1
765 # define A_circumflex 0xc2
766 # define A_virguilla 0xc3
767 # define A_diaeresis 0xc4
768 # define A_ring 0xc5
769 # define C_cedilla 0xc7
770 # define E_grave 0xc8
771 # define E_acute 0xc9
772 # define E_circumflex 0xca
773 # define E_diaeresis 0xcb
774 # define I_grave 0xcc
775 # define I_acute 0xcd
776 # define I_circumflex 0xce
777 # define I_diaeresis 0xcf
778 # define N_virguilla 0xd1
779 # define O_grave 0xd2
780 # define O_acute 0xd3
781 # define O_circumflex 0xd4
782 # define O_virguilla 0xd5
783 # define O_diaeresis 0xd6
784 # define O_slash 0xd8
785 # define U_grave 0xd9
786 # define U_acute 0xda
787 # define U_circumflex 0xdb
788 # define U_diaeresis 0xdc
789 # define Y_acute 0xdd
790 # define a_grave 0xe0
791 # define a_acute 0xe1
792 # define a_circumflex 0xe2
793 # define a_virguilla 0xe3
794 # define a_diaeresis 0xe4
795 # define a_ring 0xe5
796 # define c_cedilla 0xe7
797 # define e_grave 0xe8
798 # define e_acute 0xe9
799 # define e_circumflex 0xea
800 # define e_diaeresis 0xeb
801 # define i_grave 0xec
802 # define i_acute 0xed
803 # define i_circumflex 0xee
804 # define i_diaeresis 0xef
805 # define n_virguilla 0xf1
806 # define o_grave 0xf2
807 # define o_acute 0xf3
808 # define o_circumflex 0xf4
809 # define o_virguilla 0xf5
810 # define o_diaeresis 0xf6
811 # define o_slash 0xf8
812 # define u_grave 0xf9
813 # define u_acute 0xfa
814 # define u_circumflex 0xfb
815 # define u_diaeresis 0xfc
816 # define y_acute 0xfd
817 # define y_diaeresis 0xff
818 #endif
819 	switch (c)
820 	{
821 	    case 'A': case A_grave: case A_acute: case A_circumflex:
822 	    case A_virguilla: case A_diaeresis: case A_ring:
823 	    case 0x100: case 0x102: case 0x104: case 0x1cd:
824 	    case 0x1de: case 0x1e0: case 0x1fa: case 0x200:
825 	    case 0x202: case 0x226: case 0x23a: case 0x1e00:
826 	    case 0x1ea0: case 0x1ea2: case 0x1ea4: case 0x1ea6:
827 	    case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
828 	    case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
829 		    EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
830 		    EMIT2(A_circumflex) EMIT2(A_virguilla)
831 		    EMIT2(A_diaeresis) EMIT2(A_ring)
832 		    EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
833 		    EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
834 		    EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
835 		    EMIT2(0x226) EMIT2(0x23a) EMIT2(0x1e00)
836 		    EMIT2(0x1ea0) EMIT2(0x1ea2) EMIT2(0x1ea4)
837 		    EMIT2(0x1ea6) EMIT2(0x1ea8) EMIT2(0x1eaa)
838 		    EMIT2(0x1eac) EMIT2(0x1eae) EMIT2(0x1eb0)
839 		    EMIT2(0x1eb2) EMIT2(0x1eb6) EMIT2(0x1eb4)
840 		    return OK;
841 
842 	    case 'B': case 0x181: case 0x243: case 0x1e02:
843 	    case 0x1e04: case 0x1e06:
844 		    EMIT2('B')
845 		    EMIT2(0x181) EMIT2(0x243) EMIT2(0x1e02)
846 		    EMIT2(0x1e04) EMIT2(0x1e06)
847 		    return OK;
848 
849 	    case 'C': case C_cedilla: case 0x106: case 0x108:
850 	    case 0x10a: case 0x10c: case 0x187: case 0x23b:
851 	    case 0x1e08: case 0xa792:
852 		    EMIT2('C') EMIT2(C_cedilla)
853 		    EMIT2(0x106) EMIT2(0x108) EMIT2(0x10a)
854 		    EMIT2(0x10c) EMIT2(0x187) EMIT2(0x23b)
855 		    EMIT2(0x1e08) EMIT2(0xa792)
856 		    return OK;
857 
858 	    case 'D': case 0x10e: case 0x110: case 0x18a:
859 	    case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
860 	    case 0x1e12:
861 		    EMIT2('D') EMIT2(0x10e) EMIT2(0x110) EMIT2(0x18a)
862 		    EMIT2(0x1e0a) EMIT2(0x1e0c) EMIT2(0x1e0e)
863 		    EMIT2(0x1e10) EMIT2(0x1e12)
864 		    return OK;
865 
866 	    case 'E': case E_grave: case E_acute: case E_circumflex:
867 	    case E_diaeresis: case 0x112: case 0x114: case 0x116:
868 	    case 0x118: case 0x11a: case 0x204: case 0x206:
869 	    case 0x228: case 0x246: case 0x1e14: case 0x1e16:
870 	    case 0x1e18: case 0x1e1a: case 0x1e1c: case 0x1eb8:
871 	    case 0x1eba: case 0x1ebc: case 0x1ebe: case 0x1ec0:
872 	    case 0x1ec2: case 0x1ec4: case 0x1ec6:
873 		    EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
874 		    EMIT2(E_circumflex) EMIT2(E_diaeresis)
875 		    EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
876 		    EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
877 		    EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
878 		    EMIT2(0x1e14) EMIT2(0x1e16) EMIT2(0x1e18)
879 		    EMIT2(0x1e1a) EMIT2(0x1e1c) EMIT2(0x1eb8)
880 		    EMIT2(0x1eba) EMIT2(0x1ebc) EMIT2(0x1ebe)
881 		    EMIT2(0x1ec0) EMIT2(0x1ec2) EMIT2(0x1ec4)
882 		    EMIT2(0x1ec6)
883 		    return OK;
884 
885 	    case 'F': case 0x191: case 0x1e1e: case 0xa798:
886 		    EMIT2('F') EMIT2(0x191) EMIT2(0x1e1e) EMIT2(0xa798)
887 		    return OK;
888 
889 	    case 'G': case 0x11c: case 0x11e: case 0x120:
890 	    case 0x122: case 0x193: case 0x1e4: case 0x1e6:
891 	    case 0x1f4: case 0x1e20: case 0xa7a0:
892 		    EMIT2('G') EMIT2(0x11c) EMIT2(0x11e) EMIT2(0x120)
893 		    EMIT2(0x122) EMIT2(0x193) EMIT2(0x1e4)
894 		    EMIT2(0x1e6) EMIT2(0x1f4) EMIT2(0x1e20)
895 		    EMIT2(0xa7a0)
896 		    return OK;
897 
898 	    case 'H': case 0x124: case 0x126: case 0x21e:
899 	    case 0x1e22: case 0x1e24: case 0x1e26: case 0x1e28:
900 	    case 0x1e2a: case 0x2c67:
901 		    EMIT2('H') EMIT2(0x124) EMIT2(0x126) EMIT2(0x21e)
902 		    EMIT2(0x1e22) EMIT2(0x1e24) EMIT2(0x1e26)
903 		    EMIT2(0x1e28) EMIT2(0x1e2a) EMIT2(0x2c67)
904 		    return OK;
905 
906 	    case 'I': case I_grave: case I_acute: case I_circumflex:
907 	    case I_diaeresis: case 0x128: case 0x12a: case 0x12c:
908 	    case 0x12e: case 0x130: case 0x197: case 0x1cf:
909 	    case 0x208: case 0x20a: case 0x1e2c: case 0x1e2e:
910 	    case 0x1ec8: case 0x1eca:
911 		    EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
912 		    EMIT2(I_circumflex) EMIT2(I_diaeresis)
913 		    EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
914 		    EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
915 		    EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
916 		    EMIT2(0x1e2c) EMIT2(0x1e2e) EMIT2(0x1ec8)
917 		    EMIT2(0x1eca)
918 		    return OK;
919 
920 	    case 'J': case 0x134: case 0x248:
921 		    EMIT2('J') EMIT2(0x134) EMIT2(0x248)
922 		    return OK;
923 
924 	    case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
925 	    case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
926 		    EMIT2('K') EMIT2(0x136) EMIT2(0x198) EMIT2(0x1e8)
927 		    EMIT2(0x1e30) EMIT2(0x1e32) EMIT2(0x1e34)
928 		    EMIT2(0x2c69) EMIT2(0xa740)
929 		    return OK;
930 
931 	    case 'L': case 0x139: case 0x13b: case 0x13d:
932 	    case 0x13f: case 0x141: case 0x23d: case 0x1e36:
933 	    case 0x1e38: case 0x1e3a: case 0x1e3c: case 0x2c60:
934 		    EMIT2('L') EMIT2(0x139) EMIT2(0x13b)
935 		    EMIT2(0x13d) EMIT2(0x13f) EMIT2(0x141)
936 		    EMIT2(0x23d) EMIT2(0x1e36) EMIT2(0x1e38)
937 		    EMIT2(0x1e3a) EMIT2(0x1e3c) EMIT2(0x2c60)
938 		    return OK;
939 
940 	    case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
941 		    EMIT2('M') EMIT2(0x1e3e) EMIT2(0x1e40)
942 		    EMIT2(0x1e42)
943 		    return OK;
944 
945 	    case 'N': case N_virguilla:
946 	    case 0x143: case 0x145: case 0x147: case 0x1f8:
947 	    case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
948 	    case 0xa7a4:
949 		    EMIT2('N') EMIT2(N_virguilla)
950 		    EMIT2(0x143) EMIT2(0x145) EMIT2(0x147)
951 		    EMIT2(0x1f8) EMIT2(0x1e44) EMIT2(0x1e46)
952 		    EMIT2(0x1e48) EMIT2(0x1e4a) EMIT2(0xa7a4)
953 		    return OK;
954 
955 	    case 'O': case O_grave: case O_acute: case O_circumflex:
956 	    case O_virguilla: case O_diaeresis: case O_slash:
957 	    case 0x14c: case 0x14e: case 0x150: case 0x19f:
958 	    case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec:
959 	    case 0x1fe: case 0x20c: case 0x20e: case 0x22a:
960 	    case 0x22c: case 0x22e: case 0x230: case 0x1e4c:
961 	    case 0x1e4e: case 0x1e50: case 0x1e52: case 0x1ecc:
962 	    case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
963 	    case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc:
964 	    case 0x1ede: case 0x1ee0: case 0x1ee2:
965 		    EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
966 		    EMIT2(O_circumflex) EMIT2(O_virguilla)
967 		    EMIT2(O_diaeresis) EMIT2(O_slash)
968 		    EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
969 		    EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
970 		    EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
971 		    EMIT2(0x20c) EMIT2(0x20e) EMIT2(0x22a)
972 		    EMIT2(0x22c) EMIT2(0x22e) EMIT2(0x230)
973 		    EMIT2(0x1e4c) EMIT2(0x1e4e) EMIT2(0x1e50)
974 		    EMIT2(0x1e52) EMIT2(0x1ecc) EMIT2(0x1ece)
975 		    EMIT2(0x1ed0) EMIT2(0x1ed2) EMIT2(0x1ed4)
976 		    EMIT2(0x1ed6) EMIT2(0x1ed8) EMIT2(0x1eda)
977 		    EMIT2(0x1edc) EMIT2(0x1ede) EMIT2(0x1ee0)
978 		    EMIT2(0x1ee2)
979 		    return OK;
980 
981 	    case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
982 		    EMIT2('P') EMIT2(0x1a4) EMIT2(0x1e54) EMIT2(0x1e56)
983 		    EMIT2(0x2c63)
984 		    return OK;
985 
986 	    case 'Q': case 0x24a:
987 		    EMIT2('Q') EMIT2(0x24a)
988 		    return OK;
989 
990 	    case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
991 	    case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
992 	    case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
993 		    EMIT2('R') EMIT2(0x154) EMIT2(0x156) EMIT2(0x158)
994 		    EMIT2(0x210) EMIT2(0x212) EMIT2(0x24c) EMIT2(0x1e58)
995 		    EMIT2(0x1e5a) EMIT2(0x1e5c) EMIT2(0x1e5e) EMIT2(0x2c64)
996 		    EMIT2(0xa7a6)
997 		    return OK;
998 
999 	    case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
1000 	    case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
1001 	    case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
1002 		    EMIT2('S') EMIT2(0x15a) EMIT2(0x15c) EMIT2(0x15e)
1003 		    EMIT2(0x160) EMIT2(0x218) EMIT2(0x1e60) EMIT2(0x1e62)
1004 		    EMIT2(0x1e64) EMIT2(0x1e66) EMIT2(0x1e68) EMIT2(0x2c7e)
1005 		    EMIT2(0xa7a8)
1006 		    return OK;
1007 
1008 	    case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
1009 	    case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
1010 	    case 0x1e6e: case 0x1e70:
1011 		    EMIT2('T') EMIT2(0x162) EMIT2(0x164) EMIT2(0x166)
1012 		    EMIT2(0x1ac) EMIT2(0x1ae) EMIT2(0x23e) EMIT2(0x21a)
1013 		    EMIT2(0x1e6a) EMIT2(0x1e6c) EMIT2(0x1e6e) EMIT2(0x1e70)
1014 		    return OK;
1015 
1016 	    case 'U': case U_grave: case U_acute: case U_diaeresis:
1017 	    case U_circumflex: case 0x168: case 0x16a: case 0x16c:
1018 	    case 0x16e: case 0x170: case 0x172: case 0x1af:
1019 	    case 0x1d3: case 0x1d5: case 0x1d7: case 0x1d9:
1020 	    case 0x1db: case 0x214: case 0x216: case 0x244:
1021 	    case 0x1e72: case 0x1e74: case 0x1e76: case 0x1e78:
1022 	    case 0x1e7a: case 0x1ee4: case 0x1ee6: case 0x1ee8:
1023 	    case 0x1eea: case 0x1eec: case 0x1eee: case 0x1ef0:
1024 		    EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
1025 		    EMIT2(U_diaeresis) EMIT2(U_circumflex)
1026 		    EMIT2(0x168) EMIT2(0x16a)
1027 		    EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
1028 		    EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
1029 		    EMIT2(0x1d5) EMIT2(0x1d7) EMIT2(0x1d9)
1030 		    EMIT2(0x1db) EMIT2(0x214) EMIT2(0x216)
1031 		    EMIT2(0x244) EMIT2(0x1e72) EMIT2(0x1e74)
1032 		    EMIT2(0x1e76) EMIT2(0x1e78) EMIT2(0x1e7a)
1033 		    EMIT2(0x1ee4) EMIT2(0x1ee6) EMIT2(0x1ee8)
1034 		    EMIT2(0x1eea) EMIT2(0x1eec) EMIT2(0x1eee)
1035 		    EMIT2(0x1ef0)
1036 		    return OK;
1037 
1038 	    case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
1039 		    EMIT2('V') EMIT2(0x1b2) EMIT2(0x1e7c) EMIT2(0x1e7e)
1040 		    return OK;
1041 
1042 	    case 'W': case 0x174: case 0x1e80: case 0x1e82: case 0x1e84:
1043 	    case 0x1e86: case 0x1e88:
1044 		    EMIT2('W') EMIT2(0x174) EMIT2(0x1e80) EMIT2(0x1e82)
1045 		    EMIT2(0x1e84) EMIT2(0x1e86) EMIT2(0x1e88)
1046 		    return OK;
1047 
1048 	    case 'X': case 0x1e8a: case 0x1e8c:
1049 		    EMIT2('X') EMIT2(0x1e8a) EMIT2(0x1e8c)
1050 		    return OK;
1051 
1052 	    case 'Y': case Y_acute: case 0x176: case 0x178:
1053 	    case 0x1b3: case 0x232: case 0x24e: case 0x1e8e:
1054 	    case 0x1ef2: case 0x1ef4: case 0x1ef6: case 0x1ef8:
1055 		    EMIT2('Y') EMIT2(Y_acute)
1056 		    EMIT2(0x176) EMIT2(0x178) EMIT2(0x1b3)
1057 		    EMIT2(0x232) EMIT2(0x24e) EMIT2(0x1e8e)
1058 		    EMIT2(0x1ef2) EMIT2(0x1ef4) EMIT2(0x1ef6)
1059 		    EMIT2(0x1ef8)
1060 		    return OK;
1061 
1062 	    case 'Z': case 0x179: case 0x17b: case 0x17d:
1063 	    case 0x1b5: case 0x1e90: case 0x1e92: case 0x1e94:
1064 	    case 0x2c6b:
1065 		    EMIT2('Z') EMIT2(0x179) EMIT2(0x17b) EMIT2(0x17d)
1066 		    EMIT2(0x1b5) EMIT2(0x1e90) EMIT2(0x1e92)
1067 		    EMIT2(0x1e94) EMIT2(0x2c6b)
1068 		    return OK;
1069 
1070 	    case 'a': case a_grave: case a_acute: case a_circumflex:
1071 	    case a_virguilla: case a_diaeresis: case a_ring:
1072 	    case 0x101: case 0x103: case 0x105: case 0x1ce:
1073 	    case 0x1df: case 0x1e1: case 0x1fb: case 0x201:
1074 	    case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
1075 	    case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
1076 	    case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
1077 	    case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
1078 	    case 0x1eb7: case 0x2c65:
1079 		    EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)
1080 		    EMIT2(a_circumflex) EMIT2(a_virguilla)
1081 		    EMIT2(a_diaeresis) EMIT2(a_ring)
1082 		    EMIT2(0x101) EMIT2(0x103) EMIT2(0x105)
1083 		    EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1)
1084 		    EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203)
1085 		    EMIT2(0x227) EMIT2(0x1d8f) EMIT2(0x1e01)
1086 		    EMIT2(0x1e9a) EMIT2(0x1ea1) EMIT2(0x1ea3)
1087 		    EMIT2(0x1ea5) EMIT2(0x1ea7) EMIT2(0x1ea9)
1088 		    EMIT2(0x1eab) EMIT2(0x1ead) EMIT2(0x1eaf)
1089 		    EMIT2(0x1eb1) EMIT2(0x1eb3) EMIT2(0x1eb5)
1090 		    EMIT2(0x1eb7) EMIT2(0x2c65)
1091 		    return OK;
1092 
1093 	    case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
1094 	    case 0x1e03: case 0x1e05: case 0x1e07:
1095 		    EMIT2('b') EMIT2(0x180) EMIT2(0x253) EMIT2(0x1d6c)
1096 		    EMIT2(0x1d80) EMIT2(0x1e03) EMIT2(0x1e05) EMIT2(0x1e07)
1097 		    return OK;
1098 
1099 	    case 'c': case c_cedilla: case 0x107: case 0x109: case 0x10b:
1100 	    case 0x10d: case 0x188: case 0x23c: case 0x1e09: case 0xa793:
1101 	    case 0xa794:
1102 		    EMIT2('c') EMIT2(c_cedilla)
1103 		    EMIT2(0x107) EMIT2(0x109) EMIT2(0x10b)
1104 		    EMIT2(0x10d) EMIT2(0x188) EMIT2(0x23c)
1105 		    EMIT2(0x1e09) EMIT2(0xa793) EMIT2(0xa794)
1106 		    return OK;
1107 
1108 	    case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
1109 	    case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d: case 0x1e0f:
1110 	    case 0x1e11: case 0x1e13:
1111 		    EMIT2('d') EMIT2(0x10f) EMIT2(0x111)
1112 		    EMIT2(0x257) EMIT2(0x1d6d) EMIT2(0x1d81)
1113 		    EMIT2(0x1d91) EMIT2(0x1e0b) EMIT2(0x1e0d)
1114 		    EMIT2(0x1e0f) EMIT2(0x1e11) EMIT2(0x1e13)
1115 		    return OK;
1116 
1117 	    case 'e': case e_grave: case e_acute: case e_circumflex:
1118 	    case e_diaeresis: case 0x113: case 0x115: case 0x117:
1119 	    case 0x119: case 0x11b: case 0x205: case 0x207:
1120 	    case 0x229: case 0x247: case 0x1d92: case 0x1e15:
1121 	    case 0x1e17: case 0x1e19: case 0x1e1b: case 0x1e1d:
1122 	    case 0x1eb9: case 0x1ebb: case 0x1ebd: case 0x1ebf:
1123 	    case 0x1ec1: case 0x1ec3: case 0x1ec5: case 0x1ec7:
1124 		    EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)
1125 		    EMIT2(e_circumflex) EMIT2(e_diaeresis)
1126 		    EMIT2(0x113) EMIT2(0x115)
1127 		    EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b)
1128 		    EMIT2(0x205) EMIT2(0x207) EMIT2(0x229)
1129 		    EMIT2(0x247) EMIT2(0x1d92) EMIT2(0x1e15)
1130 		    EMIT2(0x1e17) EMIT2(0x1e19) EMIT2(0x1e1b)
1131 		    EMIT2(0x1e1d) EMIT2(0x1eb9) EMIT2(0x1ebb)
1132 		    EMIT2(0x1ebd) EMIT2(0x1ebf) EMIT2(0x1ec1)
1133 		    EMIT2(0x1ec3) EMIT2(0x1ec5) EMIT2(0x1ec7)
1134 		    return OK;
1135 
1136 	    case 'f': case 0x192: case 0x1d6e: case 0x1d82:
1137 	    case 0x1e1f: case 0xa799:
1138 		    EMIT2('f') EMIT2(0x192) EMIT2(0x1d6e) EMIT2(0x1d82)
1139 		    EMIT2(0x1e1f) EMIT2(0xa799)
1140 		    return OK;
1141 
1142 	    case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
1143 	    case 0x1e5: case 0x1e7: case 0x1f5: case 0x260: case 0x1d83:
1144 	    case 0x1e21: case 0xa7a1:
1145 		    EMIT2('g') EMIT2(0x11d) EMIT2(0x11f) EMIT2(0x121)
1146 		    EMIT2(0x123) EMIT2(0x1e5) EMIT2(0x1e7)
1147 		    EMIT2(0x1f5) EMIT2(0x260) EMIT2(0x1d83)
1148 		    EMIT2(0x1e21) EMIT2(0xa7a1)
1149 		    return OK;
1150 
1151 	    case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
1152 	    case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
1153 	    case 0x1e96: case 0x2c68: case 0xa795:
1154 		    EMIT2('h') EMIT2(0x125) EMIT2(0x127) EMIT2(0x21f)
1155 		    EMIT2(0x1e23) EMIT2(0x1e25) EMIT2(0x1e27)
1156 		    EMIT2(0x1e29) EMIT2(0x1e2b) EMIT2(0x1e96)
1157 		    EMIT2(0x2c68) EMIT2(0xa795)
1158 		    return OK;
1159 
1160 	    case 'i': case i_grave: case i_acute: case i_circumflex:
1161 	    case i_diaeresis: case 0x129: case 0x12b: case 0x12d:
1162 	    case 0x12f: case 0x1d0: case 0x209: case 0x20b:
1163 	    case 0x268: case 0x1d96: case 0x1e2d: case 0x1e2f:
1164 	    case 0x1ec9: case 0x1ecb:
1165 		    EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)
1166 		    EMIT2(i_circumflex) EMIT2(i_diaeresis)
1167 		    EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d)
1168 		    EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209)
1169 		    EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96)
1170 		    EMIT2(0x1e2d) EMIT2(0x1e2f) EMIT2(0x1ec9)
1171 		    EMIT2(0x1ecb) EMIT2(0x1ecb)
1172 		    return OK;
1173 
1174 	    case 'j': case 0x135: case 0x1f0: case 0x249:
1175 		    EMIT2('j') EMIT2(0x135) EMIT2(0x1f0) EMIT2(0x249)
1176 		    return OK;
1177 
1178 	    case 'k': case 0x137: case 0x199: case 0x1e9: case 0x1d84:
1179 	    case 0x1e31: case 0x1e33: case 0x1e35: case 0x2c6a: case 0xa741:
1180 		    EMIT2('k') EMIT2(0x137) EMIT2(0x199) EMIT2(0x1e9)
1181 		    EMIT2(0x1d84) EMIT2(0x1e31) EMIT2(0x1e33)
1182 		    EMIT2(0x1e35) EMIT2(0x2c6a) EMIT2(0xa741)
1183 		    return OK;
1184 
1185 	    case 'l': case 0x13a: case 0x13c: case 0x13e: case 0x140:
1186 	    case 0x142: case 0x19a: case 0x1e37: case 0x1e39: case 0x1e3b:
1187 	    case 0x1e3d: case 0x2c61:
1188 		    EMIT2('l') EMIT2(0x13a) EMIT2(0x13c)
1189 		    EMIT2(0x13e) EMIT2(0x140) EMIT2(0x142)
1190 		    EMIT2(0x19a) EMIT2(0x1e37) EMIT2(0x1e39)
1191 		    EMIT2(0x1e3b) EMIT2(0x1e3d) EMIT2(0x2c61)
1192 		    return OK;
1193 
1194 	    case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
1195 		    EMIT2('m') EMIT2(0x1d6f) EMIT2(0x1e3f)
1196 		    EMIT2(0x1e41) EMIT2(0x1e43)
1197 		    return OK;
1198 
1199 	    case 'n': case n_virguilla: case 0x144: case 0x146: case 0x148:
1200 	    case 0x149: case 0x1f9: case 0x1d70: case 0x1d87: case 0x1e45:
1201 	    case 0x1e47: case 0x1e49: case 0x1e4b: case 0xa7a5:
1202 		    EMIT2('n') EMIT2(n_virguilla)
1203 		    EMIT2(0x144) EMIT2(0x146) EMIT2(0x148)
1204 		    EMIT2(0x149) EMIT2(0x1f9) EMIT2(0x1d70)
1205 		    EMIT2(0x1d87) EMIT2(0x1e45) EMIT2(0x1e47)
1206 		    EMIT2(0x1e49) EMIT2(0x1e4b) EMIT2(0xa7a5)
1207 		    return OK;
1208 
1209 	    case 'o': case o_grave: case o_acute: case o_circumflex:
1210 	    case o_virguilla: case o_diaeresis: case o_slash:
1211 	    case 0x14d: case 0x14f: case 0x151: case 0x1a1:
1212 	    case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
1213 	    case 0x20d: case 0x20f: case 0x22b: case 0x22d:
1214 	    case 0x22f: case 0x231: case 0x275: case 0x1e4d:
1215 	    case 0x1e4f: case 0x1e51: case 0x1e53: case 0x1ecd:
1216 	    case 0x1ecf: case 0x1ed1: case 0x1ed3: case 0x1ed5:
1217 	    case 0x1ed7: case 0x1ed9: case 0x1edb: case 0x1edd:
1218 	    case 0x1edf: case 0x1ee1: case 0x1ee3:
1219 		    EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)
1220 		    EMIT2(o_circumflex) EMIT2(o_virguilla)
1221 		    EMIT2(o_diaeresis) EMIT2(o_slash)
1222 		    EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151)
1223 		    EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb)
1224 		    EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d)
1225 		    EMIT2(0x20f) EMIT2(0x22b) EMIT2(0x22d)
1226 		    EMIT2(0x22f) EMIT2(0x231) EMIT2(0x275)
1227 		    EMIT2(0x1e4d) EMIT2(0x1e4f) EMIT2(0x1e51)
1228 		    EMIT2(0x1e53) EMIT2(0x1ecd) EMIT2(0x1ecf)
1229 		    EMIT2(0x1ed1) EMIT2(0x1ed3) EMIT2(0x1ed5)
1230 		    EMIT2(0x1ed7) EMIT2(0x1ed9) EMIT2(0x1edb)
1231 		    EMIT2(0x1edd) EMIT2(0x1edf) EMIT2(0x1ee1)
1232 		    EMIT2(0x1ee3)
1233 		    return OK;
1234 
1235 	    case 'p': case 0x1a5: case 0x1d71: case 0x1d7d: case 0x1d88:
1236 	    case 0x1e55: case 0x1e57:
1237 		    EMIT2('p') EMIT2(0x1a5) EMIT2(0x1d71) EMIT2(0x1d7d)
1238 		    EMIT2(0x1d88) EMIT2(0x1e55) EMIT2(0x1e57)
1239 		    return OK;
1240 
1241 	    case 'q': case 0x24b: case 0x2a0:
1242 		    EMIT2('q') EMIT2(0x24b) EMIT2(0x2a0)
1243 		    return OK;
1244 
1245 	    case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
1246 	    case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
1247 	    case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
1248 	    case 0xa7a7:
1249 		    EMIT2('r') EMIT2(0x155) EMIT2(0x157) EMIT2(0x159)
1250 		    EMIT2(0x211) EMIT2(0x213) EMIT2(0x24d) EMIT2(0x27d)
1251 		    EMIT2(0x1d72) EMIT2(0x1d73) EMIT2(0x1d89) EMIT2(0x1e59)
1252 		    EMIT2(0x1e5b) EMIT2(0x1e5d) EMIT2(0x1e5f) EMIT2(0xa7a7)
1253 		    return OK;
1254 
1255 	    case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
1256 	    case 0x219: case 0x23f: case 0x1d74: case 0x1d8a: case 0x1e61:
1257 	    case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
1258 		    EMIT2('s') EMIT2(0x15b) EMIT2(0x15d) EMIT2(0x15f)
1259 		    EMIT2(0x161) EMIT2(0x219) EMIT2(0x23f) EMIT2(0x1d74)
1260 		    EMIT2(0x1d8a) EMIT2(0x1e61) EMIT2(0x1e63) EMIT2(0x1e65)
1261 		    EMIT2(0x1e67) EMIT2(0x1e69) EMIT2(0xa7a9)
1262 		    return OK;
1263 
1264 	    case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
1265 	    case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
1266 	    case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
1267 		    EMIT2('t') EMIT2(0x163) EMIT2(0x165) EMIT2(0x167)
1268 		    EMIT2(0x1ab) EMIT2(0x1ad) EMIT2(0x21b) EMIT2(0x288)
1269 		    EMIT2(0x1d75) EMIT2(0x1e6b) EMIT2(0x1e6d) EMIT2(0x1e6f)
1270 		    EMIT2(0x1e71) EMIT2(0x1e97) EMIT2(0x2c66)
1271 		    return OK;
1272 
1273 	    case 'u': case u_grave: case u_acute: case u_circumflex:
1274 	    case u_diaeresis: case 0x169: case 0x16b: case 0x16d:
1275 	    case 0x16f: case 0x171: case 0x173: case 0x1b0: case 0x1d4:
1276 	    case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc: case 0x215:
1277 	    case 0x217: case 0x289: case 0x1d7e: case 0x1d99: case 0x1e73:
1278 	    case 0x1e75: case 0x1e77: case 0x1e79: case 0x1e7b:
1279 	    case 0x1ee5: case 0x1ee7: case 0x1ee9: case 0x1eeb:
1280 	    case 0x1eed: case 0x1eef: case 0x1ef1:
1281 		    EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)
1282 		    EMIT2(u_circumflex) EMIT2(u_diaeresis)
1283 		    EMIT2(0x169) EMIT2(0x16b)
1284 		    EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171)
1285 		    EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8)
1286 		    EMIT2(0x215) EMIT2(0x217) EMIT2(0x1b0)
1287 		    EMIT2(0x1d4) EMIT2(0x1da) EMIT2(0x1dc)
1288 		    EMIT2(0x289) EMIT2(0x1e73) EMIT2(0x1d7e)
1289 		    EMIT2(0x1d99) EMIT2(0x1e75) EMIT2(0x1e77)
1290 		    EMIT2(0x1e79) EMIT2(0x1e7b) EMIT2(0x1ee5)
1291 		    EMIT2(0x1ee7) EMIT2(0x1ee9) EMIT2(0x1eeb)
1292 		    EMIT2(0x1eed) EMIT2(0x1eef) EMIT2(0x1ef1)
1293 		    return OK;
1294 
1295 	    case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
1296 		    EMIT2('v') EMIT2(0x28b) EMIT2(0x1d8c) EMIT2(0x1e7d)
1297 		    EMIT2(0x1e7f)
1298 		    return OK;
1299 
1300 	    case 'w': case 0x175: case 0x1e81: case 0x1e83: case 0x1e85:
1301 	    case 0x1e87: case 0x1e89: case 0x1e98:
1302 		    EMIT2('w') EMIT2(0x175) EMIT2(0x1e81) EMIT2(0x1e83)
1303 		    EMIT2(0x1e85) EMIT2(0x1e87) EMIT2(0x1e89) EMIT2(0x1e98)
1304 		    return OK;
1305 
1306 	    case 'x': case 0x1e8b: case 0x1e8d:
1307 		    EMIT2('x') EMIT2(0x1e8b) EMIT2(0x1e8d)
1308 		    return OK;
1309 
1310 	    case 'y': case y_acute: case y_diaeresis: case 0x177:
1311 	    case 0x1b4: case 0x233: case 0x24f: case 0x1e8f:
1312 	    case 0x1e99: case 0x1ef3: case 0x1ef5: case 0x1ef7:
1313 	    case 0x1ef9:
1314 		    EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)
1315 		    EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f)
1316 		    EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3)
1317 		    EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9)
1318 		    return OK;
1319 
1320 	    case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
1321 	    case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
1322 	    case 0x1e95: case 0x2c6c:
1323 		    EMIT2('z') EMIT2(0x17a) EMIT2(0x17c) EMIT2(0x17e)
1324 		    EMIT2(0x1b6) EMIT2(0x1d76) EMIT2(0x1d8e) EMIT2(0x1e91)
1325 		    EMIT2(0x1e93) EMIT2(0x1e95) EMIT2(0x2c6c)
1326 		    return OK;
1327 
1328 	    // default: character itself
1329 	}
1330     }
1331 
1332     EMIT2(c);
1333     return OK;
1334 #undef EMIT2
1335 #undef EMIT2
1336 }
1337 
1338 /*
1339  * Code to parse regular expression.
1340  *
1341  * We try to reuse parsing functions in regexp.c to
1342  * minimize surprise and keep the syntax consistent.
1343  */
1344 
1345 /*
1346  * Parse the lowest level.
1347  *
1348  * An atom can be one of a long list of items.  Many atoms match one character
1349  * in the text.  It is often an ordinary character or a character class.
1350  * Braces can be used to make a pattern into an atom.  The "\z(\)" construct
1351  * is only for syntax highlighting.
1352  *
1353  * atom    ::=     ordinary-atom
1354  *     or  \( pattern \)
1355  *     or  \%( pattern \)
1356  *     or  \z( pattern \)
1357  */
1358     static int
nfa_regatom(void)1359 nfa_regatom(void)
1360 {
1361     int		c;
1362     int		charclass;
1363     int		equiclass;
1364     int		collclass;
1365     int		got_coll_char;
1366     char_u	*p;
1367     char_u	*endp;
1368     char_u	*old_regparse = regparse;
1369     int		extra = 0;
1370     int		emit_range;
1371     int		negated;
1372     int		result;
1373     int		startc = -1;
1374     int		save_prev_at_start = prev_at_start;
1375 
1376     c = getchr();
1377     switch (c)
1378     {
1379 	case NUL:
1380 	    EMSG_RET_FAIL(_(e_nul_found));
1381 
1382 	case Magic('^'):
1383 	    EMIT(NFA_BOL);
1384 	    break;
1385 
1386 	case Magic('$'):
1387 	    EMIT(NFA_EOL);
1388 #if defined(FEAT_SYN_HL) || defined(PROTO)
1389 	    had_eol = TRUE;
1390 #endif
1391 	    break;
1392 
1393 	case Magic('<'):
1394 	    EMIT(NFA_BOW);
1395 	    break;
1396 
1397 	case Magic('>'):
1398 	    EMIT(NFA_EOW);
1399 	    break;
1400 
1401 	case Magic('_'):
1402 	    c = no_Magic(getchr());
1403 	    if (c == NUL)
1404 		EMSG_RET_FAIL(_(e_nul_found));
1405 
1406 	    if (c == '^')	// "\_^" is start-of-line
1407 	    {
1408 		EMIT(NFA_BOL);
1409 		break;
1410 	    }
1411 	    if (c == '$')	// "\_$" is end-of-line
1412 	    {
1413 		EMIT(NFA_EOL);
1414 #if defined(FEAT_SYN_HL) || defined(PROTO)
1415 		had_eol = TRUE;
1416 #endif
1417 		break;
1418 	    }
1419 
1420 	    extra = NFA_ADD_NL;
1421 
1422 	    // "\_[" is collection plus newline
1423 	    if (c == '[')
1424 		goto collection;
1425 
1426 	// "\_x" is character class plus newline
1427 	// FALLTHROUGH
1428 
1429 	/*
1430 	 * Character classes.
1431 	 */
1432 	case Magic('.'):
1433 	case Magic('i'):
1434 	case Magic('I'):
1435 	case Magic('k'):
1436 	case Magic('K'):
1437 	case Magic('f'):
1438 	case Magic('F'):
1439 	case Magic('p'):
1440 	case Magic('P'):
1441 	case Magic('s'):
1442 	case Magic('S'):
1443 	case Magic('d'):
1444 	case Magic('D'):
1445 	case Magic('x'):
1446 	case Magic('X'):
1447 	case Magic('o'):
1448 	case Magic('O'):
1449 	case Magic('w'):
1450 	case Magic('W'):
1451 	case Magic('h'):
1452 	case Magic('H'):
1453 	case Magic('a'):
1454 	case Magic('A'):
1455 	case Magic('l'):
1456 	case Magic('L'):
1457 	case Magic('u'):
1458 	case Magic('U'):
1459 	    p = vim_strchr(classchars, no_Magic(c));
1460 	    if (p == NULL)
1461 	    {
1462 		if (extra == NFA_ADD_NL)
1463 		{
1464 		    semsg(_(e_ill_char_class), c);
1465 		    rc_did_emsg = TRUE;
1466 		    return FAIL;
1467 		}
1468 		siemsg("INTERNAL: Unknown character class char: %d", c);
1469 		return FAIL;
1470 	    }
1471 
1472 	    // When '.' is followed by a composing char ignore the dot, so that
1473 	    // the composing char is matched here.
1474 	    if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1475 	    {
1476 		old_regparse = regparse;
1477 		c = getchr();
1478 		goto nfa_do_multibyte;
1479 	    }
1480 	    EMIT(nfa_classcodes[p - classchars]);
1481 	    if (extra == NFA_ADD_NL)
1482 	    {
1483 		EMIT(NFA_NEWL);
1484 		EMIT(NFA_OR);
1485 		regflags |= RF_HASNL;
1486 	    }
1487 	    break;
1488 
1489 	case Magic('n'):
1490 	    if (reg_string)
1491 		// In a string "\n" matches a newline character.
1492 		EMIT(NL);
1493 	    else
1494 	    {
1495 		// In buffer text "\n" matches the end of a line.
1496 		EMIT(NFA_NEWL);
1497 		regflags |= RF_HASNL;
1498 	    }
1499 	    break;
1500 
1501 	case Magic('('):
1502 	    if (nfa_reg(REG_PAREN) == FAIL)
1503 		return FAIL;	    // cascaded error
1504 	    break;
1505 
1506 	case Magic('|'):
1507 	case Magic('&'):
1508 	case Magic(')'):
1509 	    semsg(_(e_misplaced), no_Magic(c));
1510 	    return FAIL;
1511 
1512 	case Magic('='):
1513 	case Magic('?'):
1514 	case Magic('+'):
1515 	case Magic('@'):
1516 	case Magic('*'):
1517 	case Magic('{'):
1518 	    // these should follow an atom, not form an atom
1519 	    semsg(_(e_misplaced), no_Magic(c));
1520 	    return FAIL;
1521 
1522 	case Magic('~'):
1523 	    {
1524 		char_u	    *lp;
1525 
1526 		// Previous substitute pattern.
1527 		// Generated as "\%(pattern\)".
1528 		if (reg_prev_sub == NULL)
1529 		{
1530 		    emsg(_(e_no_previous_substitute_regular_expression));
1531 		    return FAIL;
1532 		}
1533 		for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp))
1534 		{
1535 		    EMIT(PTR2CHAR(lp));
1536 		    if (lp != reg_prev_sub)
1537 			EMIT(NFA_CONCAT);
1538 		}
1539 		EMIT(NFA_NOPEN);
1540 		break;
1541 	    }
1542 
1543 	case Magic('1'):
1544 	case Magic('2'):
1545 	case Magic('3'):
1546 	case Magic('4'):
1547 	case Magic('5'):
1548 	case Magic('6'):
1549 	case Magic('7'):
1550 	case Magic('8'):
1551 	case Magic('9'):
1552 	    {
1553 		int refnum = no_Magic(c) - '1';
1554 
1555 		if (!seen_endbrace(refnum + 1))
1556 		    return FAIL;
1557 		EMIT(NFA_BACKREF1 + refnum);
1558 		rex.nfa_has_backref = TRUE;
1559 	    }
1560 	    break;
1561 
1562 	case Magic('z'):
1563 	    c = no_Magic(getchr());
1564 	    switch (c)
1565 	    {
1566 		case 's':
1567 		    EMIT(NFA_ZSTART);
1568 		    if (re_mult_next("\\zs") == FAIL)
1569 			return FAIL;
1570 		    break;
1571 		case 'e':
1572 		    EMIT(NFA_ZEND);
1573 		    rex.nfa_has_zend = TRUE;
1574 		    if (re_mult_next("\\ze") == FAIL)
1575 			return FAIL;
1576 		    break;
1577 #ifdef FEAT_SYN_HL
1578 		case '1':
1579 		case '2':
1580 		case '3':
1581 		case '4':
1582 		case '5':
1583 		case '6':
1584 		case '7':
1585 		case '8':
1586 		case '9':
1587 		    // \z1...\z9
1588 		    if ((reg_do_extmatch & REX_USE) == 0)
1589 			EMSG_RET_FAIL(_(e_z1_not_allowed));
1590 		    EMIT(NFA_ZREF1 + (no_Magic(c) - '1'));
1591 		    // No need to set rex.nfa_has_backref, the sub-matches don't
1592 		    // change when \z1 .. \z9 matches or not.
1593 		    re_has_z = REX_USE;
1594 		    break;
1595 		case '(':
1596 		    // \z(
1597 		    if ((reg_do_extmatch & REX_SET) == 0)
1598 			EMSG_RET_FAIL(_(e_z_not_allowed));
1599 		    if (nfa_reg(REG_ZPAREN) == FAIL)
1600 			return FAIL;	    // cascaded error
1601 		    re_has_z = REX_SET;
1602 		    break;
1603 #endif
1604 		default:
1605 		    semsg(_("E867: (NFA) Unknown operator '\\z%c'"),
1606 								 no_Magic(c));
1607 		    return FAIL;
1608 	    }
1609 	    break;
1610 
1611 	case Magic('%'):
1612 	    c = no_Magic(getchr());
1613 	    switch (c)
1614 	    {
1615 		// () without a back reference
1616 		case '(':
1617 		    if (nfa_reg(REG_NPAREN) == FAIL)
1618 			return FAIL;
1619 		    EMIT(NFA_NOPEN);
1620 		    break;
1621 
1622 		case 'd':   // %d123 decimal
1623 		case 'o':   // %o123 octal
1624 		case 'x':   // %xab hex 2
1625 		case 'u':   // %uabcd hex 4
1626 		case 'U':   // %U1234abcd hex 8
1627 		    {
1628 			long nr;
1629 
1630 			switch (c)
1631 			{
1632 			    case 'd': nr = getdecchrs(); break;
1633 			    case 'o': nr = getoctchrs(); break;
1634 			    case 'x': nr = gethexchrs(2); break;
1635 			    case 'u': nr = gethexchrs(4); break;
1636 			    case 'U': nr = gethexchrs(8); break;
1637 			    default:  nr = -1; break;
1638 			}
1639 
1640 			if (nr < 0 || nr > INT_MAX)
1641 			    EMSG2_RET_FAIL(
1642 			       _("E678: Invalid character after %s%%[dxouU]"),
1643 				    reg_magic == MAGIC_ALL);
1644 			// A NUL is stored in the text as NL
1645 			// TODO: what if a composing character follows?
1646 			EMIT(nr == 0 ? 0x0a : nr);
1647 		    }
1648 		    break;
1649 
1650 		// Catch \%^ and \%$ regardless of where they appear in the
1651 		// pattern -- regardless of whether or not it makes sense.
1652 		case '^':
1653 		    EMIT(NFA_BOF);
1654 		    break;
1655 
1656 		case '$':
1657 		    EMIT(NFA_EOF);
1658 		    break;
1659 
1660 		case '#':
1661 		    EMIT(NFA_CURSOR);
1662 		    break;
1663 
1664 		case 'V':
1665 		    EMIT(NFA_VISUAL);
1666 		    break;
1667 
1668 		case 'C':
1669 		    EMIT(NFA_ANY_COMPOSING);
1670 		    break;
1671 
1672 		case '[':
1673 		    {
1674 			int	    n;
1675 
1676 			// \%[abc]
1677 			for (n = 0; (c = peekchr()) != ']'; ++n)
1678 			{
1679 			    if (c == NUL)
1680 				EMSG2_RET_FAIL(_(e_missing_sb),
1681 						      reg_magic == MAGIC_ALL);
1682 			    // recursive call!
1683 			    if (nfa_regatom() == FAIL)
1684 				return FAIL;
1685 			}
1686 			getchr();  // get the ]
1687 			if (n == 0)
1688 			    EMSG2_RET_FAIL(_(e_empty_sb),
1689 						      reg_magic == MAGIC_ALL);
1690 			EMIT(NFA_OPT_CHARS);
1691 			EMIT(n);
1692 
1693 			// Emit as "\%(\%[abc]\)" to be able to handle
1694 			// "\%[abc]*" which would cause the empty string to be
1695 			// matched an unlimited number of times. NFA_NOPEN is
1696 			// added only once at a position, while NFA_SPLIT is
1697 			// added multiple times.  This is more efficient than
1698 			// not allowing NFA_SPLIT multiple times, it is used
1699 			// a lot.
1700 			EMIT(NFA_NOPEN);
1701 			break;
1702 		    }
1703 
1704 		default:
1705 		    {
1706 			long_u	n = 0;
1707 			int	cmp = c;
1708 			int	cur = FALSE;
1709 
1710 			if (c == '<' || c == '>')
1711 			    c = getchr();
1712 			if (no_Magic(c) == '.')
1713 			{
1714 			    cur = TRUE;
1715 			    c = getchr();
1716 			}
1717 			while (VIM_ISDIGIT(c))
1718 			{
1719 			    long_u tmp;
1720 
1721 			    if (cur)
1722 				semsg(_(e_regexp_number_after_dot_pos_search),
1723 								 no_Magic(c));
1724 			    tmp = n * 10 + (c - '0');
1725 
1726 			    if (tmp < n)
1727 			    {
1728 				// overflow.
1729 				emsg(_(e_value_too_large));
1730 				return FAIL;
1731 			    }
1732 			    n = tmp;
1733 			    c = getchr();
1734 			}
1735 			if (c == 'l' || c == 'c' || c == 'v')
1736 			{
1737 			    long_u limit = INT_MAX;
1738 
1739 			    if (c == 'l')
1740 			    {
1741 				if (cur)
1742 				    n = curwin->w_cursor.lnum;
1743 				// \%{n}l  \%{n}<l  \%{n}>l
1744 				EMIT(cmp == '<' ? NFA_LNUM_LT :
1745 				     cmp == '>' ? NFA_LNUM_GT : NFA_LNUM);
1746 				if (save_prev_at_start)
1747 				    at_start = TRUE;
1748 			    }
1749 			    else if (c == 'c')
1750 			    {
1751 				if (cur)
1752 				{
1753 				    n = curwin->w_cursor.col;
1754 				    n++;
1755 				}
1756 				// \%{n}c  \%{n}<c  \%{n}>c
1757 				EMIT(cmp == '<' ? NFA_COL_LT :
1758 				     cmp == '>' ? NFA_COL_GT : NFA_COL);
1759 			    }
1760 			    else
1761 			    {
1762 				if (cur)
1763 				{
1764 				    colnr_T vcol = 0;
1765 
1766 				    getvvcol(curwin, &curwin->w_cursor,
1767 							    NULL, NULL, &vcol);
1768 				    n = ++vcol;
1769 				}
1770 				// \%{n}v  \%{n}<v  \%{n}>v
1771 				EMIT(cmp == '<' ? NFA_VCOL_LT :
1772 				     cmp == '>' ? NFA_VCOL_GT : NFA_VCOL);
1773 				limit = INT_MAX / MB_MAXBYTES;
1774 			    }
1775 			    if (n >= limit)
1776 			    {
1777 				emsg(_(e_value_too_large));
1778 				return FAIL;
1779 			    }
1780 			    EMIT((int)n);
1781 			    break;
1782 			}
1783 			else if (c == '\'' && n == 0)
1784 			{
1785 			    // \%'m  \%<'m  \%>'m
1786 			    EMIT(cmp == '<' ? NFA_MARK_LT :
1787 				 cmp == '>' ? NFA_MARK_GT : NFA_MARK);
1788 			    EMIT(getchr());
1789 			    break;
1790 			}
1791 		    }
1792 		    semsg(_("E867: (NFA) Unknown operator '\\%%%c'"),
1793 								 no_Magic(c));
1794 		    return FAIL;
1795 	    }
1796 	    break;
1797 
1798 	case Magic('['):
1799 collection:
1800 	    /*
1801 	     * [abc]  uses NFA_START_COLL - NFA_END_COLL
1802 	     * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
1803 	     * Each character is produced as a regular state, using
1804 	     * NFA_CONCAT to bind them together.
1805 	     * Besides normal characters there can be:
1806 	     * - character classes  NFA_CLASS_*
1807 	     * - ranges, two characters followed by NFA_RANGE.
1808 	     */
1809 
1810 	    p = regparse;
1811 	    endp = skip_anyof(p);
1812 	    if (*endp == ']')
1813 	    {
1814 		/*
1815 		 * Try to reverse engineer character classes. For example,
1816 		 * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
1817 		 * and perform the necessary substitutions in the NFA.
1818 		 */
1819 		result = nfa_recognize_char_class(regparse, endp,
1820 							 extra == NFA_ADD_NL);
1821 		if (result != FAIL)
1822 		{
1823 		    if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL)
1824 		    {
1825 			EMIT(result - NFA_ADD_NL);
1826 			EMIT(NFA_NEWL);
1827 			EMIT(NFA_OR);
1828 		    }
1829 		    else
1830 			EMIT(result);
1831 		    regparse = endp;
1832 		    MB_PTR_ADV(regparse);
1833 		    return OK;
1834 		}
1835 		/*
1836 		 * Failed to recognize a character class. Use the simple
1837 		 * version that turns [abc] into 'a' OR 'b' OR 'c'
1838 		 */
1839 		startc = -1;
1840 		negated = FALSE;
1841 		if (*regparse == '^')			// negated range
1842 		{
1843 		    negated = TRUE;
1844 		    MB_PTR_ADV(regparse);
1845 		    EMIT(NFA_START_NEG_COLL);
1846 		}
1847 		else
1848 		    EMIT(NFA_START_COLL);
1849 		if (*regparse == '-')
1850 		{
1851 		    startc = '-';
1852 		    EMIT(startc);
1853 		    EMIT(NFA_CONCAT);
1854 		    MB_PTR_ADV(regparse);
1855 		}
1856 		// Emit the OR branches for each character in the []
1857 		emit_range = FALSE;
1858 		while (regparse < endp)
1859 		{
1860 		    int	    oldstartc = startc;
1861 
1862 		    startc = -1;
1863 		    got_coll_char = FALSE;
1864 		    if (*regparse == '[')
1865 		    {
1866 			// Check for [: :], [= =], [. .]
1867 			equiclass = collclass = 0;
1868 			charclass = get_char_class(&regparse);
1869 			if (charclass == CLASS_NONE)
1870 			{
1871 			    equiclass = get_equi_class(&regparse);
1872 			    if (equiclass == 0)
1873 				collclass = get_coll_element(&regparse);
1874 			}
1875 
1876 			// Character class like [:alpha:]
1877 			if (charclass != CLASS_NONE)
1878 			{
1879 			    switch (charclass)
1880 			    {
1881 				case CLASS_ALNUM:
1882 				    EMIT(NFA_CLASS_ALNUM);
1883 				    break;
1884 				case CLASS_ALPHA:
1885 				    EMIT(NFA_CLASS_ALPHA);
1886 				    break;
1887 				case CLASS_BLANK:
1888 				    EMIT(NFA_CLASS_BLANK);
1889 				    break;
1890 				case CLASS_CNTRL:
1891 				    EMIT(NFA_CLASS_CNTRL);
1892 				    break;
1893 				case CLASS_DIGIT:
1894 				    EMIT(NFA_CLASS_DIGIT);
1895 				    break;
1896 				case CLASS_GRAPH:
1897 				    EMIT(NFA_CLASS_GRAPH);
1898 				    break;
1899 				case CLASS_LOWER:
1900 				    wants_nfa = TRUE;
1901 				    EMIT(NFA_CLASS_LOWER);
1902 				    break;
1903 				case CLASS_PRINT:
1904 				    EMIT(NFA_CLASS_PRINT);
1905 				    break;
1906 				case CLASS_PUNCT:
1907 				    EMIT(NFA_CLASS_PUNCT);
1908 				    break;
1909 				case CLASS_SPACE:
1910 				    EMIT(NFA_CLASS_SPACE);
1911 				    break;
1912 				case CLASS_UPPER:
1913 				    wants_nfa = TRUE;
1914 				    EMIT(NFA_CLASS_UPPER);
1915 				    break;
1916 				case CLASS_XDIGIT:
1917 				    EMIT(NFA_CLASS_XDIGIT);
1918 				    break;
1919 				case CLASS_TAB:
1920 				    EMIT(NFA_CLASS_TAB);
1921 				    break;
1922 				case CLASS_RETURN:
1923 				    EMIT(NFA_CLASS_RETURN);
1924 				    break;
1925 				case CLASS_BACKSPACE:
1926 				    EMIT(NFA_CLASS_BACKSPACE);
1927 				    break;
1928 				case CLASS_ESCAPE:
1929 				    EMIT(NFA_CLASS_ESCAPE);
1930 				    break;
1931 				case CLASS_IDENT:
1932 				    EMIT(NFA_CLASS_IDENT);
1933 				    break;
1934 				case CLASS_KEYWORD:
1935 				    EMIT(NFA_CLASS_KEYWORD);
1936 				    break;
1937 				case CLASS_FNAME:
1938 				    EMIT(NFA_CLASS_FNAME);
1939 				    break;
1940 			    }
1941 			    EMIT(NFA_CONCAT);
1942 			    continue;
1943 			}
1944 			// Try equivalence class [=a=] and the like
1945 			if (equiclass != 0)
1946 			{
1947 			    result = nfa_emit_equi_class(equiclass);
1948 			    if (result == FAIL)
1949 			    {
1950 				// should never happen
1951 				EMSG_RET_FAIL(_("E868: Error building NFA with equivalence class!"));
1952 			    }
1953 			    continue;
1954 			}
1955 			// Try collating class like [. .]
1956 			if (collclass != 0)
1957 			{
1958 			    startc = collclass;	 // allow [.a.]-x as a range
1959 			    // Will emit the proper atom at the end of the
1960 			    // while loop.
1961 			}
1962 		    }
1963 		    // Try a range like 'a-x' or '\t-z'. Also allows '-' as a
1964 		    // start character.
1965 		    if (*regparse == '-' && oldstartc != -1)
1966 		    {
1967 			emit_range = TRUE;
1968 			startc = oldstartc;
1969 			MB_PTR_ADV(regparse);
1970 			continue;	    // reading the end of the range
1971 		    }
1972 
1973 		    // Now handle simple and escaped characters.
1974 		    // Only "\]", "\^", "\]" and "\\" are special in Vi.  Vim
1975 		    // accepts "\t", "\e", etc., but only when the 'l' flag in
1976 		    // 'cpoptions' is not included.
1977 		    // Posix doesn't recognize backslash at all.
1978 		    if (*regparse == '\\'
1979 			    && !reg_cpo_bsl
1980 			    && regparse + 1 <= endp
1981 			    && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1982 				|| (!reg_cpo_lit
1983 				    && vim_strchr(REGEXP_ABBR, regparse[1])
1984 								      != NULL)
1985 			    )
1986 			)
1987 		    {
1988 			MB_PTR_ADV(regparse);
1989 
1990 			if (*regparse == 'n')
1991 			    startc = (reg_string || emit_range
1992 					|| regparse[1] == '-') ? NL : NFA_NEWL;
1993 			else if (*regparse == 'd'
1994 				    || *regparse == 'o'
1995 				    || *regparse == 'x'
1996 				    || *regparse == 'u'
1997 				    || *regparse == 'U'
1998 				)
1999 			    {
2000 				// TODO(RE) This needs more testing
2001 				startc = coll_get_char();
2002 				got_coll_char = TRUE;
2003 				MB_PTR_BACK(old_regparse, regparse);
2004 			    }
2005 			    else
2006 			    {
2007 				// \r,\t,\e,\b
2008 				startc = backslash_trans(*regparse);
2009 			    }
2010 		    }
2011 
2012 		    // Normal printable char
2013 		    if (startc == -1)
2014 			startc = PTR2CHAR(regparse);
2015 
2016 		    // Previous char was '-', so this char is end of range.
2017 		    if (emit_range)
2018 		    {
2019 			int	endc = startc;
2020 
2021 			startc = oldstartc;
2022 			if (startc > endc)
2023 			    EMSG_RET_FAIL(_(e_reverse_range));
2024 
2025 			if (endc > startc + 2)
2026 			{
2027 			    // Emit a range instead of the sequence of
2028 			    // individual characters.
2029 			    if (startc == 0)
2030 				// \x00 is translated to \x0a, start at \x01.
2031 				EMIT(1);
2032 			    else
2033 				--post_ptr; // remove NFA_CONCAT
2034 			    EMIT(endc);
2035 			    EMIT(NFA_RANGE);
2036 			    EMIT(NFA_CONCAT);
2037 			}
2038 			else if (has_mbyte && ((*mb_char2len)(startc) > 1
2039 				    || (*mb_char2len)(endc) > 1))
2040 			{
2041 			    // Emit the characters in the range.
2042 			    // "startc" was already emitted, so skip it.
2043 			    //
2044 			    for (c = startc + 1; c <= endc; c++)
2045 			    {
2046 				EMIT(c);
2047 				EMIT(NFA_CONCAT);
2048 			    }
2049 			}
2050 			else
2051 			{
2052 #ifdef EBCDIC
2053 			    int alpha_only = FALSE;
2054 
2055 			    // for alphabetical range skip the gaps
2056 			    // 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'.
2057 			    if (isalpha(startc) && isalpha(endc))
2058 				alpha_only = TRUE;
2059 #endif
2060 			    // Emit the range. "startc" was already emitted, so
2061 			    // skip it.
2062 			    for (c = startc + 1; c <= endc; c++)
2063 #ifdef EBCDIC
2064 				if (!alpha_only || isalpha(startc))
2065 #endif
2066 				{
2067 				    EMIT(c);
2068 				    EMIT(NFA_CONCAT);
2069 				}
2070 			}
2071 			emit_range = FALSE;
2072 			startc = -1;
2073 		    }
2074 		    else
2075 		    {
2076 			// This char (startc) is not part of a range. Just
2077 			// emit it.
2078 			// Normally, simply emit startc. But if we get char
2079 			// code=0 from a collating char, then replace it with
2080 			// 0x0a.
2081 			// This is needed to completely mimic the behaviour of
2082 			// the backtracking engine.
2083 			if (startc == NFA_NEWL)
2084 			{
2085 			    // Line break can't be matched as part of the
2086 			    // collection, add an OR below. But not for negated
2087 			    // range.
2088 			    if (!negated)
2089 				extra = NFA_ADD_NL;
2090 			}
2091 			else
2092 			{
2093 			    if (got_coll_char == TRUE && startc == 0)
2094 				EMIT(0x0a);
2095 			    else
2096 				EMIT(startc);
2097 			    EMIT(NFA_CONCAT);
2098 			}
2099 		    }
2100 
2101 		    MB_PTR_ADV(regparse);
2102 		} // while (p < endp)
2103 
2104 		MB_PTR_BACK(old_regparse, regparse);
2105 		if (*regparse == '-')	    // if last, '-' is just a char
2106 		{
2107 		    EMIT('-');
2108 		    EMIT(NFA_CONCAT);
2109 		}
2110 
2111 		// skip the trailing ]
2112 		regparse = endp;
2113 		MB_PTR_ADV(regparse);
2114 
2115 		// Mark end of the collection.
2116 		if (negated == TRUE)
2117 		    EMIT(NFA_END_NEG_COLL);
2118 		else
2119 		    EMIT(NFA_END_COLL);
2120 
2121 		// \_[] also matches \n but it's not negated
2122 		if (extra == NFA_ADD_NL)
2123 		{
2124 		    EMIT(reg_string ? NL : NFA_NEWL);
2125 		    EMIT(NFA_OR);
2126 		}
2127 
2128 		return OK;
2129 	    } // if exists closing ]
2130 
2131 	    if (reg_strict)
2132 		EMSG_RET_FAIL(_(e_missingbracket));
2133 	    // FALLTHROUGH
2134 
2135 	default:
2136 	    {
2137 		int	plen;
2138 
2139 nfa_do_multibyte:
2140 		// plen is length of current char with composing chars
2141 		if (enc_utf8 && ((*mb_char2len)(c)
2142 			    != (plen = utfc_ptr2len(old_regparse))
2143 						       || utf_iscomposing(c)))
2144 		{
2145 		    int i = 0;
2146 
2147 		    // A base character plus composing characters, or just one
2148 		    // or more composing characters.
2149 		    // This requires creating a separate atom as if enclosing
2150 		    // the characters in (), where NFA_COMPOSING is the ( and
2151 		    // NFA_END_COMPOSING is the ). Note that right now we are
2152 		    // building the postfix form, not the NFA itself;
2153 		    // a composing char could be: a, b, c, NFA_COMPOSING
2154 		    // where 'b' and 'c' are chars with codes > 256.
2155 		    for (;;)
2156 		    {
2157 			EMIT(c);
2158 			if (i > 0)
2159 			    EMIT(NFA_CONCAT);
2160 			if ((i += utf_char2len(c)) >= plen)
2161 			    break;
2162 			c = utf_ptr2char(old_regparse + i);
2163 		    }
2164 		    EMIT(NFA_COMPOSING);
2165 		    regparse = old_regparse + plen;
2166 		}
2167 		else
2168 		{
2169 		    c = no_Magic(c);
2170 		    EMIT(c);
2171 		}
2172 		return OK;
2173 	    }
2174     }
2175 
2176     return OK;
2177 }
2178 
2179 /*
2180  * Parse something followed by possible [*+=].
2181  *
2182  * A piece is an atom, possibly followed by a multi, an indication of how many
2183  * times the atom can be matched.  Example: "a*" matches any sequence of "a"
2184  * characters: "", "a", "aa", etc.
2185  *
2186  * piece   ::=	    atom
2187  *	or  atom  multi
2188  */
2189     static int
nfa_regpiece(void)2190 nfa_regpiece(void)
2191 {
2192     int		i;
2193     int		op;
2194     int		ret;
2195     long	minval, maxval;
2196     int		greedy = TRUE;      // Braces are prefixed with '-' ?
2197     parse_state_T old_state;
2198     parse_state_T new_state;
2199     long	c2;
2200     int		old_post_pos;
2201     int		my_post_start;
2202     int		quest;
2203 
2204     // Save the current parse state, so that we can use it if <atom>{m,n} is
2205     // next.
2206     save_parse_state(&old_state);
2207 
2208     // store current pos in the postfix form, for \{m,n} involving 0s
2209     my_post_start = (int)(post_ptr - post_start);
2210 
2211     ret = nfa_regatom();
2212     if (ret == FAIL)
2213 	return FAIL;	    // cascaded error
2214 
2215     op = peekchr();
2216     if (re_multi_type(op) == NOT_MULTI)
2217 	return OK;
2218 
2219     skipchr();
2220     switch (op)
2221     {
2222 	case Magic('*'):
2223 	    EMIT(NFA_STAR);
2224 	    break;
2225 
2226 	case Magic('+'):
2227 	    /*
2228 	     * Trick: Normally, (a*)\+ would match the whole input "aaa".  The
2229 	     * first and only submatch would be "aaa". But the backtracking
2230 	     * engine interprets the plus as "try matching one more time", and
2231 	     * a* matches a second time at the end of the input, the empty
2232 	     * string.
2233 	     * The submatch will be the empty string.
2234 	     *
2235 	     * In order to be consistent with the old engine, we replace
2236 	     * <atom>+ with <atom><atom>*
2237 	     */
2238 	    restore_parse_state(&old_state);
2239 	    curchr = -1;
2240 	    if (nfa_regatom() == FAIL)
2241 		return FAIL;
2242 	    EMIT(NFA_STAR);
2243 	    EMIT(NFA_CONCAT);
2244 	    skipchr();		// skip the \+
2245 	    break;
2246 
2247 	case Magic('@'):
2248 	    c2 = getdecchrs();
2249 	    op = no_Magic(getchr());
2250 	    i = 0;
2251 	    switch(op)
2252 	    {
2253 		case '=':
2254 		    // \@=
2255 		    i = NFA_PREV_ATOM_NO_WIDTH;
2256 		    break;
2257 		case '!':
2258 		    // \@!
2259 		    i = NFA_PREV_ATOM_NO_WIDTH_NEG;
2260 		    break;
2261 		case '<':
2262 		    op = no_Magic(getchr());
2263 		    if (op == '=')
2264 			// \@<=
2265 			i = NFA_PREV_ATOM_JUST_BEFORE;
2266 		    else if (op == '!')
2267 			// \@<!
2268 			i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
2269 		    break;
2270 		case '>':
2271 		    // \@>
2272 		    i = NFA_PREV_ATOM_LIKE_PATTERN;
2273 		    break;
2274 	    }
2275 	    if (i == 0)
2276 	    {
2277 		semsg(_("E869: (NFA) Unknown operator '\\@%c'"), op);
2278 		return FAIL;
2279 	    }
2280 	    EMIT(i);
2281 	    if (i == NFA_PREV_ATOM_JUST_BEFORE
2282 					|| i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
2283 		EMIT(c2);
2284 	    break;
2285 
2286 	case Magic('?'):
2287 	case Magic('='):
2288 	    EMIT(NFA_QUEST);
2289 	    break;
2290 
2291 	case Magic('{'):
2292 	    // a{2,5} will expand to 'aaa?a?a?'
2293 	    // a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
2294 	    // version of '?'
2295 	    // \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
2296 	    // parenthesis have the same id
2297 
2298 	    greedy = TRUE;
2299 	    c2 = peekchr();
2300 	    if (c2 == '-' || c2 == Magic('-'))
2301 	    {
2302 		skipchr();
2303 		greedy = FALSE;
2304 	    }
2305 	    if (!read_limits(&minval, &maxval))
2306 		EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits"));
2307 
2308 	    //  <atom>{0,inf}, <atom>{0,} and <atom>{}  are equivalent to
2309 	    //  <atom>*
2310 	    if (minval == 0 && maxval == MAX_LIMIT)
2311 	    {
2312 		if (greedy)		// { { (match the braces)
2313 		    // \{}, \{0,}
2314 		    EMIT(NFA_STAR);
2315 		else			// { { (match the braces)
2316 		    // \{-}, \{-0,}
2317 		    EMIT(NFA_STAR_NONGREEDY);
2318 		break;
2319 	    }
2320 
2321 	    // Special case: x{0} or x{-0}
2322 	    if (maxval == 0)
2323 	    {
2324 		// Ignore result of previous call to nfa_regatom()
2325 		post_ptr = post_start + my_post_start;
2326 		// NFA_EMPTY is 0-length and works everywhere
2327 		EMIT(NFA_EMPTY);
2328 		return OK;
2329 	    }
2330 
2331 	    // The engine is very inefficient (uses too many states) when the
2332 	    // maximum is much larger than the minimum and when the maximum is
2333 	    // large.  However, when maxval is MAX_LIMIT, it is okay, as this
2334 	    // will emit NFA_STAR.
2335 	    // Bail out if we can use the other engine, but only, when the
2336 	    // pattern does not need the NFA engine like (e.g. [[:upper:]]\{2,\}
2337 	    // does not work with with characters > 8 bit with the BT engine)
2338 	    if ((nfa_re_flags & RE_AUTO)
2339 				   && (maxval > 500 || maxval > minval + 200)
2340 				   && (maxval != MAX_LIMIT && minval < 200)
2341 				   && !wants_nfa)
2342 		return FAIL;
2343 
2344 	    // Ignore previous call to nfa_regatom()
2345 	    post_ptr = post_start + my_post_start;
2346 	    // Save parse state after the repeated atom and the \{}
2347 	    save_parse_state(&new_state);
2348 
2349 	    quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY);
2350 	    for (i = 0; i < maxval; i++)
2351 	    {
2352 		// Goto beginning of the repeated atom
2353 		restore_parse_state(&old_state);
2354 		old_post_pos = (int)(post_ptr - post_start);
2355 		if (nfa_regatom() == FAIL)
2356 		    return FAIL;
2357 		// after "minval" times, atoms are optional
2358 		if (i + 1 > minval)
2359 		{
2360 		    if (maxval == MAX_LIMIT)
2361 		    {
2362 			if (greedy)
2363 			    EMIT(NFA_STAR);
2364 			else
2365 			    EMIT(NFA_STAR_NONGREEDY);
2366 		    }
2367 		    else
2368 			EMIT(quest);
2369 		}
2370 		if (old_post_pos != my_post_start)
2371 		    EMIT(NFA_CONCAT);
2372 		if (i + 1 > minval && maxval == MAX_LIMIT)
2373 		    break;
2374 	    }
2375 
2376 	    // Go to just after the repeated atom and the \{}
2377 	    restore_parse_state(&new_state);
2378 	    curchr = -1;
2379 
2380 	    break;
2381 
2382 
2383 	default:
2384 	    break;
2385     }	// end switch
2386 
2387     if (re_multi_type(peekchr()) != NOT_MULTI)
2388 	// Can't have a multi follow a multi.
2389 	EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi"));
2390 
2391     return OK;
2392 }
2393 
2394 /*
2395  * Parse one or more pieces, concatenated.  It matches a match for the
2396  * first piece, followed by a match for the second piece, etc.  Example:
2397  * "f[0-9]b", first matches "f", then a digit and then "b".
2398  *
2399  * concat  ::=	    piece
2400  *	or  piece piece
2401  *	or  piece piece piece
2402  *	etc.
2403  */
2404     static int
nfa_regconcat(void)2405 nfa_regconcat(void)
2406 {
2407     int		cont = TRUE;
2408     int		first = TRUE;
2409 
2410     while (cont)
2411     {
2412 	switch (peekchr())
2413 	{
2414 	    case NUL:
2415 	    case Magic('|'):
2416 	    case Magic('&'):
2417 	    case Magic(')'):
2418 		cont = FALSE;
2419 		break;
2420 
2421 	    case Magic('Z'):
2422 		regflags |= RF_ICOMBINE;
2423 		skipchr_keepstart();
2424 		break;
2425 	    case Magic('c'):
2426 		regflags |= RF_ICASE;
2427 		skipchr_keepstart();
2428 		break;
2429 	    case Magic('C'):
2430 		regflags |= RF_NOICASE;
2431 		skipchr_keepstart();
2432 		break;
2433 	    case Magic('v'):
2434 		reg_magic = MAGIC_ALL;
2435 		skipchr_keepstart();
2436 		curchr = -1;
2437 		break;
2438 	    case Magic('m'):
2439 		reg_magic = MAGIC_ON;
2440 		skipchr_keepstart();
2441 		curchr = -1;
2442 		break;
2443 	    case Magic('M'):
2444 		reg_magic = MAGIC_OFF;
2445 		skipchr_keepstart();
2446 		curchr = -1;
2447 		break;
2448 	    case Magic('V'):
2449 		reg_magic = MAGIC_NONE;
2450 		skipchr_keepstart();
2451 		curchr = -1;
2452 		break;
2453 
2454 	    default:
2455 		if (nfa_regpiece() == FAIL)
2456 		    return FAIL;
2457 		if (first == FALSE)
2458 		    EMIT(NFA_CONCAT);
2459 		else
2460 		    first = FALSE;
2461 		break;
2462 	}
2463     }
2464 
2465     return OK;
2466 }
2467 
2468 /*
2469  * Parse a branch, one or more concats, separated by "\&".  It matches the
2470  * last concat, but only if all the preceding concats also match at the same
2471  * position.  Examples:
2472  *      "foobeep\&..." matches "foo" in "foobeep".
2473  *      ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob"
2474  *
2475  * branch ::=	    concat
2476  *		or  concat \& concat
2477  *		or  concat \& concat \& concat
2478  *		etc.
2479  */
2480     static int
nfa_regbranch(void)2481 nfa_regbranch(void)
2482 {
2483     int		old_post_pos;
2484 
2485     old_post_pos = (int)(post_ptr - post_start);
2486 
2487     // First branch, possibly the only one
2488     if (nfa_regconcat() == FAIL)
2489 	return FAIL;
2490 
2491     // Try next concats
2492     while (peekchr() == Magic('&'))
2493     {
2494 	skipchr();
2495 	// if concat is empty do emit a node
2496 	if (old_post_pos == (int)(post_ptr - post_start))
2497 	    EMIT(NFA_EMPTY);
2498 	EMIT(NFA_NOPEN);
2499 	EMIT(NFA_PREV_ATOM_NO_WIDTH);
2500 	old_post_pos = (int)(post_ptr - post_start);
2501 	if (nfa_regconcat() == FAIL)
2502 	    return FAIL;
2503 	// if concat is empty do emit a node
2504 	if (old_post_pos == (int)(post_ptr - post_start))
2505 	    EMIT(NFA_EMPTY);
2506 	EMIT(NFA_CONCAT);
2507     }
2508 
2509     // if a branch is empty, emit one node for it
2510     if (old_post_pos == (int)(post_ptr - post_start))
2511 	EMIT(NFA_EMPTY);
2512 
2513     return OK;
2514 }
2515 
2516 /*
2517  *  Parse a pattern, one or more branches, separated by "\|".  It matches
2518  *  anything that matches one of the branches.  Example: "foo\|beep" matches
2519  *  "foo" and matches "beep".  If more than one branch matches, the first one
2520  *  is used.
2521  *
2522  *  pattern ::=	    branch
2523  *	or  branch \| branch
2524  *	or  branch \| branch \| branch
2525  *	etc.
2526  */
2527     static int
nfa_reg(int paren)2528 nfa_reg(
2529     int		paren)	// REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
2530 {
2531     int		parno = 0;
2532 
2533     if (paren == REG_PAREN)
2534     {
2535 	if (regnpar >= NSUBEXP) // Too many `('
2536 	    EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('"));
2537 	parno = regnpar++;
2538     }
2539 #ifdef FEAT_SYN_HL
2540     else if (paren == REG_ZPAREN)
2541     {
2542 	// Make a ZOPEN node.
2543 	if (regnzpar >= NSUBEXP)
2544 	    EMSG_RET_FAIL(_("E879: (NFA regexp) Too many \\z("));
2545 	parno = regnzpar++;
2546     }
2547 #endif
2548 
2549     if (nfa_regbranch() == FAIL)
2550 	return FAIL;	    // cascaded error
2551 
2552     while (peekchr() == Magic('|'))
2553     {
2554 	skipchr();
2555 	if (nfa_regbranch() == FAIL)
2556 	    return FAIL;    // cascaded error
2557 	EMIT(NFA_OR);
2558     }
2559 
2560     // Check for proper termination.
2561     if (paren != REG_NOPAREN && getchr() != Magic(')'))
2562     {
2563 	if (paren == REG_NPAREN)
2564 	    EMSG2_RET_FAIL(_(e_unmatched_str_percent_open),
2565 						       reg_magic == MAGIC_ALL);
2566 	else
2567 	    EMSG2_RET_FAIL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
2568     }
2569     else if (paren == REG_NOPAREN && peekchr() != NUL)
2570     {
2571 	if (peekchr() == Magic(')'))
2572 	    EMSG2_RET_FAIL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
2573 	else
2574 	    EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error"));
2575     }
2576     /*
2577      * Here we set the flag allowing back references to this set of
2578      * parentheses.
2579      */
2580     if (paren == REG_PAREN)
2581     {
2582 	had_endbrace[parno] = TRUE;     // have seen the close paren
2583 	EMIT(NFA_MOPEN + parno);
2584     }
2585 #ifdef FEAT_SYN_HL
2586     else if (paren == REG_ZPAREN)
2587 	EMIT(NFA_ZOPEN + parno);
2588 #endif
2589 
2590     return OK;
2591 }
2592 
2593 #ifdef DEBUG
2594 static char_u code[50];
2595 
2596     static void
nfa_set_code(int c)2597 nfa_set_code(int c)
2598 {
2599     int	    addnl = FALSE;
2600 
2601     if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
2602     {
2603 	addnl = TRUE;
2604 	c -= NFA_ADD_NL;
2605     }
2606 
2607     STRCPY(code, "");
2608     switch (c)
2609     {
2610 	case NFA_MATCH:	    STRCPY(code, "NFA_MATCH "); break;
2611 	case NFA_SPLIT:	    STRCPY(code, "NFA_SPLIT "); break;
2612 	case NFA_CONCAT:    STRCPY(code, "NFA_CONCAT "); break;
2613 	case NFA_NEWL:	    STRCPY(code, "NFA_NEWL "); break;
2614 	case NFA_ZSTART:    STRCPY(code, "NFA_ZSTART"); break;
2615 	case NFA_ZEND:	    STRCPY(code, "NFA_ZEND"); break;
2616 
2617 	case NFA_BACKREF1:  STRCPY(code, "NFA_BACKREF1"); break;
2618 	case NFA_BACKREF2:  STRCPY(code, "NFA_BACKREF2"); break;
2619 	case NFA_BACKREF3:  STRCPY(code, "NFA_BACKREF3"); break;
2620 	case NFA_BACKREF4:  STRCPY(code, "NFA_BACKREF4"); break;
2621 	case NFA_BACKREF5:  STRCPY(code, "NFA_BACKREF5"); break;
2622 	case NFA_BACKREF6:  STRCPY(code, "NFA_BACKREF6"); break;
2623 	case NFA_BACKREF7:  STRCPY(code, "NFA_BACKREF7"); break;
2624 	case NFA_BACKREF8:  STRCPY(code, "NFA_BACKREF8"); break;
2625 	case NFA_BACKREF9:  STRCPY(code, "NFA_BACKREF9"); break;
2626 #ifdef FEAT_SYN_HL
2627 	case NFA_ZREF1:	    STRCPY(code, "NFA_ZREF1"); break;
2628 	case NFA_ZREF2:	    STRCPY(code, "NFA_ZREF2"); break;
2629 	case NFA_ZREF3:	    STRCPY(code, "NFA_ZREF3"); break;
2630 	case NFA_ZREF4:	    STRCPY(code, "NFA_ZREF4"); break;
2631 	case NFA_ZREF5:	    STRCPY(code, "NFA_ZREF5"); break;
2632 	case NFA_ZREF6:	    STRCPY(code, "NFA_ZREF6"); break;
2633 	case NFA_ZREF7:	    STRCPY(code, "NFA_ZREF7"); break;
2634 	case NFA_ZREF8:	    STRCPY(code, "NFA_ZREF8"); break;
2635 	case NFA_ZREF9:	    STRCPY(code, "NFA_ZREF9"); break;
2636 #endif
2637 	case NFA_SKIP:	    STRCPY(code, "NFA_SKIP"); break;
2638 
2639 	case NFA_PREV_ATOM_NO_WIDTH:
2640 			    STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
2641 	case NFA_PREV_ATOM_NO_WIDTH_NEG:
2642 			    STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
2643 	case NFA_PREV_ATOM_JUST_BEFORE:
2644 			    STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
2645 	case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2646 			 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
2647 	case NFA_PREV_ATOM_LIKE_PATTERN:
2648 			    STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
2649 
2650 	case NFA_NOPEN:		    STRCPY(code, "NFA_NOPEN"); break;
2651 	case NFA_NCLOSE:	    STRCPY(code, "NFA_NCLOSE"); break;
2652 	case NFA_START_INVISIBLE:   STRCPY(code, "NFA_START_INVISIBLE"); break;
2653 	case NFA_START_INVISIBLE_FIRST:
2654 			     STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
2655 	case NFA_START_INVISIBLE_NEG:
2656 			       STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
2657 	case NFA_START_INVISIBLE_NEG_FIRST:
2658 			 STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
2659 	case NFA_START_INVISIBLE_BEFORE:
2660 			    STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
2661 	case NFA_START_INVISIBLE_BEFORE_FIRST:
2662 		      STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
2663 	case NFA_START_INVISIBLE_BEFORE_NEG:
2664 			STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
2665 	case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
2666 		  STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
2667 	case NFA_START_PATTERN:   STRCPY(code, "NFA_START_PATTERN"); break;
2668 	case NFA_END_INVISIBLE:	    STRCPY(code, "NFA_END_INVISIBLE"); break;
2669 	case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
2670 	case NFA_END_PATTERN:	    STRCPY(code, "NFA_END_PATTERN"); break;
2671 
2672 	case NFA_COMPOSING:	    STRCPY(code, "NFA_COMPOSING"); break;
2673 	case NFA_END_COMPOSING:	    STRCPY(code, "NFA_END_COMPOSING"); break;
2674 	case NFA_OPT_CHARS:	    STRCPY(code, "NFA_OPT_CHARS"); break;
2675 
2676 	case NFA_MOPEN:
2677 	case NFA_MOPEN1:
2678 	case NFA_MOPEN2:
2679 	case NFA_MOPEN3:
2680 	case NFA_MOPEN4:
2681 	case NFA_MOPEN5:
2682 	case NFA_MOPEN6:
2683 	case NFA_MOPEN7:
2684 	case NFA_MOPEN8:
2685 	case NFA_MOPEN9:
2686 	    STRCPY(code, "NFA_MOPEN(x)");
2687 	    code[10] = c - NFA_MOPEN + '0';
2688 	    break;
2689 	case NFA_MCLOSE:
2690 	case NFA_MCLOSE1:
2691 	case NFA_MCLOSE2:
2692 	case NFA_MCLOSE3:
2693 	case NFA_MCLOSE4:
2694 	case NFA_MCLOSE5:
2695 	case NFA_MCLOSE6:
2696 	case NFA_MCLOSE7:
2697 	case NFA_MCLOSE8:
2698 	case NFA_MCLOSE9:
2699 	    STRCPY(code, "NFA_MCLOSE(x)");
2700 	    code[11] = c - NFA_MCLOSE + '0';
2701 	    break;
2702 #ifdef FEAT_SYN_HL
2703 	case NFA_ZOPEN:
2704 	case NFA_ZOPEN1:
2705 	case NFA_ZOPEN2:
2706 	case NFA_ZOPEN3:
2707 	case NFA_ZOPEN4:
2708 	case NFA_ZOPEN5:
2709 	case NFA_ZOPEN6:
2710 	case NFA_ZOPEN7:
2711 	case NFA_ZOPEN8:
2712 	case NFA_ZOPEN9:
2713 	    STRCPY(code, "NFA_ZOPEN(x)");
2714 	    code[10] = c - NFA_ZOPEN + '0';
2715 	    break;
2716 	case NFA_ZCLOSE:
2717 	case NFA_ZCLOSE1:
2718 	case NFA_ZCLOSE2:
2719 	case NFA_ZCLOSE3:
2720 	case NFA_ZCLOSE4:
2721 	case NFA_ZCLOSE5:
2722 	case NFA_ZCLOSE6:
2723 	case NFA_ZCLOSE7:
2724 	case NFA_ZCLOSE8:
2725 	case NFA_ZCLOSE9:
2726 	    STRCPY(code, "NFA_ZCLOSE(x)");
2727 	    code[11] = c - NFA_ZCLOSE + '0';
2728 	    break;
2729 #endif
2730 	case NFA_EOL:		STRCPY(code, "NFA_EOL "); break;
2731 	case NFA_BOL:		STRCPY(code, "NFA_BOL "); break;
2732 	case NFA_EOW:		STRCPY(code, "NFA_EOW "); break;
2733 	case NFA_BOW:		STRCPY(code, "NFA_BOW "); break;
2734 	case NFA_EOF:		STRCPY(code, "NFA_EOF "); break;
2735 	case NFA_BOF:		STRCPY(code, "NFA_BOF "); break;
2736 	case NFA_LNUM:		STRCPY(code, "NFA_LNUM "); break;
2737 	case NFA_LNUM_GT:	STRCPY(code, "NFA_LNUM_GT "); break;
2738 	case NFA_LNUM_LT:	STRCPY(code, "NFA_LNUM_LT "); break;
2739 	case NFA_COL:		STRCPY(code, "NFA_COL "); break;
2740 	case NFA_COL_GT:	STRCPY(code, "NFA_COL_GT "); break;
2741 	case NFA_COL_LT:	STRCPY(code, "NFA_COL_LT "); break;
2742 	case NFA_VCOL:		STRCPY(code, "NFA_VCOL "); break;
2743 	case NFA_VCOL_GT:	STRCPY(code, "NFA_VCOL_GT "); break;
2744 	case NFA_VCOL_LT:	STRCPY(code, "NFA_VCOL_LT "); break;
2745 	case NFA_MARK:		STRCPY(code, "NFA_MARK "); break;
2746 	case NFA_MARK_GT:	STRCPY(code, "NFA_MARK_GT "); break;
2747 	case NFA_MARK_LT:	STRCPY(code, "NFA_MARK_LT "); break;
2748 	case NFA_CURSOR:	STRCPY(code, "NFA_CURSOR "); break;
2749 	case NFA_VISUAL:	STRCPY(code, "NFA_VISUAL "); break;
2750 	case NFA_ANY_COMPOSING:	STRCPY(code, "NFA_ANY_COMPOSING "); break;
2751 
2752 	case NFA_STAR:		STRCPY(code, "NFA_STAR "); break;
2753 	case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
2754 	case NFA_QUEST:		STRCPY(code, "NFA_QUEST"); break;
2755 	case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
2756 	case NFA_EMPTY:		STRCPY(code, "NFA_EMPTY"); break;
2757 	case NFA_OR:		STRCPY(code, "NFA_OR"); break;
2758 
2759 	case NFA_START_COLL:	STRCPY(code, "NFA_START_COLL"); break;
2760 	case NFA_END_COLL:	STRCPY(code, "NFA_END_COLL"); break;
2761 	case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break;
2762 	case NFA_END_NEG_COLL:	STRCPY(code, "NFA_END_NEG_COLL"); break;
2763 	case NFA_RANGE:		STRCPY(code, "NFA_RANGE"); break;
2764 	case NFA_RANGE_MIN:	STRCPY(code, "NFA_RANGE_MIN"); break;
2765 	case NFA_RANGE_MAX:	STRCPY(code, "NFA_RANGE_MAX"); break;
2766 
2767 	case NFA_CLASS_ALNUM:	STRCPY(code, "NFA_CLASS_ALNUM"); break;
2768 	case NFA_CLASS_ALPHA:	STRCPY(code, "NFA_CLASS_ALPHA"); break;
2769 	case NFA_CLASS_BLANK:	STRCPY(code, "NFA_CLASS_BLANK"); break;
2770 	case NFA_CLASS_CNTRL:	STRCPY(code, "NFA_CLASS_CNTRL"); break;
2771 	case NFA_CLASS_DIGIT:	STRCPY(code, "NFA_CLASS_DIGIT"); break;
2772 	case NFA_CLASS_GRAPH:	STRCPY(code, "NFA_CLASS_GRAPH"); break;
2773 	case NFA_CLASS_LOWER:	STRCPY(code, "NFA_CLASS_LOWER"); break;
2774 	case NFA_CLASS_PRINT:	STRCPY(code, "NFA_CLASS_PRINT"); break;
2775 	case NFA_CLASS_PUNCT:	STRCPY(code, "NFA_CLASS_PUNCT"); break;
2776 	case NFA_CLASS_SPACE:	STRCPY(code, "NFA_CLASS_SPACE"); break;
2777 	case NFA_CLASS_UPPER:	STRCPY(code, "NFA_CLASS_UPPER"); break;
2778 	case NFA_CLASS_XDIGIT:	STRCPY(code, "NFA_CLASS_XDIGIT"); break;
2779 	case NFA_CLASS_TAB:	STRCPY(code, "NFA_CLASS_TAB"); break;
2780 	case NFA_CLASS_RETURN:	STRCPY(code, "NFA_CLASS_RETURN"); break;
2781 	case NFA_CLASS_BACKSPACE:   STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
2782 	case NFA_CLASS_ESCAPE:	STRCPY(code, "NFA_CLASS_ESCAPE"); break;
2783 	case NFA_CLASS_IDENT:	STRCPY(code, "NFA_CLASS_IDENT"); break;
2784 	case NFA_CLASS_KEYWORD:	STRCPY(code, "NFA_CLASS_KEYWORD"); break;
2785 	case NFA_CLASS_FNAME:	STRCPY(code, "NFA_CLASS_FNAME"); break;
2786 
2787 	case NFA_ANY:	STRCPY(code, "NFA_ANY"); break;
2788 	case NFA_IDENT:	STRCPY(code, "NFA_IDENT"); break;
2789 	case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break;
2790 	case NFA_KWORD:	STRCPY(code, "NFA_KWORD"); break;
2791 	case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break;
2792 	case NFA_FNAME:	STRCPY(code, "NFA_FNAME"); break;
2793 	case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break;
2794 	case NFA_PRINT:	STRCPY(code, "NFA_PRINT"); break;
2795 	case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break;
2796 	case NFA_WHITE:	STRCPY(code, "NFA_WHITE"); break;
2797 	case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break;
2798 	case NFA_DIGIT:	STRCPY(code, "NFA_DIGIT"); break;
2799 	case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break;
2800 	case NFA_HEX:	STRCPY(code, "NFA_HEX"); break;
2801 	case NFA_NHEX:	STRCPY(code, "NFA_NHEX"); break;
2802 	case NFA_OCTAL:	STRCPY(code, "NFA_OCTAL"); break;
2803 	case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break;
2804 	case NFA_WORD:	STRCPY(code, "NFA_WORD"); break;
2805 	case NFA_NWORD:	STRCPY(code, "NFA_NWORD"); break;
2806 	case NFA_HEAD:	STRCPY(code, "NFA_HEAD"); break;
2807 	case NFA_NHEAD:	STRCPY(code, "NFA_NHEAD"); break;
2808 	case NFA_ALPHA:	STRCPY(code, "NFA_ALPHA"); break;
2809 	case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break;
2810 	case NFA_LOWER:	STRCPY(code, "NFA_LOWER"); break;
2811 	case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
2812 	case NFA_UPPER:	STRCPY(code, "NFA_UPPER"); break;
2813 	case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
2814 	case NFA_LOWER_IC:  STRCPY(code, "NFA_LOWER_IC"); break;
2815 	case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
2816 	case NFA_UPPER_IC:  STRCPY(code, "NFA_UPPER_IC"); break;
2817 	case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;
2818 
2819 	default:
2820 	    STRCPY(code, "CHAR(x)");
2821 	    code[5] = c;
2822     }
2823 
2824     if (addnl == TRUE)
2825 	STRCAT(code, " + NEWLINE ");
2826 
2827 }
2828 
2829 #ifdef ENABLE_LOG
2830 static FILE *log_fd;
2831 static char_u e_log_open_failed[] = N_("Could not open temporary log file for writing, displaying on stderr... ");
2832 
2833 /*
2834  * Print the postfix notation of the current regexp.
2835  */
2836     static void
nfa_postfix_dump(char_u * expr,int retval)2837 nfa_postfix_dump(char_u *expr, int retval)
2838 {
2839     int *p;
2840     FILE *f;
2841 
2842     f = fopen(NFA_REGEXP_DUMP_LOG, "a");
2843     if (f != NULL)
2844     {
2845 	fprintf(f, "\n-------------------------\n");
2846 	if (retval == FAIL)
2847 	    fprintf(f, ">>> NFA engine failed... \n");
2848 	else if (retval == OK)
2849 	    fprintf(f, ">>> NFA engine succeeded !\n");
2850 	fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
2851 	for (p = post_start; *p && p < post_ptr; p++)
2852 	{
2853 	    nfa_set_code(*p);
2854 	    fprintf(f, "%s, ", code);
2855 	}
2856 	fprintf(f, "\"\nPostfix notation (int): ");
2857 	for (p = post_start; *p && p < post_ptr; p++)
2858 		fprintf(f, "%d ", *p);
2859 	fprintf(f, "\n\n");
2860 	fclose(f);
2861     }
2862 }
2863 
2864 /*
2865  * Print the NFA starting with a root node "state".
2866  */
2867     static void
nfa_print_state(FILE * debugf,nfa_state_T * state)2868 nfa_print_state(FILE *debugf, nfa_state_T *state)
2869 {
2870     garray_T indent;
2871 
2872     ga_init2(&indent, 1, 64);
2873     ga_append(&indent, '\0');
2874     nfa_print_state2(debugf, state, &indent);
2875     ga_clear(&indent);
2876 }
2877 
2878     static void
nfa_print_state2(FILE * debugf,nfa_state_T * state,garray_T * indent)2879 nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent)
2880 {
2881     char_u  *p;
2882 
2883     if (state == NULL)
2884 	return;
2885 
2886     fprintf(debugf, "(%2d)", abs(state->id));
2887 
2888     // Output indent
2889     p = (char_u *)indent->ga_data;
2890     if (indent->ga_len >= 3)
2891     {
2892 	int	last = indent->ga_len - 3;
2893 	char_u	save[2];
2894 
2895 	STRNCPY(save, &p[last], 2);
2896 	STRNCPY(&p[last], "+-", 2);
2897 	fprintf(debugf, " %s", p);
2898 	STRNCPY(&p[last], save, 2);
2899     }
2900     else
2901 	fprintf(debugf, " %s", p);
2902 
2903     nfa_set_code(state->c);
2904     fprintf(debugf, "%s (%d) (id=%d) val=%d\n",
2905 		 code,
2906 		 state->c,
2907 		 abs(state->id),
2908 		 state->val);
2909     if (state->id < 0)
2910 	return;
2911 
2912     state->id = abs(state->id) * -1;
2913 
2914     // grow indent for state->out
2915     indent->ga_len -= 1;
2916     if (state->out1)
2917 	ga_concat(indent, (char_u *)"| ");
2918     else
2919 	ga_concat(indent, (char_u *)"  ");
2920     ga_append(indent, '\0');
2921 
2922     nfa_print_state2(debugf, state->out, indent);
2923 
2924     // replace last part of indent for state->out1
2925     indent->ga_len -= 3;
2926     ga_concat(indent, (char_u *)"  ");
2927     ga_append(indent, '\0');
2928 
2929     nfa_print_state2(debugf, state->out1, indent);
2930 
2931     // shrink indent
2932     indent->ga_len -= 3;
2933     ga_append(indent, '\0');
2934 }
2935 
2936 /*
2937  * Print the NFA state machine.
2938  */
2939     static void
nfa_dump(nfa_regprog_T * prog)2940 nfa_dump(nfa_regprog_T *prog)
2941 {
2942     FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
2943 
2944     if (debugf != NULL)
2945     {
2946 	nfa_print_state(debugf, prog->start);
2947 
2948 	if (prog->reganch)
2949 	    fprintf(debugf, "reganch: %d\n", prog->reganch);
2950 	if (prog->regstart != NUL)
2951 	    fprintf(debugf, "regstart: %c (decimal: %d)\n",
2952 					      prog->regstart, prog->regstart);
2953 	if (prog->match_text != NULL)
2954 	    fprintf(debugf, "match_text: \"%s\"\n", prog->match_text);
2955 
2956 	fclose(debugf);
2957     }
2958 }
2959 #endif	    // ENABLE_LOG
2960 #endif	    // DEBUG
2961 
2962 /*
2963  * Parse r.e. @expr and convert it into postfix form.
2964  * Return the postfix string on success, NULL otherwise.
2965  */
2966     static int *
re2post(void)2967 re2post(void)
2968 {
2969     if (nfa_reg(REG_NOPAREN) == FAIL)
2970 	return NULL;
2971     EMIT(NFA_MOPEN);
2972     return post_start;
2973 }
2974 
2975 // NB. Some of the code below is inspired by Russ's.
2976 
2977 /*
2978  * Represents an NFA state plus zero or one or two arrows exiting.
2979  * if c == MATCH, no arrows out; matching state.
2980  * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
2981  * If c < 256, labeled arrow with character c to out.
2982  */
2983 
2984 static nfa_state_T	*state_ptr; // points to nfa_prog->state
2985 
2986 /*
2987  * Allocate and initialize nfa_state_T.
2988  */
2989     static nfa_state_T *
alloc_state(int c,nfa_state_T * out,nfa_state_T * out1)2990 alloc_state(int c, nfa_state_T *out, nfa_state_T *out1)
2991 {
2992     nfa_state_T *s;
2993 
2994     if (istate >= nstate)
2995 	return NULL;
2996 
2997     s = &state_ptr[istate++];
2998 
2999     s->c    = c;
3000     s->out  = out;
3001     s->out1 = out1;
3002     s->val  = 0;
3003 
3004     s->id   = istate;
3005     s->lastlist[0] = 0;
3006     s->lastlist[1] = 0;
3007 
3008     return s;
3009 }
3010 
3011 /*
3012  * A partially built NFA without the matching state filled in.
3013  * Frag_T.start points at the start state.
3014  * Frag_T.out is a list of places that need to be set to the
3015  * next state for this fragment.
3016  */
3017 
3018 // Since the out pointers in the list are always
3019 // uninitialized, we use the pointers themselves
3020 // as storage for the Ptrlists.
3021 typedef union Ptrlist Ptrlist;
3022 union Ptrlist
3023 {
3024     Ptrlist	*next;
3025     nfa_state_T	*s;
3026 };
3027 
3028 struct Frag
3029 {
3030     nfa_state_T *start;
3031     Ptrlist	*out;
3032 };
3033 typedef struct Frag Frag_T;
3034 
3035 /*
3036  * Initialize a Frag_T struct and return it.
3037  */
3038     static Frag_T
frag(nfa_state_T * start,Ptrlist * out)3039 frag(nfa_state_T *start, Ptrlist *out)
3040 {
3041     Frag_T n;
3042 
3043     n.start = start;
3044     n.out = out;
3045     return n;
3046 }
3047 
3048 /*
3049  * Create singleton list containing just outp.
3050  */
3051     static Ptrlist *
list1(nfa_state_T ** outp)3052 list1(
3053     nfa_state_T	**outp)
3054 {
3055     Ptrlist *l;
3056 
3057     l = (Ptrlist *)outp;
3058     l->next = NULL;
3059     return l;
3060 }
3061 
3062 /*
3063  * Patch the list of states at out to point to start.
3064  */
3065     static void
patch(Ptrlist * l,nfa_state_T * s)3066 patch(Ptrlist *l, nfa_state_T *s)
3067 {
3068     Ptrlist *next;
3069 
3070     for (; l; l = next)
3071     {
3072 	next = l->next;
3073 	l->s = s;
3074     }
3075 }
3076 
3077 
3078 /*
3079  * Join the two lists l1 and l2, returning the combination.
3080  */
3081     static Ptrlist *
append(Ptrlist * l1,Ptrlist * l2)3082 append(Ptrlist *l1, Ptrlist *l2)
3083 {
3084     Ptrlist *oldl1;
3085 
3086     oldl1 = l1;
3087     while (l1->next)
3088 	l1 = l1->next;
3089     l1->next = l2;
3090     return oldl1;
3091 }
3092 
3093 /*
3094  * Stack used for transforming postfix form into NFA.
3095  */
3096 static Frag_T empty;
3097 
3098     static void
st_error(int * postfix UNUSED,int * end UNUSED,int * p UNUSED)3099 st_error(int *postfix UNUSED, int *end UNUSED, int *p UNUSED)
3100 {
3101 #ifdef NFA_REGEXP_ERROR_LOG
3102     FILE *df;
3103     int *p2;
3104 
3105     df = fopen(NFA_REGEXP_ERROR_LOG, "a");
3106     if (df)
3107     {
3108 	fprintf(df, "Error popping the stack!\n");
3109 # ifdef DEBUG
3110 	fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
3111 # endif
3112 	fprintf(df, "Postfix form is: ");
3113 # ifdef DEBUG
3114 	for (p2 = postfix; p2 < end; p2++)
3115 	{
3116 	    nfa_set_code(*p2);
3117 	    fprintf(df, "%s, ", code);
3118 	}
3119 	nfa_set_code(*p);
3120 	fprintf(df, "\nCurrent position is: ");
3121 	for (p2 = postfix; p2 <= p; p2 ++)
3122 	{
3123 	    nfa_set_code(*p2);
3124 	    fprintf(df, "%s, ", code);
3125 	}
3126 # else
3127 	for (p2 = postfix; p2 < end; p2++)
3128 	    fprintf(df, "%d, ", *p2);
3129 	fprintf(df, "\nCurrent position is: ");
3130 	for (p2 = postfix; p2 <= p; p2 ++)
3131 	    fprintf(df, "%d, ", *p2);
3132 # endif
3133 	fprintf(df, "\n--------------------------\n");
3134 	fclose(df);
3135     }
3136 #endif
3137     emsg(_("E874: (NFA) Could not pop the stack!"));
3138 }
3139 
3140 /*
3141  * Push an item onto the stack.
3142  */
3143     static void
st_push(Frag_T s,Frag_T ** p,Frag_T * stack_end)3144 st_push(Frag_T s, Frag_T **p, Frag_T *stack_end)
3145 {
3146     Frag_T *stackp = *p;
3147 
3148     if (stackp >= stack_end)
3149 	return;
3150     *stackp = s;
3151     *p = *p + 1;
3152 }
3153 
3154 /*
3155  * Pop an item from the stack.
3156  */
3157     static Frag_T
st_pop(Frag_T ** p,Frag_T * stack)3158 st_pop(Frag_T **p, Frag_T *stack)
3159 {
3160     Frag_T *stackp;
3161 
3162     *p = *p - 1;
3163     stackp = *p;
3164     if (stackp < stack)
3165 	return empty;
3166     return **p;
3167 }
3168 
3169 /*
3170  * Estimate the maximum byte length of anything matching "state".
3171  * When unknown or unlimited return -1.
3172  */
3173     static int
nfa_max_width(nfa_state_T * startstate,int depth)3174 nfa_max_width(nfa_state_T *startstate, int depth)
3175 {
3176     int		    l, r;
3177     nfa_state_T	    *state = startstate;
3178     int		    len = 0;
3179 
3180     // detect looping in a NFA_SPLIT
3181     if (depth > 4)
3182 	return -1;
3183 
3184     while (state != NULL)
3185     {
3186 	switch (state->c)
3187 	{
3188 	    case NFA_END_INVISIBLE:
3189 	    case NFA_END_INVISIBLE_NEG:
3190 		// the end, return what we have
3191 		return len;
3192 
3193 	    case NFA_SPLIT:
3194 		// two alternatives, use the maximum
3195 		l = nfa_max_width(state->out, depth + 1);
3196 		r = nfa_max_width(state->out1, depth + 1);
3197 		if (l < 0 || r < 0)
3198 		    return -1;
3199 		return len + (l > r ? l : r);
3200 
3201 	    case NFA_ANY:
3202 	    case NFA_START_COLL:
3203 	    case NFA_START_NEG_COLL:
3204 		// matches some character, including composing chars
3205 		if (enc_utf8)
3206 		    len += MB_MAXBYTES;
3207 		else if (has_mbyte)
3208 		    len += 2;
3209 		else
3210 		    ++len;
3211 		if (state->c != NFA_ANY)
3212 		{
3213 		    // skip over the characters
3214 		    state = state->out1->out;
3215 		    continue;
3216 		}
3217 		break;
3218 
3219 	    case NFA_DIGIT:
3220 	    case NFA_WHITE:
3221 	    case NFA_HEX:
3222 	    case NFA_OCTAL:
3223 		// ascii
3224 		++len;
3225 		break;
3226 
3227 	    case NFA_IDENT:
3228 	    case NFA_SIDENT:
3229 	    case NFA_KWORD:
3230 	    case NFA_SKWORD:
3231 	    case NFA_FNAME:
3232 	    case NFA_SFNAME:
3233 	    case NFA_PRINT:
3234 	    case NFA_SPRINT:
3235 	    case NFA_NWHITE:
3236 	    case NFA_NDIGIT:
3237 	    case NFA_NHEX:
3238 	    case NFA_NOCTAL:
3239 	    case NFA_WORD:
3240 	    case NFA_NWORD:
3241 	    case NFA_HEAD:
3242 	    case NFA_NHEAD:
3243 	    case NFA_ALPHA:
3244 	    case NFA_NALPHA:
3245 	    case NFA_LOWER:
3246 	    case NFA_NLOWER:
3247 	    case NFA_UPPER:
3248 	    case NFA_NUPPER:
3249 	    case NFA_LOWER_IC:
3250 	    case NFA_NLOWER_IC:
3251 	    case NFA_UPPER_IC:
3252 	    case NFA_NUPPER_IC:
3253 	    case NFA_ANY_COMPOSING:
3254 		// possibly non-ascii
3255 		if (has_mbyte)
3256 		    len += 3;
3257 		else
3258 		    ++len;
3259 		break;
3260 
3261 	    case NFA_START_INVISIBLE:
3262 	    case NFA_START_INVISIBLE_NEG:
3263 	    case NFA_START_INVISIBLE_BEFORE:
3264 	    case NFA_START_INVISIBLE_BEFORE_NEG:
3265 		// zero-width, out1 points to the END state
3266 		state = state->out1->out;
3267 		continue;
3268 
3269 	    case NFA_BACKREF1:
3270 	    case NFA_BACKREF2:
3271 	    case NFA_BACKREF3:
3272 	    case NFA_BACKREF4:
3273 	    case NFA_BACKREF5:
3274 	    case NFA_BACKREF6:
3275 	    case NFA_BACKREF7:
3276 	    case NFA_BACKREF8:
3277 	    case NFA_BACKREF9:
3278 #ifdef FEAT_SYN_HL
3279 	    case NFA_ZREF1:
3280 	    case NFA_ZREF2:
3281 	    case NFA_ZREF3:
3282 	    case NFA_ZREF4:
3283 	    case NFA_ZREF5:
3284 	    case NFA_ZREF6:
3285 	    case NFA_ZREF7:
3286 	    case NFA_ZREF8:
3287 	    case NFA_ZREF9:
3288 #endif
3289 	    case NFA_NEWL:
3290 	    case NFA_SKIP:
3291 		// unknown width
3292 		return -1;
3293 
3294 	    case NFA_BOL:
3295 	    case NFA_EOL:
3296 	    case NFA_BOF:
3297 	    case NFA_EOF:
3298 	    case NFA_BOW:
3299 	    case NFA_EOW:
3300 	    case NFA_MOPEN:
3301 	    case NFA_MOPEN1:
3302 	    case NFA_MOPEN2:
3303 	    case NFA_MOPEN3:
3304 	    case NFA_MOPEN4:
3305 	    case NFA_MOPEN5:
3306 	    case NFA_MOPEN6:
3307 	    case NFA_MOPEN7:
3308 	    case NFA_MOPEN8:
3309 	    case NFA_MOPEN9:
3310 #ifdef FEAT_SYN_HL
3311 	    case NFA_ZOPEN:
3312 	    case NFA_ZOPEN1:
3313 	    case NFA_ZOPEN2:
3314 	    case NFA_ZOPEN3:
3315 	    case NFA_ZOPEN4:
3316 	    case NFA_ZOPEN5:
3317 	    case NFA_ZOPEN6:
3318 	    case NFA_ZOPEN7:
3319 	    case NFA_ZOPEN8:
3320 	    case NFA_ZOPEN9:
3321 	    case NFA_ZCLOSE:
3322 	    case NFA_ZCLOSE1:
3323 	    case NFA_ZCLOSE2:
3324 	    case NFA_ZCLOSE3:
3325 	    case NFA_ZCLOSE4:
3326 	    case NFA_ZCLOSE5:
3327 	    case NFA_ZCLOSE6:
3328 	    case NFA_ZCLOSE7:
3329 	    case NFA_ZCLOSE8:
3330 	    case NFA_ZCLOSE9:
3331 #endif
3332 	    case NFA_MCLOSE:
3333 	    case NFA_MCLOSE1:
3334 	    case NFA_MCLOSE2:
3335 	    case NFA_MCLOSE3:
3336 	    case NFA_MCLOSE4:
3337 	    case NFA_MCLOSE5:
3338 	    case NFA_MCLOSE6:
3339 	    case NFA_MCLOSE7:
3340 	    case NFA_MCLOSE8:
3341 	    case NFA_MCLOSE9:
3342 	    case NFA_NOPEN:
3343 	    case NFA_NCLOSE:
3344 
3345 	    case NFA_LNUM_GT:
3346 	    case NFA_LNUM_LT:
3347 	    case NFA_COL_GT:
3348 	    case NFA_COL_LT:
3349 	    case NFA_VCOL_GT:
3350 	    case NFA_VCOL_LT:
3351 	    case NFA_MARK_GT:
3352 	    case NFA_MARK_LT:
3353 	    case NFA_VISUAL:
3354 	    case NFA_LNUM:
3355 	    case NFA_CURSOR:
3356 	    case NFA_COL:
3357 	    case NFA_VCOL:
3358 	    case NFA_MARK:
3359 
3360 	    case NFA_ZSTART:
3361 	    case NFA_ZEND:
3362 	    case NFA_OPT_CHARS:
3363 	    case NFA_EMPTY:
3364 	    case NFA_START_PATTERN:
3365 	    case NFA_END_PATTERN:
3366 	    case NFA_COMPOSING:
3367 	    case NFA_END_COMPOSING:
3368 		// zero-width
3369 		break;
3370 
3371 	    default:
3372 		if (state->c < 0)
3373 		    // don't know what this is
3374 		    return -1;
3375 		// normal character
3376 		len += MB_CHAR2LEN(state->c);
3377 		break;
3378 	}
3379 
3380 	// normal way to continue
3381 	state = state->out;
3382     }
3383 
3384     // unrecognized, "cannot happen"
3385     return -1;
3386 }
3387 
3388 /*
3389  * Convert a postfix form into its equivalent NFA.
3390  * Return the NFA start state on success, NULL otherwise.
3391  */
3392     static nfa_state_T *
post2nfa(int * postfix,int * end,int nfa_calc_size)3393 post2nfa(int *postfix, int *end, int nfa_calc_size)
3394 {
3395     int		*p;
3396     int		mopen;
3397     int		mclose;
3398     Frag_T	*stack = NULL;
3399     Frag_T	*stackp = NULL;
3400     Frag_T	*stack_end = NULL;
3401     Frag_T	e1;
3402     Frag_T	e2;
3403     Frag_T	e;
3404     nfa_state_T	*s;
3405     nfa_state_T	*s1;
3406     nfa_state_T	*matchstate;
3407     nfa_state_T	*ret = NULL;
3408 
3409     if (postfix == NULL)
3410 	return NULL;
3411 
3412 #define PUSH(s)	    st_push((s), &stackp, stack_end)
3413 #define POP()	    st_pop(&stackp, stack);		\
3414 		    if (stackp < stack)			\
3415 		    {					\
3416 			st_error(postfix, end, p);	\
3417 			vim_free(stack);		\
3418 			return NULL;			\
3419 		    }
3420 
3421     if (nfa_calc_size == FALSE)
3422     {
3423 	// Allocate space for the stack. Max states on the stack: "nstate".
3424 	stack = ALLOC_MULT(Frag_T, nstate + 1);
3425 	if (stack == NULL)
3426 	    return NULL;
3427 	stackp = stack;
3428 	stack_end = stack + (nstate + 1);
3429     }
3430 
3431     for (p = postfix; p < end; ++p)
3432     {
3433 	switch (*p)
3434 	{
3435 	case NFA_CONCAT:
3436 	    // Concatenation.
3437 	    // Pay attention: this operator does not exist in the r.e. itself
3438 	    // (it is implicit, really).  It is added when r.e. is translated
3439 	    // to postfix form in re2post().
3440 	    if (nfa_calc_size == TRUE)
3441 	    {
3442 		// nstate += 0;
3443 		break;
3444 	    }
3445 	    e2 = POP();
3446 	    e1 = POP();
3447 	    patch(e1.out, e2.start);
3448 	    PUSH(frag(e1.start, e2.out));
3449 	    break;
3450 
3451 	case NFA_OR:
3452 	    // Alternation
3453 	    if (nfa_calc_size == TRUE)
3454 	    {
3455 		nstate++;
3456 		break;
3457 	    }
3458 	    e2 = POP();
3459 	    e1 = POP();
3460 	    s = alloc_state(NFA_SPLIT, e1.start, e2.start);
3461 	    if (s == NULL)
3462 		goto theend;
3463 	    PUSH(frag(s, append(e1.out, e2.out)));
3464 	    break;
3465 
3466 	case NFA_STAR:
3467 	    // Zero or more, prefer more
3468 	    if (nfa_calc_size == TRUE)
3469 	    {
3470 		nstate++;
3471 		break;
3472 	    }
3473 	    e = POP();
3474 	    s = alloc_state(NFA_SPLIT, e.start, NULL);
3475 	    if (s == NULL)
3476 		goto theend;
3477 	    patch(e.out, s);
3478 	    PUSH(frag(s, list1(&s->out1)));
3479 	    break;
3480 
3481 	case NFA_STAR_NONGREEDY:
3482 	    // Zero or more, prefer zero
3483 	    if (nfa_calc_size == TRUE)
3484 	    {
3485 		nstate++;
3486 		break;
3487 	    }
3488 	    e = POP();
3489 	    s = alloc_state(NFA_SPLIT, NULL, e.start);
3490 	    if (s == NULL)
3491 		goto theend;
3492 	    patch(e.out, s);
3493 	    PUSH(frag(s, list1(&s->out)));
3494 	    break;
3495 
3496 	case NFA_QUEST:
3497 	    // one or zero atoms=> greedy match
3498 	    if (nfa_calc_size == TRUE)
3499 	    {
3500 		nstate++;
3501 		break;
3502 	    }
3503 	    e = POP();
3504 	    s = alloc_state(NFA_SPLIT, e.start, NULL);
3505 	    if (s == NULL)
3506 		goto theend;
3507 	    PUSH(frag(s, append(e.out, list1(&s->out1))));
3508 	    break;
3509 
3510 	case NFA_QUEST_NONGREEDY:
3511 	    // zero or one atoms => non-greedy match
3512 	    if (nfa_calc_size == TRUE)
3513 	    {
3514 		nstate++;
3515 		break;
3516 	    }
3517 	    e = POP();
3518 	    s = alloc_state(NFA_SPLIT, NULL, e.start);
3519 	    if (s == NULL)
3520 		goto theend;
3521 	    PUSH(frag(s, append(e.out, list1(&s->out))));
3522 	    break;
3523 
3524 	case NFA_END_COLL:
3525 	case NFA_END_NEG_COLL:
3526 	    // On the stack is the sequence starting with NFA_START_COLL or
3527 	    // NFA_START_NEG_COLL and all possible characters. Patch it to
3528 	    // add the output to the start.
3529 	    if (nfa_calc_size == TRUE)
3530 	    {
3531 		nstate++;
3532 		break;
3533 	    }
3534 	    e = POP();
3535 	    s = alloc_state(NFA_END_COLL, NULL, NULL);
3536 	    if (s == NULL)
3537 		goto theend;
3538 	    patch(e.out, s);
3539 	    e.start->out1 = s;
3540 	    PUSH(frag(e.start, list1(&s->out)));
3541 	    break;
3542 
3543 	case NFA_RANGE:
3544 	    // Before this are two characters, the low and high end of a
3545 	    // range.  Turn them into two states with MIN and MAX.
3546 	    if (nfa_calc_size == TRUE)
3547 	    {
3548 		// nstate += 0;
3549 		break;
3550 	    }
3551 	    e2 = POP();
3552 	    e1 = POP();
3553 	    e2.start->val = e2.start->c;
3554 	    e2.start->c = NFA_RANGE_MAX;
3555 	    e1.start->val = e1.start->c;
3556 	    e1.start->c = NFA_RANGE_MIN;
3557 	    patch(e1.out, e2.start);
3558 	    PUSH(frag(e1.start, e2.out));
3559 	    break;
3560 
3561 	case NFA_EMPTY:
3562 	    // 0-length, used in a repetition with max/min count of 0
3563 	    if (nfa_calc_size == TRUE)
3564 	    {
3565 		nstate++;
3566 		break;
3567 	    }
3568 	    s = alloc_state(NFA_EMPTY, NULL, NULL);
3569 	    if (s == NULL)
3570 		goto theend;
3571 	    PUSH(frag(s, list1(&s->out)));
3572 	    break;
3573 
3574 	case NFA_OPT_CHARS:
3575 	  {
3576 	    int    n;
3577 
3578 	    // \%[abc] implemented as:
3579 	    //    NFA_SPLIT
3580 	    //    +-CHAR(a)
3581 	    //    | +-NFA_SPLIT
3582 	    //    |   +-CHAR(b)
3583 	    //    |   | +-NFA_SPLIT
3584 	    //    |   |   +-CHAR(c)
3585 	    //    |   |   | +-next
3586 	    //    |   |   +- next
3587 	    //    |   +- next
3588 	    //    +- next
3589 	    n = *++p; // get number of characters
3590 	    if (nfa_calc_size == TRUE)
3591 	    {
3592 		nstate += n;
3593 		break;
3594 	    }
3595 	    s = NULL; // avoid compiler warning
3596 	    e1.out = NULL; // stores list with out1's
3597 	    s1 = NULL; // previous NFA_SPLIT to connect to
3598 	    while (n-- > 0)
3599 	    {
3600 		e = POP(); // get character
3601 		s = alloc_state(NFA_SPLIT, e.start, NULL);
3602 		if (s == NULL)
3603 		    goto theend;
3604 		if (e1.out == NULL)
3605 		    e1 = e;
3606 		patch(e.out, s1);
3607 		append(e1.out, list1(&s->out1));
3608 		s1 = s;
3609 	    }
3610 	    PUSH(frag(s, e1.out));
3611 	    break;
3612 	  }
3613 
3614 	case NFA_PREV_ATOM_NO_WIDTH:
3615 	case NFA_PREV_ATOM_NO_WIDTH_NEG:
3616 	case NFA_PREV_ATOM_JUST_BEFORE:
3617 	case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3618 	case NFA_PREV_ATOM_LIKE_PATTERN:
3619 	  {
3620 	    int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
3621 				      || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
3622 	    int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
3623 	    int start_state;
3624 	    int end_state;
3625 	    int n = 0;
3626 	    nfa_state_T *zend;
3627 	    nfa_state_T *skip;
3628 
3629 	    switch (*p)
3630 	    {
3631 		case NFA_PREV_ATOM_NO_WIDTH:
3632 		    start_state = NFA_START_INVISIBLE;
3633 		    end_state = NFA_END_INVISIBLE;
3634 		    break;
3635 		case NFA_PREV_ATOM_NO_WIDTH_NEG:
3636 		    start_state = NFA_START_INVISIBLE_NEG;
3637 		    end_state = NFA_END_INVISIBLE_NEG;
3638 		    break;
3639 		case NFA_PREV_ATOM_JUST_BEFORE:
3640 		    start_state = NFA_START_INVISIBLE_BEFORE;
3641 		    end_state = NFA_END_INVISIBLE;
3642 		    break;
3643 		case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3644 		    start_state = NFA_START_INVISIBLE_BEFORE_NEG;
3645 		    end_state = NFA_END_INVISIBLE_NEG;
3646 		    break;
3647 		default: // NFA_PREV_ATOM_LIKE_PATTERN:
3648 		    start_state = NFA_START_PATTERN;
3649 		    end_state = NFA_END_PATTERN;
3650 		    break;
3651 	    }
3652 
3653 	    if (before)
3654 		n = *++p; // get the count
3655 
3656 	    // The \@= operator: match the preceding atom with zero width.
3657 	    // The \@! operator: no match for the preceding atom.
3658 	    // The \@<= operator: match for the preceding atom.
3659 	    // The \@<! operator: no match for the preceding atom.
3660 	    // Surrounds the preceding atom with START_INVISIBLE and
3661 	    // END_INVISIBLE, similarly to MOPEN.
3662 
3663 	    if (nfa_calc_size == TRUE)
3664 	    {
3665 		nstate += pattern ? 4 : 2;
3666 		break;
3667 	    }
3668 	    e = POP();
3669 	    s1 = alloc_state(end_state, NULL, NULL);
3670 	    if (s1 == NULL)
3671 		goto theend;
3672 
3673 	    s = alloc_state(start_state, e.start, s1);
3674 	    if (s == NULL)
3675 		goto theend;
3676 	    if (pattern)
3677 	    {
3678 		// NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows.
3679 		skip = alloc_state(NFA_SKIP, NULL, NULL);
3680 		if (skip == NULL)
3681 		    goto theend;
3682 		zend = alloc_state(NFA_ZEND, s1, NULL);
3683 		if (zend == NULL)
3684 		    goto theend;
3685 		s1->out= skip;
3686 		patch(e.out, zend);
3687 		PUSH(frag(s, list1(&skip->out)));
3688 	    }
3689 	    else
3690 	    {
3691 		patch(e.out, s1);
3692 		PUSH(frag(s, list1(&s1->out)));
3693 		if (before)
3694 		{
3695 		    if (n <= 0)
3696 			// See if we can guess the maximum width, it avoids a
3697 			// lot of pointless tries.
3698 			n = nfa_max_width(e.start, 0);
3699 		    s->val = n; // store the count
3700 		}
3701 	    }
3702 	    break;
3703 	  }
3704 
3705 	case NFA_COMPOSING:	// char with composing char
3706 #if 0
3707 	    // TODO
3708 	    if (regflags & RF_ICOMBINE)
3709 	    {
3710 		// use the base character only
3711 	    }
3712 #endif
3713 	    // FALLTHROUGH
3714 
3715 	case NFA_MOPEN:	// \( \) Submatch
3716 	case NFA_MOPEN1:
3717 	case NFA_MOPEN2:
3718 	case NFA_MOPEN3:
3719 	case NFA_MOPEN4:
3720 	case NFA_MOPEN5:
3721 	case NFA_MOPEN6:
3722 	case NFA_MOPEN7:
3723 	case NFA_MOPEN8:
3724 	case NFA_MOPEN9:
3725 #ifdef FEAT_SYN_HL
3726 	case NFA_ZOPEN:	// \z( \) Submatch
3727 	case NFA_ZOPEN1:
3728 	case NFA_ZOPEN2:
3729 	case NFA_ZOPEN3:
3730 	case NFA_ZOPEN4:
3731 	case NFA_ZOPEN5:
3732 	case NFA_ZOPEN6:
3733 	case NFA_ZOPEN7:
3734 	case NFA_ZOPEN8:
3735 	case NFA_ZOPEN9:
3736 #endif
3737 	case NFA_NOPEN:	// \%( \) "Invisible Submatch"
3738 	    if (nfa_calc_size == TRUE)
3739 	    {
3740 		nstate += 2;
3741 		break;
3742 	    }
3743 
3744 	    mopen = *p;
3745 	    switch (*p)
3746 	    {
3747 		case NFA_NOPEN: mclose = NFA_NCLOSE; break;
3748 #ifdef FEAT_SYN_HL
3749 		case NFA_ZOPEN: mclose = NFA_ZCLOSE; break;
3750 		case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break;
3751 		case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break;
3752 		case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break;
3753 		case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break;
3754 		case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break;
3755 		case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break;
3756 		case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break;
3757 		case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break;
3758 		case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break;
3759 #endif
3760 		case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break;
3761 		default:
3762 		    // NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9
3763 		    mclose = *p + NSUBEXP;
3764 		    break;
3765 	    }
3766 
3767 	    // Allow "NFA_MOPEN" as a valid postfix representation for
3768 	    // the empty regexp "". In this case, the NFA will be
3769 	    // NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
3770 	    // empty groups of parenthesis, and empty mbyte chars
3771 	    if (stackp == stack)
3772 	    {
3773 		s = alloc_state(mopen, NULL, NULL);
3774 		if (s == NULL)
3775 		    goto theend;
3776 		s1 = alloc_state(mclose, NULL, NULL);
3777 		if (s1 == NULL)
3778 		    goto theend;
3779 		patch(list1(&s->out), s1);
3780 		PUSH(frag(s, list1(&s1->out)));
3781 		break;
3782 	    }
3783 
3784 	    // At least one node was emitted before NFA_MOPEN, so
3785 	    // at least one node will be between NFA_MOPEN and NFA_MCLOSE
3786 	    e = POP();
3787 	    s = alloc_state(mopen, e.start, NULL);   // `('
3788 	    if (s == NULL)
3789 		goto theend;
3790 
3791 	    s1 = alloc_state(mclose, NULL, NULL);   // `)'
3792 	    if (s1 == NULL)
3793 		goto theend;
3794 	    patch(e.out, s1);
3795 
3796 	    if (mopen == NFA_COMPOSING)
3797 		// COMPOSING->out1 = END_COMPOSING
3798 		patch(list1(&s->out1), s1);
3799 
3800 	    PUSH(frag(s, list1(&s1->out)));
3801 	    break;
3802 
3803 	case NFA_BACKREF1:
3804 	case NFA_BACKREF2:
3805 	case NFA_BACKREF3:
3806 	case NFA_BACKREF4:
3807 	case NFA_BACKREF5:
3808 	case NFA_BACKREF6:
3809 	case NFA_BACKREF7:
3810 	case NFA_BACKREF8:
3811 	case NFA_BACKREF9:
3812 #ifdef FEAT_SYN_HL
3813 	case NFA_ZREF1:
3814 	case NFA_ZREF2:
3815 	case NFA_ZREF3:
3816 	case NFA_ZREF4:
3817 	case NFA_ZREF5:
3818 	case NFA_ZREF6:
3819 	case NFA_ZREF7:
3820 	case NFA_ZREF8:
3821 	case NFA_ZREF9:
3822 #endif
3823 	    if (nfa_calc_size == TRUE)
3824 	    {
3825 		nstate += 2;
3826 		break;
3827 	    }
3828 	    s = alloc_state(*p, NULL, NULL);
3829 	    if (s == NULL)
3830 		goto theend;
3831 	    s1 = alloc_state(NFA_SKIP, NULL, NULL);
3832 	    if (s1 == NULL)
3833 		goto theend;
3834 	    patch(list1(&s->out), s1);
3835 	    PUSH(frag(s, list1(&s1->out)));
3836 	    break;
3837 
3838 	case NFA_LNUM:
3839 	case NFA_LNUM_GT:
3840 	case NFA_LNUM_LT:
3841 	case NFA_VCOL:
3842 	case NFA_VCOL_GT:
3843 	case NFA_VCOL_LT:
3844 	case NFA_COL:
3845 	case NFA_COL_GT:
3846 	case NFA_COL_LT:
3847 	case NFA_MARK:
3848 	case NFA_MARK_GT:
3849 	case NFA_MARK_LT:
3850 	  {
3851 	    int n = *++p; // lnum, col or mark name
3852 
3853 	    if (nfa_calc_size == TRUE)
3854 	    {
3855 		nstate += 1;
3856 		break;
3857 	    }
3858 	    s = alloc_state(p[-1], NULL, NULL);
3859 	    if (s == NULL)
3860 		goto theend;
3861 	    s->val = n;
3862 	    PUSH(frag(s, list1(&s->out)));
3863 	    break;
3864 	  }
3865 
3866 	case NFA_ZSTART:
3867 	case NFA_ZEND:
3868 	default:
3869 	    // Operands
3870 	    if (nfa_calc_size == TRUE)
3871 	    {
3872 		nstate++;
3873 		break;
3874 	    }
3875 	    s = alloc_state(*p, NULL, NULL);
3876 	    if (s == NULL)
3877 		goto theend;
3878 	    PUSH(frag(s, list1(&s->out)));
3879 	    break;
3880 
3881 	} // switch(*p)
3882 
3883     } // for(p = postfix; *p; ++p)
3884 
3885     if (nfa_calc_size == TRUE)
3886     {
3887 	nstate++;
3888 	goto theend;	// Return value when counting size is ignored anyway
3889     }
3890 
3891     e = POP();
3892     if (stackp != stack)
3893     {
3894 	vim_free(stack);
3895 	EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA), too many states left on stack"));
3896     }
3897 
3898     if (istate >= nstate)
3899     {
3900 	vim_free(stack);
3901 	EMSG_RET_NULL(_("E876: (NFA regexp) Not enough space to store the whole NFA "));
3902     }
3903 
3904     matchstate = &state_ptr[istate++]; // the match state
3905     matchstate->c = NFA_MATCH;
3906     matchstate->out = matchstate->out1 = NULL;
3907     matchstate->id = 0;
3908 
3909     patch(e.out, matchstate);
3910     ret = e.start;
3911 
3912 theend:
3913     vim_free(stack);
3914     return ret;
3915 
3916 #undef POP1
3917 #undef PUSH1
3918 #undef POP2
3919 #undef PUSH2
3920 #undef POP
3921 #undef PUSH
3922 }
3923 
3924 /*
3925  * After building the NFA program, inspect it to add optimization hints.
3926  */
3927     static void
nfa_postprocess(nfa_regprog_T * prog)3928 nfa_postprocess(nfa_regprog_T *prog)
3929 {
3930     int i;
3931     int c;
3932 
3933     for (i = 0; i < prog->nstate; ++i)
3934     {
3935 	c = prog->state[i].c;
3936 	if (c == NFA_START_INVISIBLE
3937 		|| c == NFA_START_INVISIBLE_NEG
3938 		|| c == NFA_START_INVISIBLE_BEFORE
3939 		|| c == NFA_START_INVISIBLE_BEFORE_NEG)
3940 	{
3941 	    int directly;
3942 
3943 	    // Do it directly when what follows is possibly the end of the
3944 	    // match.
3945 	    if (match_follows(prog->state[i].out1->out, 0))
3946 		directly = TRUE;
3947 	    else
3948 	    {
3949 		int ch_invisible = failure_chance(prog->state[i].out, 0);
3950 		int ch_follows = failure_chance(prog->state[i].out1->out, 0);
3951 
3952 		// Postpone when the invisible match is expensive or has a
3953 		// lower chance of failing.
3954 		if (c == NFA_START_INVISIBLE_BEFORE
3955 		     || c == NFA_START_INVISIBLE_BEFORE_NEG)
3956 		{
3957 		    // "before" matches are very expensive when
3958 		    // unbounded, always prefer what follows then,
3959 		    // unless what follows will always match.
3960 		    // Otherwise strongly prefer what follows.
3961 		    if (prog->state[i].val <= 0 && ch_follows > 0)
3962 			directly = FALSE;
3963 		    else
3964 			directly = ch_follows * 10 < ch_invisible;
3965 		}
3966 		else
3967 		{
3968 		    // normal invisible, first do the one with the
3969 		    // highest failure chance
3970 		    directly = ch_follows < ch_invisible;
3971 		}
3972 	    }
3973 	    if (directly)
3974 		// switch to the _FIRST state
3975 		++prog->state[i].c;
3976 	}
3977     }
3978 }
3979 
3980 /////////////////////////////////////////////////////////////////
3981 // NFA execution code.
3982 /////////////////////////////////////////////////////////////////
3983 
3984 typedef struct
3985 {
3986     int	    in_use; // number of subexpr with useful info
3987 
3988     // When REG_MULTI is TRUE list.multi is used, otherwise list.line.
3989     union
3990     {
3991 	struct multipos
3992 	{
3993 	    linenr_T	start_lnum;
3994 	    linenr_T	end_lnum;
3995 	    colnr_T	start_col;
3996 	    colnr_T	end_col;
3997 	} multi[NSUBEXP];
3998 	struct linepos
3999 	{
4000 	    char_u	*start;
4001 	    char_u	*end;
4002 	} line[NSUBEXP];
4003     } list;
4004 } regsub_T;
4005 
4006 typedef struct
4007 {
4008     regsub_T	norm; // \( .. \) matches
4009 #ifdef FEAT_SYN_HL
4010     regsub_T	synt; // \z( .. \) matches
4011 #endif
4012 } regsubs_T;
4013 
4014 // nfa_pim_T stores a Postponed Invisible Match.
4015 typedef struct nfa_pim_S nfa_pim_T;
4016 struct nfa_pim_S
4017 {
4018     int		result;		// NFA_PIM_*, see below
4019     nfa_state_T	*state;		// the invisible match start state
4020     regsubs_T	subs;		// submatch info, only party used
4021     union
4022     {
4023 	lpos_T	pos;
4024 	char_u	*ptr;
4025     } end;			// where the match must end
4026 };
4027 
4028 // Values for done in nfa_pim_T.
4029 #define NFA_PIM_UNUSED   0	// pim not used
4030 #define NFA_PIM_TODO     1	// pim not done yet
4031 #define NFA_PIM_MATCH    2	// pim executed, matches
4032 #define NFA_PIM_NOMATCH  3	// pim executed, no match
4033 
4034 
4035 // nfa_thread_T contains execution information of a NFA state
4036 typedef struct
4037 {
4038     nfa_state_T	*state;
4039     int		count;
4040     nfa_pim_T	pim;		// if pim.result != NFA_PIM_UNUSED: postponed
4041 				// invisible match
4042     regsubs_T	subs;		// submatch info, only party used
4043 } nfa_thread_T;
4044 
4045 // nfa_list_T contains the alternative NFA execution states.
4046 typedef struct
4047 {
4048     nfa_thread_T    *t;		// allocated array of states
4049     int		    n;		// nr of states currently in "t"
4050     int		    len;	// max nr of states in "t"
4051     int		    id;		// ID of the list
4052     int		    has_pim;	// TRUE when any state has a PIM
4053 } nfa_list_T;
4054 
4055 #ifdef ENABLE_LOG
4056 static void log_subexpr(regsub_T *sub);
4057 
4058     static void
log_subsexpr(regsubs_T * subs)4059 log_subsexpr(regsubs_T *subs)
4060 {
4061     log_subexpr(&subs->norm);
4062 # ifdef FEAT_SYN_HL
4063     if (rex.nfa_has_zsubexpr)
4064 	log_subexpr(&subs->synt);
4065 # endif
4066 }
4067 
4068     static void
log_subexpr(regsub_T * sub)4069 log_subexpr(regsub_T *sub)
4070 {
4071     int j;
4072 
4073     for (j = 0; j < sub->in_use; j++)
4074 	if (REG_MULTI)
4075 	    fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
4076 		    j,
4077 		    sub->list.multi[j].start_col,
4078 		    (int)sub->list.multi[j].start_lnum,
4079 		    sub->list.multi[j].end_col,
4080 		    (int)sub->list.multi[j].end_lnum);
4081 	else
4082 	{
4083 	    char *s = (char *)sub->list.line[j].start;
4084 	    char *e = (char *)sub->list.line[j].end;
4085 
4086 	    fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
4087 		    j,
4088 		    s == NULL ? "NULL" : s,
4089 		    e == NULL ? "NULL" : e);
4090 	}
4091 }
4092 
4093     static char *
pim_info(nfa_pim_T * pim)4094 pim_info(nfa_pim_T *pim)
4095 {
4096     static char buf[30];
4097 
4098     if (pim == NULL || pim->result == NFA_PIM_UNUSED)
4099 	buf[0] = NUL;
4100     else
4101     {
4102 	sprintf(buf, " PIM col %d", REG_MULTI ? (int)pim->end.pos.col
4103 		: (int)(pim->end.ptr - rex.input));
4104     }
4105     return buf;
4106 }
4107 
4108 #endif
4109 
4110 // Used during execution: whether a match has been found.
4111 static int	    nfa_match;
4112 #ifdef FEAT_RELTIME
4113 static proftime_T  *nfa_time_limit;
4114 static int	   *nfa_timed_out;
4115 static int	    nfa_time_count;
4116 #endif
4117 
4118 static void copy_sub(regsub_T *to, regsub_T *from);
4119 static int pim_equal(nfa_pim_T *one, nfa_pim_T *two);
4120 
4121 /*
4122  * Copy postponed invisible match info from "from" to "to".
4123  */
4124     static void
copy_pim(nfa_pim_T * to,nfa_pim_T * from)4125 copy_pim(nfa_pim_T *to, nfa_pim_T *from)
4126 {
4127     to->result = from->result;
4128     to->state = from->state;
4129     copy_sub(&to->subs.norm, &from->subs.norm);
4130 #ifdef FEAT_SYN_HL
4131     if (rex.nfa_has_zsubexpr)
4132 	copy_sub(&to->subs.synt, &from->subs.synt);
4133 #endif
4134     to->end = from->end;
4135 }
4136 
4137     static void
clear_sub(regsub_T * sub)4138 clear_sub(regsub_T *sub)
4139 {
4140     if (REG_MULTI)
4141 	// Use 0xff to set lnum to -1
4142 	vim_memset(sub->list.multi, 0xff,
4143 				  sizeof(struct multipos) * rex.nfa_nsubexpr);
4144     else
4145 	vim_memset(sub->list.line, 0,
4146 				   sizeof(struct linepos) * rex.nfa_nsubexpr);
4147     sub->in_use = 0;
4148 }
4149 
4150 /*
4151  * Copy the submatches from "from" to "to".
4152  */
4153     static void
copy_sub(regsub_T * to,regsub_T * from)4154 copy_sub(regsub_T *to, regsub_T *from)
4155 {
4156     to->in_use = from->in_use;
4157     if (from->in_use > 0)
4158     {
4159 	// Copy the match start and end positions.
4160 	if (REG_MULTI)
4161 	    mch_memmove(&to->list.multi[0],
4162 			&from->list.multi[0],
4163 			sizeof(struct multipos) * from->in_use);
4164 	else
4165 	    mch_memmove(&to->list.line[0],
4166 			&from->list.line[0],
4167 			sizeof(struct linepos) * from->in_use);
4168     }
4169 }
4170 
4171 /*
4172  * Like copy_sub() but exclude the main match.
4173  */
4174     static void
copy_sub_off(regsub_T * to,regsub_T * from)4175 copy_sub_off(regsub_T *to, regsub_T *from)
4176 {
4177     if (to->in_use < from->in_use)
4178 	to->in_use = from->in_use;
4179     if (from->in_use > 1)
4180     {
4181 	// Copy the match start and end positions.
4182 	if (REG_MULTI)
4183 	    mch_memmove(&to->list.multi[1],
4184 			&from->list.multi[1],
4185 			sizeof(struct multipos) * (from->in_use - 1));
4186 	else
4187 	    mch_memmove(&to->list.line[1],
4188 			&from->list.line[1],
4189 			sizeof(struct linepos) * (from->in_use - 1));
4190     }
4191 }
4192 
4193 /*
4194  * Like copy_sub() but only do the end of the main match if \ze is present.
4195  */
4196     static void
copy_ze_off(regsub_T * to,regsub_T * from)4197 copy_ze_off(regsub_T *to, regsub_T *from)
4198 {
4199     if (rex.nfa_has_zend)
4200     {
4201 	if (REG_MULTI)
4202 	{
4203 	    if (from->list.multi[0].end_lnum >= 0)
4204 	    {
4205 		to->list.multi[0].end_lnum = from->list.multi[0].end_lnum;
4206 		to->list.multi[0].end_col = from->list.multi[0].end_col;
4207 	    }
4208 	}
4209 	else
4210 	{
4211 	    if (from->list.line[0].end != NULL)
4212 		to->list.line[0].end = from->list.line[0].end;
4213 	}
4214     }
4215 }
4216 
4217 /*
4218  * Return TRUE if "sub1" and "sub2" have the same start positions.
4219  * When using back-references also check the end position.
4220  */
4221     static int
sub_equal(regsub_T * sub1,regsub_T * sub2)4222 sub_equal(regsub_T *sub1, regsub_T *sub2)
4223 {
4224     int		i;
4225     int		todo;
4226     linenr_T	s1;
4227     linenr_T	s2;
4228     char_u	*sp1;
4229     char_u	*sp2;
4230 
4231     todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
4232     if (REG_MULTI)
4233     {
4234 	for (i = 0; i < todo; ++i)
4235 	{
4236 	    if (i < sub1->in_use)
4237 		s1 = sub1->list.multi[i].start_lnum;
4238 	    else
4239 		s1 = -1;
4240 	    if (i < sub2->in_use)
4241 		s2 = sub2->list.multi[i].start_lnum;
4242 	    else
4243 		s2 = -1;
4244 	    if (s1 != s2)
4245 		return FALSE;
4246 	    if (s1 != -1 && sub1->list.multi[i].start_col
4247 					     != sub2->list.multi[i].start_col)
4248 		return FALSE;
4249 
4250 	    if (rex.nfa_has_backref)
4251 	    {
4252 		if (i < sub1->in_use)
4253 		    s1 = sub1->list.multi[i].end_lnum;
4254 		else
4255 		    s1 = -1;
4256 		if (i < sub2->in_use)
4257 		    s2 = sub2->list.multi[i].end_lnum;
4258 		else
4259 		    s2 = -1;
4260 		if (s1 != s2)
4261 		    return FALSE;
4262 		if (s1 != -1 && sub1->list.multi[i].end_col
4263 					       != sub2->list.multi[i].end_col)
4264 		return FALSE;
4265 	    }
4266 	}
4267     }
4268     else
4269     {
4270 	for (i = 0; i < todo; ++i)
4271 	{
4272 	    if (i < sub1->in_use)
4273 		sp1 = sub1->list.line[i].start;
4274 	    else
4275 		sp1 = NULL;
4276 	    if (i < sub2->in_use)
4277 		sp2 = sub2->list.line[i].start;
4278 	    else
4279 		sp2 = NULL;
4280 	    if (sp1 != sp2)
4281 		return FALSE;
4282 	    if (rex.nfa_has_backref)
4283 	    {
4284 		if (i < sub1->in_use)
4285 		    sp1 = sub1->list.line[i].end;
4286 		else
4287 		    sp1 = NULL;
4288 		if (i < sub2->in_use)
4289 		    sp2 = sub2->list.line[i].end;
4290 		else
4291 		    sp2 = NULL;
4292 		if (sp1 != sp2)
4293 		    return FALSE;
4294 	    }
4295 	}
4296     }
4297 
4298     return TRUE;
4299 }
4300 
4301 #ifdef ENABLE_LOG
4302     static void
report_state(char * action,regsub_T * sub,nfa_state_T * state,int lid,nfa_pim_T * pim)4303 report_state(char *action,
4304 	     regsub_T *sub,
4305 	     nfa_state_T *state,
4306 	     int lid,
4307 	     nfa_pim_T *pim)
4308 {
4309     int col;
4310 
4311     if (sub->in_use <= 0)
4312 	col = -1;
4313     else if (REG_MULTI)
4314 	col = sub->list.multi[0].start_col;
4315     else
4316 	col = (int)(sub->list.line[0].start - rex.line);
4317     nfa_set_code(state->c);
4318     fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n",
4319 	    action, abs(state->id), lid, state->c, code, col,
4320 	    pim_info(pim));
4321 }
4322 #endif
4323 
4324 /*
4325  * Return TRUE if the same state is already in list "l" with the same
4326  * positions as "subs".
4327  */
4328     static int
has_state_with_pos(nfa_list_T * l,nfa_state_T * state,regsubs_T * subs,nfa_pim_T * pim)4329 has_state_with_pos(
4330     nfa_list_T		*l,	// runtime state list
4331     nfa_state_T		*state,	// state to update
4332     regsubs_T		*subs,	// pointers to subexpressions
4333     nfa_pim_T		*pim)	// postponed match or NULL
4334 {
4335     nfa_thread_T	*thread;
4336     int			i;
4337 
4338     for (i = 0; i < l->n; ++i)
4339     {
4340 	thread = &l->t[i];
4341 	if (thread->state->id == state->id
4342 		&& sub_equal(&thread->subs.norm, &subs->norm)
4343 #ifdef FEAT_SYN_HL
4344 		&& (!rex.nfa_has_zsubexpr
4345 				|| sub_equal(&thread->subs.synt, &subs->synt))
4346 #endif
4347 		&& pim_equal(&thread->pim, pim))
4348 	    return TRUE;
4349     }
4350     return FALSE;
4351 }
4352 
4353 /*
4354  * Return TRUE if "one" and "two" are equal.  That includes when both are not
4355  * set.
4356  */
4357     static int
pim_equal(nfa_pim_T * one,nfa_pim_T * two)4358 pim_equal(nfa_pim_T *one, nfa_pim_T *two)
4359 {
4360     int one_unused = (one == NULL || one->result == NFA_PIM_UNUSED);
4361     int two_unused = (two == NULL || two->result == NFA_PIM_UNUSED);
4362 
4363     if (one_unused)
4364 	// one is unused: equal when two is also unused
4365 	return two_unused;
4366     if (two_unused)
4367 	// one is used and two is not: not equal
4368 	return FALSE;
4369     // compare the state id
4370     if (one->state->id != two->state->id)
4371 	return FALSE;
4372     // compare the position
4373     if (REG_MULTI)
4374 	return one->end.pos.lnum == two->end.pos.lnum
4375 	    && one->end.pos.col == two->end.pos.col;
4376     return one->end.ptr == two->end.ptr;
4377 }
4378 
4379 /*
4380  * Return TRUE if "state" leads to a NFA_MATCH without advancing the input.
4381  */
4382     static int
match_follows(nfa_state_T * startstate,int depth)4383 match_follows(nfa_state_T *startstate, int depth)
4384 {
4385     nfa_state_T	    *state = startstate;
4386 
4387     // avoid too much recursion
4388     if (depth > 10)
4389 	return FALSE;
4390 
4391     while (state != NULL)
4392     {
4393 	switch (state->c)
4394 	{
4395 	    case NFA_MATCH:
4396 	    case NFA_MCLOSE:
4397 	    case NFA_END_INVISIBLE:
4398 	    case NFA_END_INVISIBLE_NEG:
4399 	    case NFA_END_PATTERN:
4400 		return TRUE;
4401 
4402 	    case NFA_SPLIT:
4403 		return match_follows(state->out, depth + 1)
4404 				     || match_follows(state->out1, depth + 1);
4405 
4406 	    case NFA_START_INVISIBLE:
4407 	    case NFA_START_INVISIBLE_FIRST:
4408 	    case NFA_START_INVISIBLE_BEFORE:
4409 	    case NFA_START_INVISIBLE_BEFORE_FIRST:
4410 	    case NFA_START_INVISIBLE_NEG:
4411 	    case NFA_START_INVISIBLE_NEG_FIRST:
4412 	    case NFA_START_INVISIBLE_BEFORE_NEG:
4413 	    case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
4414 	    case NFA_COMPOSING:
4415 		// skip ahead to next state
4416 		state = state->out1->out;
4417 		continue;
4418 
4419 	    case NFA_ANY:
4420 	    case NFA_ANY_COMPOSING:
4421 	    case NFA_IDENT:
4422 	    case NFA_SIDENT:
4423 	    case NFA_KWORD:
4424 	    case NFA_SKWORD:
4425 	    case NFA_FNAME:
4426 	    case NFA_SFNAME:
4427 	    case NFA_PRINT:
4428 	    case NFA_SPRINT:
4429 	    case NFA_WHITE:
4430 	    case NFA_NWHITE:
4431 	    case NFA_DIGIT:
4432 	    case NFA_NDIGIT:
4433 	    case NFA_HEX:
4434 	    case NFA_NHEX:
4435 	    case NFA_OCTAL:
4436 	    case NFA_NOCTAL:
4437 	    case NFA_WORD:
4438 	    case NFA_NWORD:
4439 	    case NFA_HEAD:
4440 	    case NFA_NHEAD:
4441 	    case NFA_ALPHA:
4442 	    case NFA_NALPHA:
4443 	    case NFA_LOWER:
4444 	    case NFA_NLOWER:
4445 	    case NFA_UPPER:
4446 	    case NFA_NUPPER:
4447 	    case NFA_LOWER_IC:
4448 	    case NFA_NLOWER_IC:
4449 	    case NFA_UPPER_IC:
4450 	    case NFA_NUPPER_IC:
4451 	    case NFA_START_COLL:
4452 	    case NFA_START_NEG_COLL:
4453 	    case NFA_NEWL:
4454 		// state will advance input
4455 		return FALSE;
4456 
4457 	    default:
4458 		if (state->c > 0)
4459 		    // state will advance input
4460 		    return FALSE;
4461 
4462 		// Others: zero-width or possibly zero-width, might still find
4463 		// a match at the same position, keep looking.
4464 		break;
4465 	}
4466 	state = state->out;
4467     }
4468     return FALSE;
4469 }
4470 
4471 
4472 /*
4473  * Return TRUE if "state" is already in list "l".
4474  */
4475     static int
state_in_list(nfa_list_T * l,nfa_state_T * state,regsubs_T * subs)4476 state_in_list(
4477     nfa_list_T		*l,	// runtime state list
4478     nfa_state_T		*state,	// state to update
4479     regsubs_T		*subs)	// pointers to subexpressions
4480 {
4481     if (state->lastlist[nfa_ll_index] == l->id)
4482     {
4483 	if (!rex.nfa_has_backref || has_state_with_pos(l, state, subs, NULL))
4484 	    return TRUE;
4485     }
4486     return FALSE;
4487 }
4488 
4489 // Offset used for "off" by addstate_here().
4490 #define ADDSTATE_HERE_OFFSET 10
4491 
4492 /*
4493  * Add "state" and possibly what follows to state list ".".
4494  * Returns "subs_arg", possibly copied into temp_subs.
4495  * Returns NULL when recursiveness is too deep.
4496  */
4497     static regsubs_T *
addstate(nfa_list_T * l,nfa_state_T * state,regsubs_T * subs_arg,nfa_pim_T * pim,int off_arg)4498 addstate(
4499     nfa_list_T		*l,	    // runtime state list
4500     nfa_state_T		*state,	    // state to update
4501     regsubs_T		*subs_arg,  // pointers to subexpressions
4502     nfa_pim_T		*pim,	    // postponed look-behind match
4503     int			off_arg)    // byte offset, when -1 go to next line
4504 {
4505     int			subidx;
4506     int			off = off_arg;
4507     int			add_here = FALSE;
4508     int			listindex = 0;
4509     int			k;
4510     int			found = FALSE;
4511     nfa_thread_T	*thread;
4512     struct multipos	save_multipos;
4513     int			save_in_use;
4514     char_u		*save_ptr;
4515     int			i;
4516     regsub_T		*sub;
4517     regsubs_T		*subs = subs_arg;
4518     static regsubs_T	temp_subs;
4519 #ifdef ENABLE_LOG
4520     int			did_print = FALSE;
4521 #endif
4522     static int		depth = 0;
4523 
4524     // This function is called recursively.  When the depth is too much we run
4525     // out of stack and crash, limit recursiveness here.
4526     if (++depth >= 5000 || subs == NULL)
4527     {
4528 	--depth;
4529 	return NULL;
4530     }
4531 
4532     if (off_arg <= -ADDSTATE_HERE_OFFSET)
4533     {
4534 	add_here = TRUE;
4535 	off = 0;
4536 	listindex = -(off_arg + ADDSTATE_HERE_OFFSET);
4537     }
4538 
4539     switch (state->c)
4540     {
4541 	case NFA_NCLOSE:
4542 	case NFA_MCLOSE:
4543 	case NFA_MCLOSE1:
4544 	case NFA_MCLOSE2:
4545 	case NFA_MCLOSE3:
4546 	case NFA_MCLOSE4:
4547 	case NFA_MCLOSE5:
4548 	case NFA_MCLOSE6:
4549 	case NFA_MCLOSE7:
4550 	case NFA_MCLOSE8:
4551 	case NFA_MCLOSE9:
4552 #ifdef FEAT_SYN_HL
4553 	case NFA_ZCLOSE:
4554 	case NFA_ZCLOSE1:
4555 	case NFA_ZCLOSE2:
4556 	case NFA_ZCLOSE3:
4557 	case NFA_ZCLOSE4:
4558 	case NFA_ZCLOSE5:
4559 	case NFA_ZCLOSE6:
4560 	case NFA_ZCLOSE7:
4561 	case NFA_ZCLOSE8:
4562 	case NFA_ZCLOSE9:
4563 #endif
4564 	case NFA_MOPEN:
4565 	case NFA_ZEND:
4566 	case NFA_SPLIT:
4567 	case NFA_EMPTY:
4568 	    // These nodes are not added themselves but their "out" and/or
4569 	    // "out1" may be added below.
4570 	    break;
4571 
4572 	case NFA_BOL:
4573 	case NFA_BOF:
4574 	    // "^" won't match past end-of-line, don't bother trying.
4575 	    // Except when at the end of the line, or when we are going to the
4576 	    // next line for a look-behind match.
4577 	    if (rex.input > rex.line
4578 		    && *rex.input != NUL
4579 		    && (nfa_endp == NULL
4580 			|| !REG_MULTI
4581 			|| rex.lnum == nfa_endp->se_u.pos.lnum))
4582 		goto skip_add;
4583 	    // FALLTHROUGH
4584 
4585 	case NFA_MOPEN1:
4586 	case NFA_MOPEN2:
4587 	case NFA_MOPEN3:
4588 	case NFA_MOPEN4:
4589 	case NFA_MOPEN5:
4590 	case NFA_MOPEN6:
4591 	case NFA_MOPEN7:
4592 	case NFA_MOPEN8:
4593 	case NFA_MOPEN9:
4594 #ifdef FEAT_SYN_HL
4595 	case NFA_ZOPEN:
4596 	case NFA_ZOPEN1:
4597 	case NFA_ZOPEN2:
4598 	case NFA_ZOPEN3:
4599 	case NFA_ZOPEN4:
4600 	case NFA_ZOPEN5:
4601 	case NFA_ZOPEN6:
4602 	case NFA_ZOPEN7:
4603 	case NFA_ZOPEN8:
4604 	case NFA_ZOPEN9:
4605 #endif
4606 	case NFA_NOPEN:
4607 	case NFA_ZSTART:
4608 	    // These nodes need to be added so that we can bail out when it
4609 	    // was added to this list before at the same position to avoid an
4610 	    // endless loop for "\(\)*"
4611 
4612 	default:
4613 	    if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP)
4614 	    {
4615 		// This state is already in the list, don't add it again,
4616 		// unless it is an MOPEN that is used for a backreference or
4617 		// when there is a PIM. For NFA_MATCH check the position,
4618 		// lower position is preferred.
4619 		if (!rex.nfa_has_backref && pim == NULL && !l->has_pim
4620 						     && state->c != NFA_MATCH)
4621 		{
4622 		    // When called from addstate_here() do insert before
4623 		    // existing states.
4624 		    if (add_here)
4625 		    {
4626 			for (k = 0; k < l->n && k < listindex; ++k)
4627 			    if (l->t[k].state->id == state->id)
4628 			    {
4629 				found = TRUE;
4630 				break;
4631 			    }
4632 		    }
4633 		    if (!add_here || found)
4634 		    {
4635 skip_add:
4636 #ifdef ENABLE_LOG
4637 			nfa_set_code(state->c);
4638 			fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n",
4639 			    abs(state->id), l->id, state->c, code,
4640 			    pim == NULL ? "NULL" : "yes", l->has_pim, found);
4641 #endif
4642 			--depth;
4643 			return subs;
4644 		    }
4645 		}
4646 
4647 		// Do not add the state again when it exists with the same
4648 		// positions.
4649 		if (has_state_with_pos(l, state, subs, pim))
4650 		    goto skip_add;
4651 	    }
4652 
4653 	    // When there are backreferences or PIMs the number of states may
4654 	    // be (a lot) bigger than anticipated.
4655 	    if (l->n == l->len)
4656 	    {
4657 		int		newlen = l->len * 3 / 2 + 50;
4658 		size_t		newsize = newlen * sizeof(nfa_thread_T);
4659 		nfa_thread_T	*newt;
4660 
4661 		if ((long)(newsize >> 10) >= p_mmp)
4662 		{
4663 		    emsg(_(e_maxmempat));
4664 		    --depth;
4665 		    return NULL;
4666 		}
4667 		if (subs != &temp_subs)
4668 		{
4669 		    // "subs" may point into the current array, need to make a
4670 		    // copy before it becomes invalid.
4671 		    copy_sub(&temp_subs.norm, &subs->norm);
4672 #ifdef FEAT_SYN_HL
4673 		    if (rex.nfa_has_zsubexpr)
4674 			copy_sub(&temp_subs.synt, &subs->synt);
4675 #endif
4676 		    subs = &temp_subs;
4677 		}
4678 
4679 		newt = vim_realloc(l->t, newsize);
4680 		if (newt == NULL)
4681 		{
4682 		    // out of memory
4683 		    --depth;
4684 		    return NULL;
4685 		}
4686 		l->t = newt;
4687 		l->len = newlen;
4688 	    }
4689 
4690 	    // add the state to the list
4691 	    state->lastlist[nfa_ll_index] = l->id;
4692 	    thread = &l->t[l->n++];
4693 	    thread->state = state;
4694 	    if (pim == NULL)
4695 		thread->pim.result = NFA_PIM_UNUSED;
4696 	    else
4697 	    {
4698 		copy_pim(&thread->pim, pim);
4699 		l->has_pim = TRUE;
4700 	    }
4701 	    copy_sub(&thread->subs.norm, &subs->norm);
4702 #ifdef FEAT_SYN_HL
4703 	    if (rex.nfa_has_zsubexpr)
4704 		copy_sub(&thread->subs.synt, &subs->synt);
4705 #endif
4706 #ifdef ENABLE_LOG
4707 	    report_state("Adding", &thread->subs.norm, state, l->id, pim);
4708 	    did_print = TRUE;
4709 #endif
4710     }
4711 
4712 #ifdef ENABLE_LOG
4713     if (!did_print)
4714 	report_state("Processing", &subs->norm, state, l->id, pim);
4715 #endif
4716     switch (state->c)
4717     {
4718 	case NFA_MATCH:
4719 	    break;
4720 
4721 	case NFA_SPLIT:
4722 	    // order matters here
4723 	    subs = addstate(l, state->out, subs, pim, off_arg);
4724 	    subs = addstate(l, state->out1, subs, pim, off_arg);
4725 	    break;
4726 
4727 	case NFA_EMPTY:
4728 	case NFA_NOPEN:
4729 	case NFA_NCLOSE:
4730 	    subs = addstate(l, state->out, subs, pim, off_arg);
4731 	    break;
4732 
4733 	case NFA_MOPEN:
4734 	case NFA_MOPEN1:
4735 	case NFA_MOPEN2:
4736 	case NFA_MOPEN3:
4737 	case NFA_MOPEN4:
4738 	case NFA_MOPEN5:
4739 	case NFA_MOPEN6:
4740 	case NFA_MOPEN7:
4741 	case NFA_MOPEN8:
4742 	case NFA_MOPEN9:
4743 #ifdef FEAT_SYN_HL
4744 	case NFA_ZOPEN:
4745 	case NFA_ZOPEN1:
4746 	case NFA_ZOPEN2:
4747 	case NFA_ZOPEN3:
4748 	case NFA_ZOPEN4:
4749 	case NFA_ZOPEN5:
4750 	case NFA_ZOPEN6:
4751 	case NFA_ZOPEN7:
4752 	case NFA_ZOPEN8:
4753 	case NFA_ZOPEN9:
4754 #endif
4755 	case NFA_ZSTART:
4756 	    if (state->c == NFA_ZSTART)
4757 	    {
4758 		subidx = 0;
4759 		sub = &subs->norm;
4760 	    }
4761 #ifdef FEAT_SYN_HL
4762 	    else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
4763 	    {
4764 		subidx = state->c - NFA_ZOPEN;
4765 		sub = &subs->synt;
4766 	    }
4767 #endif
4768 	    else
4769 	    {
4770 		subidx = state->c - NFA_MOPEN;
4771 		sub = &subs->norm;
4772 	    }
4773 
4774 	    // avoid compiler warnings
4775 	    save_ptr = NULL;
4776 	    CLEAR_FIELD(save_multipos);
4777 
4778 	    // Set the position (with "off" added) in the subexpression.  Save
4779 	    // and restore it when it was in use.  Otherwise fill any gap.
4780 	    if (REG_MULTI)
4781 	    {
4782 		if (subidx < sub->in_use)
4783 		{
4784 		    save_multipos = sub->list.multi[subidx];
4785 		    save_in_use = -1;
4786 		}
4787 		else
4788 		{
4789 		    save_in_use = sub->in_use;
4790 		    for (i = sub->in_use; i < subidx; ++i)
4791 		    {
4792 			sub->list.multi[i].start_lnum = -1;
4793 			sub->list.multi[i].end_lnum = -1;
4794 		    }
4795 		    sub->in_use = subidx + 1;
4796 		}
4797 		if (off == -1)
4798 		{
4799 		    sub->list.multi[subidx].start_lnum = rex.lnum + 1;
4800 		    sub->list.multi[subidx].start_col = 0;
4801 		}
4802 		else
4803 		{
4804 		    sub->list.multi[subidx].start_lnum = rex.lnum;
4805 		    sub->list.multi[subidx].start_col =
4806 					  (colnr_T)(rex.input - rex.line + off);
4807 		}
4808 		sub->list.multi[subidx].end_lnum = -1;
4809 	    }
4810 	    else
4811 	    {
4812 		if (subidx < sub->in_use)
4813 		{
4814 		    save_ptr = sub->list.line[subidx].start;
4815 		    save_in_use = -1;
4816 		}
4817 		else
4818 		{
4819 		    save_in_use = sub->in_use;
4820 		    for (i = sub->in_use; i < subidx; ++i)
4821 		    {
4822 			sub->list.line[i].start = NULL;
4823 			sub->list.line[i].end = NULL;
4824 		    }
4825 		    sub->in_use = subidx + 1;
4826 		}
4827 		sub->list.line[subidx].start = rex.input + off;
4828 	    }
4829 
4830 	    subs = addstate(l, state->out, subs, pim, off_arg);
4831 	    if (subs == NULL)
4832 		break;
4833 	    // "subs" may have changed, need to set "sub" again
4834 #ifdef FEAT_SYN_HL
4835 	    if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
4836 		sub = &subs->synt;
4837 	    else
4838 #endif
4839 		sub = &subs->norm;
4840 
4841 	    if (save_in_use == -1)
4842 	    {
4843 		if (REG_MULTI)
4844 		    sub->list.multi[subidx] = save_multipos;
4845 		else
4846 		    sub->list.line[subidx].start = save_ptr;
4847 	    }
4848 	    else
4849 		sub->in_use = save_in_use;
4850 	    break;
4851 
4852 	case NFA_MCLOSE:
4853 	    if (rex.nfa_has_zend && (REG_MULTI
4854 			? subs->norm.list.multi[0].end_lnum >= 0
4855 			: subs->norm.list.line[0].end != NULL))
4856 	    {
4857 		// Do not overwrite the position set by \ze.
4858 		subs = addstate(l, state->out, subs, pim, off_arg);
4859 		break;
4860 	    }
4861 	    // FALLTHROUGH
4862 	case NFA_MCLOSE1:
4863 	case NFA_MCLOSE2:
4864 	case NFA_MCLOSE3:
4865 	case NFA_MCLOSE4:
4866 	case NFA_MCLOSE5:
4867 	case NFA_MCLOSE6:
4868 	case NFA_MCLOSE7:
4869 	case NFA_MCLOSE8:
4870 	case NFA_MCLOSE9:
4871 #ifdef FEAT_SYN_HL
4872 	case NFA_ZCLOSE:
4873 	case NFA_ZCLOSE1:
4874 	case NFA_ZCLOSE2:
4875 	case NFA_ZCLOSE3:
4876 	case NFA_ZCLOSE4:
4877 	case NFA_ZCLOSE5:
4878 	case NFA_ZCLOSE6:
4879 	case NFA_ZCLOSE7:
4880 	case NFA_ZCLOSE8:
4881 	case NFA_ZCLOSE9:
4882 #endif
4883 	case NFA_ZEND:
4884 	    if (state->c == NFA_ZEND)
4885 	    {
4886 		subidx = 0;
4887 		sub = &subs->norm;
4888 	    }
4889 #ifdef FEAT_SYN_HL
4890 	    else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
4891 	    {
4892 		subidx = state->c - NFA_ZCLOSE;
4893 		sub = &subs->synt;
4894 	    }
4895 #endif
4896 	    else
4897 	    {
4898 		subidx = state->c - NFA_MCLOSE;
4899 		sub = &subs->norm;
4900 	    }
4901 
4902 	    // We don't fill in gaps here, there must have been an MOPEN that
4903 	    // has done that.
4904 	    save_in_use = sub->in_use;
4905 	    if (sub->in_use <= subidx)
4906 		sub->in_use = subidx + 1;
4907 	    if (REG_MULTI)
4908 	    {
4909 		save_multipos = sub->list.multi[subidx];
4910 		if (off == -1)
4911 		{
4912 		    sub->list.multi[subidx].end_lnum = rex.lnum + 1;
4913 		    sub->list.multi[subidx].end_col = 0;
4914 		}
4915 		else
4916 		{
4917 		    sub->list.multi[subidx].end_lnum = rex.lnum;
4918 		    sub->list.multi[subidx].end_col =
4919 					  (colnr_T)(rex.input - rex.line + off);
4920 		}
4921 		// avoid compiler warnings
4922 		save_ptr = NULL;
4923 	    }
4924 	    else
4925 	    {
4926 		save_ptr = sub->list.line[subidx].end;
4927 		sub->list.line[subidx].end = rex.input + off;
4928 		// avoid compiler warnings
4929 		CLEAR_FIELD(save_multipos);
4930 	    }
4931 
4932 	    subs = addstate(l, state->out, subs, pim, off_arg);
4933 	    if (subs == NULL)
4934 		break;
4935 	    // "subs" may have changed, need to set "sub" again
4936 #ifdef FEAT_SYN_HL
4937 	    if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
4938 		sub = &subs->synt;
4939 	    else
4940 #endif
4941 		sub = &subs->norm;
4942 
4943 	    if (REG_MULTI)
4944 		sub->list.multi[subidx] = save_multipos;
4945 	    else
4946 		sub->list.line[subidx].end = save_ptr;
4947 	    sub->in_use = save_in_use;
4948 	    break;
4949     }
4950     --depth;
4951     return subs;
4952 }
4953 
4954 /*
4955  * Like addstate(), but the new state(s) are put at position "*ip".
4956  * Used for zero-width matches, next state to use is the added one.
4957  * This makes sure the order of states to be tried does not change, which
4958  * matters for alternatives.
4959  */
4960     static regsubs_T *
addstate_here(nfa_list_T * l,nfa_state_T * state,regsubs_T * subs,nfa_pim_T * pim,int * ip)4961 addstate_here(
4962     nfa_list_T		*l,	// runtime state list
4963     nfa_state_T		*state,	// state to update
4964     regsubs_T		*subs,	// pointers to subexpressions
4965     nfa_pim_T		*pim,   // postponed look-behind match
4966     int			*ip)
4967 {
4968     int tlen = l->n;
4969     int count;
4970     int listidx = *ip;
4971     regsubs_T *r;
4972 
4973     // First add the state(s) at the end, so that we know how many there are.
4974     // Pass the listidx as offset (avoids adding another argument to
4975     // addstate().
4976     r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET);
4977     if (r == NULL)
4978 	return NULL;
4979 
4980     // when "*ip" was at the end of the list, nothing to do
4981     if (listidx + 1 == tlen)
4982 	return r;
4983 
4984     // re-order to put the new state at the current position
4985     count = l->n - tlen;
4986     if (count == 0)
4987 	return r; // no state got added
4988     if (count == 1)
4989     {
4990 	// overwrite the current state
4991 	l->t[listidx] = l->t[l->n - 1];
4992     }
4993     else if (count > 1)
4994     {
4995 	if (l->n + count - 1 >= l->len)
4996 	{
4997 	    // not enough space to move the new states, reallocate the list
4998 	    // and move the states to the right position
4999 	    int		    newlen = l->len * 3 / 2 + 50;
5000 	    size_t	    newsize = newlen * sizeof(nfa_thread_T);
5001 	    nfa_thread_T    *newl;
5002 
5003 	    if ((long)(newsize >> 10) >= p_mmp)
5004 	    {
5005 		emsg(_(e_maxmempat));
5006 		return NULL;
5007 	    }
5008 	    newl = alloc(newsize);
5009 	    if (newl == NULL)
5010 		return NULL;
5011 	    l->len = newlen;
5012 	    mch_memmove(&(newl[0]),
5013 		    &(l->t[0]),
5014 		    sizeof(nfa_thread_T) * listidx);
5015 	    mch_memmove(&(newl[listidx]),
5016 		    &(l->t[l->n - count]),
5017 		    sizeof(nfa_thread_T) * count);
5018 	    mch_memmove(&(newl[listidx + count]),
5019 		    &(l->t[listidx + 1]),
5020 		    sizeof(nfa_thread_T) * (l->n - count - listidx - 1));
5021 	    vim_free(l->t);
5022 	    l->t = newl;
5023 	}
5024 	else
5025 	{
5026 	    // make space for new states, then move them from the
5027 	    // end to the current position
5028 	    mch_memmove(&(l->t[listidx + count]),
5029 		    &(l->t[listidx + 1]),
5030 		    sizeof(nfa_thread_T) * (l->n - listidx - 1));
5031 	    mch_memmove(&(l->t[listidx]),
5032 		    &(l->t[l->n - 1]),
5033 		    sizeof(nfa_thread_T) * count);
5034 	}
5035     }
5036     --l->n;
5037     *ip = listidx - 1;
5038 
5039     return r;
5040 }
5041 
5042 /*
5043  * Check character class "class" against current character c.
5044  */
5045     static int
check_char_class(int class,int c)5046 check_char_class(int class, int c)
5047 {
5048     switch (class)
5049     {
5050 	case NFA_CLASS_ALNUM:
5051 	    if (c >= 1 && c < 128 && isalnum(c))
5052 		return OK;
5053 	    break;
5054 	case NFA_CLASS_ALPHA:
5055 	    if (c >= 1 && c < 128 && isalpha(c))
5056 		return OK;
5057 	    break;
5058 	case NFA_CLASS_BLANK:
5059 	    if (c == ' ' || c == '\t')
5060 		return OK;
5061 	    break;
5062 	case NFA_CLASS_CNTRL:
5063 	    if (c >= 1 && c <= 127 && iscntrl(c))
5064 		return OK;
5065 	    break;
5066 	case NFA_CLASS_DIGIT:
5067 	    if (VIM_ISDIGIT(c))
5068 		return OK;
5069 	    break;
5070 	case NFA_CLASS_GRAPH:
5071 	    if (c >= 1 && c <= 127 && isgraph(c))
5072 		return OK;
5073 	    break;
5074 	case NFA_CLASS_LOWER:
5075 	    if (MB_ISLOWER(c) && c != 170 && c != 186)
5076 		return OK;
5077 	    break;
5078 	case NFA_CLASS_PRINT:
5079 	    if (vim_isprintc(c))
5080 		return OK;
5081 	    break;
5082 	case NFA_CLASS_PUNCT:
5083 	    if (c >= 1 && c < 128 && ispunct(c))
5084 		return OK;
5085 	    break;
5086 	case NFA_CLASS_SPACE:
5087 	    if ((c >= 9 && c <= 13) || (c == ' '))
5088 		return OK;
5089 	    break;
5090 	case NFA_CLASS_UPPER:
5091 	    if (MB_ISUPPER(c))
5092 		return OK;
5093 	    break;
5094 	case NFA_CLASS_XDIGIT:
5095 	    if (vim_isxdigit(c))
5096 		return OK;
5097 	    break;
5098 	case NFA_CLASS_TAB:
5099 	    if (c == '\t')
5100 		return OK;
5101 	    break;
5102 	case NFA_CLASS_RETURN:
5103 	    if (c == '\r')
5104 		return OK;
5105 	    break;
5106 	case NFA_CLASS_BACKSPACE:
5107 	    if (c == '\b')
5108 		return OK;
5109 	    break;
5110 	case NFA_CLASS_ESCAPE:
5111 	    if (c == '\033')
5112 		return OK;
5113 	    break;
5114 	case NFA_CLASS_IDENT:
5115 	    if (vim_isIDc(c))
5116 		return OK;
5117 	    break;
5118 	case NFA_CLASS_KEYWORD:
5119 	    if (reg_iswordc(c))
5120 		return OK;
5121 	    break;
5122 	case NFA_CLASS_FNAME:
5123 	    if (vim_isfilec(c))
5124 		return OK;
5125 	    break;
5126 
5127 	default:
5128 	    // should not be here :P
5129 	    siemsg(_(e_ill_char_class), class);
5130 	    return FAIL;
5131     }
5132     return FAIL;
5133 }
5134 
5135 /*
5136  * Check for a match with subexpression "subidx".
5137  * Return TRUE if it matches.
5138  */
5139     static int
match_backref(regsub_T * sub,int subidx,int * bytelen)5140 match_backref(
5141     regsub_T	*sub,	    // pointers to subexpressions
5142     int		subidx,
5143     int		*bytelen)   // out: length of match in bytes
5144 {
5145     int		len;
5146 
5147     if (sub->in_use <= subidx)
5148     {
5149 retempty:
5150 	// backref was not set, match an empty string
5151 	*bytelen = 0;
5152 	return TRUE;
5153     }
5154 
5155     if (REG_MULTI)
5156     {
5157 	if (sub->list.multi[subidx].start_lnum < 0
5158 				       || sub->list.multi[subidx].end_lnum < 0)
5159 	    goto retempty;
5160 	if (sub->list.multi[subidx].start_lnum == rex.lnum
5161 			       && sub->list.multi[subidx].end_lnum == rex.lnum)
5162 	{
5163 	    len = sub->list.multi[subidx].end_col
5164 					  - sub->list.multi[subidx].start_col;
5165 	    if (cstrncmp(rex.line + sub->list.multi[subidx].start_col,
5166 							 rex.input, &len) == 0)
5167 	    {
5168 		*bytelen = len;
5169 		return TRUE;
5170 	    }
5171 	}
5172 	else
5173 	{
5174 	    if (match_with_backref(
5175 			sub->list.multi[subidx].start_lnum,
5176 			sub->list.multi[subidx].start_col,
5177 			sub->list.multi[subidx].end_lnum,
5178 			sub->list.multi[subidx].end_col,
5179 			bytelen) == RA_MATCH)
5180 		return TRUE;
5181 	}
5182     }
5183     else
5184     {
5185 	if (sub->list.line[subidx].start == NULL
5186 					|| sub->list.line[subidx].end == NULL)
5187 	    goto retempty;
5188 	len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
5189 	if (cstrncmp(sub->list.line[subidx].start, rex.input, &len) == 0)
5190 	{
5191 	    *bytelen = len;
5192 	    return TRUE;
5193 	}
5194     }
5195     return FALSE;
5196 }
5197 
5198 #ifdef FEAT_SYN_HL
5199 
5200 /*
5201  * Check for a match with \z subexpression "subidx".
5202  * Return TRUE if it matches.
5203  */
5204     static int
match_zref(int subidx,int * bytelen)5205 match_zref(
5206     int		subidx,
5207     int		*bytelen)   // out: length of match in bytes
5208 {
5209     int		len;
5210 
5211     cleanup_zsubexpr();
5212     if (re_extmatch_in == NULL || re_extmatch_in->matches[subidx] == NULL)
5213     {
5214 	// backref was not set, match an empty string
5215 	*bytelen = 0;
5216 	return TRUE;
5217     }
5218 
5219     len = (int)STRLEN(re_extmatch_in->matches[subidx]);
5220     if (cstrncmp(re_extmatch_in->matches[subidx], rex.input, &len) == 0)
5221     {
5222 	*bytelen = len;
5223 	return TRUE;
5224     }
5225     return FALSE;
5226 }
5227 #endif
5228 
5229 /*
5230  * Save list IDs for all NFA states of "prog" into "list".
5231  * Also reset the IDs to zero.
5232  * Only used for the recursive value lastlist[1].
5233  */
5234     static void
nfa_save_listids(nfa_regprog_T * prog,int * list)5235 nfa_save_listids(nfa_regprog_T *prog, int *list)
5236 {
5237     int		    i;
5238     nfa_state_T	    *p;
5239 
5240     // Order in the list is reverse, it's a bit faster that way.
5241     p = &prog->state[0];
5242     for (i = prog->nstate; --i >= 0; )
5243     {
5244 	list[i] = p->lastlist[1];
5245 	p->lastlist[1] = 0;
5246 	++p;
5247     }
5248 }
5249 
5250 /*
5251  * Restore list IDs from "list" to all NFA states.
5252  */
5253     static void
nfa_restore_listids(nfa_regprog_T * prog,int * list)5254 nfa_restore_listids(nfa_regprog_T *prog, int *list)
5255 {
5256     int		    i;
5257     nfa_state_T	    *p;
5258 
5259     p = &prog->state[0];
5260     for (i = prog->nstate; --i >= 0; )
5261     {
5262 	p->lastlist[1] = list[i];
5263 	++p;
5264     }
5265 }
5266 
5267     static int
nfa_re_num_cmp(long_u val,int op,long_u pos)5268 nfa_re_num_cmp(long_u val, int op, long_u pos)
5269 {
5270     if (op == 1) return pos > val;
5271     if (op == 2) return pos < val;
5272     return val == pos;
5273 }
5274 
5275 static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *submatch, regsubs_T *m);
5276 
5277 /*
5278  * Recursively call nfa_regmatch()
5279  * "pim" is NULL or contains info about a Postponed Invisible Match (start
5280  * position).
5281  */
5282     static int
recursive_regmatch(nfa_state_T * state,nfa_pim_T * pim,nfa_regprog_T * prog,regsubs_T * submatch,regsubs_T * m,int ** listids,int * listids_len)5283 recursive_regmatch(
5284     nfa_state_T	    *state,
5285     nfa_pim_T	    *pim,
5286     nfa_regprog_T   *prog,
5287     regsubs_T	    *submatch,
5288     regsubs_T	    *m,
5289     int		    **listids,
5290     int		    *listids_len)
5291 {
5292     int		save_reginput_col = (int)(rex.input - rex.line);
5293     int		save_reglnum = rex.lnum;
5294     int		save_nfa_match = nfa_match;
5295     int		save_nfa_listid = rex.nfa_listid;
5296     save_se_T   *save_nfa_endp = nfa_endp;
5297     save_se_T   endpos;
5298     save_se_T   *endposp = NULL;
5299     int		result;
5300     int		need_restore = FALSE;
5301 
5302     if (pim != NULL)
5303     {
5304 	// start at the position where the postponed match was
5305 	if (REG_MULTI)
5306 	    rex.input = rex.line + pim->end.pos.col;
5307 	else
5308 	    rex.input = pim->end.ptr;
5309     }
5310 
5311     if (state->c == NFA_START_INVISIBLE_BEFORE
5312 	    || state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5313 	    || state->c == NFA_START_INVISIBLE_BEFORE_NEG
5314 	    || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
5315     {
5316 	// The recursive match must end at the current position. When "pim" is
5317 	// not NULL it specifies the current position.
5318 	endposp = &endpos;
5319 	if (REG_MULTI)
5320 	{
5321 	    if (pim == NULL)
5322 	    {
5323 		endpos.se_u.pos.col = (int)(rex.input - rex.line);
5324 		endpos.se_u.pos.lnum = rex.lnum;
5325 	    }
5326 	    else
5327 		endpos.se_u.pos = pim->end.pos;
5328 	}
5329 	else
5330 	{
5331 	    if (pim == NULL)
5332 		endpos.se_u.ptr = rex.input;
5333 	    else
5334 		endpos.se_u.ptr = pim->end.ptr;
5335 	}
5336 
5337 	// Go back the specified number of bytes, or as far as the
5338 	// start of the previous line, to try matching "\@<=" or
5339 	// not matching "\@<!". This is very inefficient, limit the number of
5340 	// bytes if possible.
5341 	if (state->val <= 0)
5342 	{
5343 	    if (REG_MULTI)
5344 	    {
5345 		rex.line = reg_getline(--rex.lnum);
5346 		if (rex.line == NULL)
5347 		    // can't go before the first line
5348 		    rex.line = reg_getline(++rex.lnum);
5349 	    }
5350 	    rex.input = rex.line;
5351 	}
5352 	else
5353 	{
5354 	    if (REG_MULTI && (int)(rex.input - rex.line) < state->val)
5355 	    {
5356 		// Not enough bytes in this line, go to end of
5357 		// previous line.
5358 		rex.line = reg_getline(--rex.lnum);
5359 		if (rex.line == NULL)
5360 		{
5361 		    // can't go before the first line
5362 		    rex.line = reg_getline(++rex.lnum);
5363 		    rex.input = rex.line;
5364 		}
5365 		else
5366 		    rex.input = rex.line + STRLEN(rex.line);
5367 	    }
5368 	    if ((int)(rex.input - rex.line) >= state->val)
5369 	    {
5370 		rex.input -= state->val;
5371 		if (has_mbyte)
5372 		    rex.input -= mb_head_off(rex.line, rex.input);
5373 	    }
5374 	    else
5375 		rex.input = rex.line;
5376 	}
5377     }
5378 
5379 #ifdef ENABLE_LOG
5380     if (log_fd != stderr)
5381 	fclose(log_fd);
5382     log_fd = NULL;
5383 #endif
5384     // Have to clear the lastlist field of the NFA nodes, so that
5385     // nfa_regmatch() and addstate() can run properly after recursion.
5386     if (nfa_ll_index == 1)
5387     {
5388 	// Already calling nfa_regmatch() recursively.  Save the lastlist[1]
5389 	// values and clear them.
5390 	if (*listids == NULL || *listids_len < prog->nstate)
5391 	{
5392 	    vim_free(*listids);
5393 	    *listids = ALLOC_MULT(int, prog->nstate);
5394 	    if (*listids == NULL)
5395 	    {
5396 		emsg(_("E878: (NFA) Could not allocate memory for branch traversal!"));
5397 		return 0;
5398 	    }
5399 	    *listids_len = prog->nstate;
5400 	}
5401 	nfa_save_listids(prog, *listids);
5402 	need_restore = TRUE;
5403 	// any value of rex.nfa_listid will do
5404     }
5405     else
5406     {
5407 	// First recursive nfa_regmatch() call, switch to the second lastlist
5408 	// entry.  Make sure rex.nfa_listid is different from a previous
5409 	// recursive call, because some states may still have this ID.
5410 	++nfa_ll_index;
5411 	if (rex.nfa_listid <= rex.nfa_alt_listid)
5412 	    rex.nfa_listid = rex.nfa_alt_listid;
5413     }
5414 
5415     // Call nfa_regmatch() to check if the current concat matches at this
5416     // position. The concat ends with the node NFA_END_INVISIBLE
5417     nfa_endp = endposp;
5418     result = nfa_regmatch(prog, state->out, submatch, m);
5419 
5420     if (need_restore)
5421 	nfa_restore_listids(prog, *listids);
5422     else
5423     {
5424 	--nfa_ll_index;
5425 	rex.nfa_alt_listid = rex.nfa_listid;
5426     }
5427 
5428     // restore position in input text
5429     rex.lnum = save_reglnum;
5430     if (REG_MULTI)
5431 	rex.line = reg_getline(rex.lnum);
5432     rex.input = rex.line + save_reginput_col;
5433     if (result != NFA_TOO_EXPENSIVE)
5434     {
5435 	nfa_match = save_nfa_match;
5436 	rex.nfa_listid = save_nfa_listid;
5437     }
5438     nfa_endp = save_nfa_endp;
5439 
5440 #ifdef ENABLE_LOG
5441     log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
5442     if (log_fd != NULL)
5443     {
5444 	fprintf(log_fd, "****************************\n");
5445 	fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
5446 	fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
5447 	fprintf(log_fd, "****************************\n");
5448     }
5449     else
5450     {
5451 	emsg(_(e_log_open_failed));
5452 	log_fd = stderr;
5453     }
5454 #endif
5455 
5456     return result;
5457 }
5458 
5459 /*
5460  * Estimate the chance of a match with "state" failing.
5461  * empty match: 0
5462  * NFA_ANY: 1
5463  * specific character: 99
5464  */
5465     static int
failure_chance(nfa_state_T * state,int depth)5466 failure_chance(nfa_state_T *state, int depth)
5467 {
5468     int c = state->c;
5469     int l, r;
5470 
5471     // detect looping
5472     if (depth > 4)
5473 	return 1;
5474 
5475     switch (c)
5476     {
5477 	case NFA_SPLIT:
5478 	    if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT)
5479 		// avoid recursive stuff
5480 		return 1;
5481 	    // two alternatives, use the lowest failure chance
5482 	    l = failure_chance(state->out, depth + 1);
5483 	    r = failure_chance(state->out1, depth + 1);
5484 	    return l < r ? l : r;
5485 
5486 	case NFA_ANY:
5487 	    // matches anything, unlikely to fail
5488 	    return 1;
5489 
5490 	case NFA_MATCH:
5491 	case NFA_MCLOSE:
5492 	case NFA_ANY_COMPOSING:
5493 	    // empty match works always
5494 	    return 0;
5495 
5496 	case NFA_START_INVISIBLE:
5497 	case NFA_START_INVISIBLE_FIRST:
5498 	case NFA_START_INVISIBLE_NEG:
5499 	case NFA_START_INVISIBLE_NEG_FIRST:
5500 	case NFA_START_INVISIBLE_BEFORE:
5501 	case NFA_START_INVISIBLE_BEFORE_FIRST:
5502 	case NFA_START_INVISIBLE_BEFORE_NEG:
5503 	case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
5504 	case NFA_START_PATTERN:
5505 	    // recursive regmatch is expensive, use low failure chance
5506 	    return 5;
5507 
5508 	case NFA_BOL:
5509 	case NFA_EOL:
5510 	case NFA_BOF:
5511 	case NFA_EOF:
5512 	case NFA_NEWL:
5513 	    return 99;
5514 
5515 	case NFA_BOW:
5516 	case NFA_EOW:
5517 	    return 90;
5518 
5519 	case NFA_MOPEN:
5520 	case NFA_MOPEN1:
5521 	case NFA_MOPEN2:
5522 	case NFA_MOPEN3:
5523 	case NFA_MOPEN4:
5524 	case NFA_MOPEN5:
5525 	case NFA_MOPEN6:
5526 	case NFA_MOPEN7:
5527 	case NFA_MOPEN8:
5528 	case NFA_MOPEN9:
5529 #ifdef FEAT_SYN_HL
5530 	case NFA_ZOPEN:
5531 	case NFA_ZOPEN1:
5532 	case NFA_ZOPEN2:
5533 	case NFA_ZOPEN3:
5534 	case NFA_ZOPEN4:
5535 	case NFA_ZOPEN5:
5536 	case NFA_ZOPEN6:
5537 	case NFA_ZOPEN7:
5538 	case NFA_ZOPEN8:
5539 	case NFA_ZOPEN9:
5540 	case NFA_ZCLOSE:
5541 	case NFA_ZCLOSE1:
5542 	case NFA_ZCLOSE2:
5543 	case NFA_ZCLOSE3:
5544 	case NFA_ZCLOSE4:
5545 	case NFA_ZCLOSE5:
5546 	case NFA_ZCLOSE6:
5547 	case NFA_ZCLOSE7:
5548 	case NFA_ZCLOSE8:
5549 	case NFA_ZCLOSE9:
5550 #endif
5551 	case NFA_NOPEN:
5552 	case NFA_MCLOSE1:
5553 	case NFA_MCLOSE2:
5554 	case NFA_MCLOSE3:
5555 	case NFA_MCLOSE4:
5556 	case NFA_MCLOSE5:
5557 	case NFA_MCLOSE6:
5558 	case NFA_MCLOSE7:
5559 	case NFA_MCLOSE8:
5560 	case NFA_MCLOSE9:
5561 	case NFA_NCLOSE:
5562 	    return failure_chance(state->out, depth + 1);
5563 
5564 	case NFA_BACKREF1:
5565 	case NFA_BACKREF2:
5566 	case NFA_BACKREF3:
5567 	case NFA_BACKREF4:
5568 	case NFA_BACKREF5:
5569 	case NFA_BACKREF6:
5570 	case NFA_BACKREF7:
5571 	case NFA_BACKREF8:
5572 	case NFA_BACKREF9:
5573 #ifdef FEAT_SYN_HL
5574 	case NFA_ZREF1:
5575 	case NFA_ZREF2:
5576 	case NFA_ZREF3:
5577 	case NFA_ZREF4:
5578 	case NFA_ZREF5:
5579 	case NFA_ZREF6:
5580 	case NFA_ZREF7:
5581 	case NFA_ZREF8:
5582 	case NFA_ZREF9:
5583 #endif
5584 	    // backreferences don't match in many places
5585 	    return 94;
5586 
5587 	case NFA_LNUM_GT:
5588 	case NFA_LNUM_LT:
5589 	case NFA_COL_GT:
5590 	case NFA_COL_LT:
5591 	case NFA_VCOL_GT:
5592 	case NFA_VCOL_LT:
5593 	case NFA_MARK_GT:
5594 	case NFA_MARK_LT:
5595 	case NFA_VISUAL:
5596 	    // before/after positions don't match very often
5597 	    return 85;
5598 
5599 	case NFA_LNUM:
5600 	    return 90;
5601 
5602 	case NFA_CURSOR:
5603 	case NFA_COL:
5604 	case NFA_VCOL:
5605 	case NFA_MARK:
5606 	    // specific positions rarely match
5607 	    return 98;
5608 
5609 	case NFA_COMPOSING:
5610 	    return 95;
5611 
5612 	default:
5613 	    if (c > 0)
5614 		// character match fails often
5615 		return 95;
5616     }
5617 
5618     // something else, includes character classes
5619     return 50;
5620 }
5621 
5622 /*
5623  * Skip until the char "c" we know a match must start with.
5624  */
5625     static int
skip_to_start(int c,colnr_T * colp)5626 skip_to_start(int c, colnr_T *colp)
5627 {
5628     char_u *s;
5629 
5630     // Used often, do some work to avoid call overhead.
5631     if (!rex.reg_ic && !has_mbyte)
5632 	s = vim_strbyte(rex.line + *colp, c);
5633     else
5634 	s = cstrchr(rex.line + *colp, c);
5635     if (s == NULL)
5636 	return FAIL;
5637     *colp = (int)(s - rex.line);
5638     return OK;
5639 }
5640 
5641 /*
5642  * Check for a match with match_text.
5643  * Called after skip_to_start() has found regstart.
5644  * Returns zero for no match, 1 for a match.
5645  */
5646     static long
find_match_text(colnr_T startcol,int regstart,char_u * match_text)5647 find_match_text(colnr_T startcol, int regstart, char_u *match_text)
5648 {
5649     colnr_T col = startcol;
5650     int	    c1, c2;
5651     int	    len1, len2;
5652     int	    match;
5653 
5654     for (;;)
5655     {
5656 	match = TRUE;
5657 	len2 = MB_CHAR2LEN(regstart); // skip regstart
5658 	for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
5659 	{
5660 	    c1 = PTR2CHAR(match_text + len1);
5661 	    c2 = PTR2CHAR(rex.line + col + len2);
5662 	    if (c1 != c2 && (!rex.reg_ic || MB_CASEFOLD(c1) != MB_CASEFOLD(c2)))
5663 	    {
5664 		match = FALSE;
5665 		break;
5666 	    }
5667 	    len2 += enc_utf8 ? utf_ptr2len(rex.line + col + len2)
5668 							     : MB_CHAR2LEN(c2);
5669 	}
5670 	if (match
5671 		// check that no composing char follows
5672 		&& !(enc_utf8
5673 			  && utf_iscomposing(PTR2CHAR(rex.line + col + len2))))
5674 	{
5675 	    cleanup_subexpr();
5676 	    if (REG_MULTI)
5677 	    {
5678 		rex.reg_startpos[0].lnum = rex.lnum;
5679 		rex.reg_startpos[0].col = col;
5680 		rex.reg_endpos[0].lnum = rex.lnum;
5681 		rex.reg_endpos[0].col = col + len2;
5682 	    }
5683 	    else
5684 	    {
5685 		rex.reg_startp[0] = rex.line + col;
5686 		rex.reg_endp[0] = rex.line + col + len2;
5687 	    }
5688 	    return 1L;
5689 	}
5690 
5691 	// Try finding regstart after the current match.
5692 	col += MB_CHAR2LEN(regstart); // skip regstart
5693 	if (skip_to_start(regstart, &col) == FAIL)
5694 	    break;
5695     }
5696     return 0L;
5697 }
5698 
5699 #ifdef FEAT_RELTIME
5700     static int
nfa_did_time_out()5701 nfa_did_time_out()
5702 {
5703     if (nfa_time_limit != NULL && profile_passed_limit(nfa_time_limit))
5704     {
5705 	if (nfa_timed_out != NULL)
5706 	    *nfa_timed_out = TRUE;
5707 	return TRUE;
5708     }
5709     return FALSE;
5710 }
5711 #endif
5712 
5713 /*
5714  * Main matching routine.
5715  *
5716  * Run NFA to determine whether it matches rex.input.
5717  *
5718  * When "nfa_endp" is not NULL it is a required end-of-match position.
5719  *
5720  * Return TRUE if there is a match, FALSE if there is no match,
5721  * NFA_TOO_EXPENSIVE if we end up with too many states.
5722  * When there is a match "submatch" contains the positions.
5723  *
5724  * Note: Caller must ensure that: start != NULL.
5725  */
5726     static int
nfa_regmatch(nfa_regprog_T * prog,nfa_state_T * start,regsubs_T * submatch,regsubs_T * m)5727 nfa_regmatch(
5728     nfa_regprog_T	*prog,
5729     nfa_state_T		*start,
5730     regsubs_T		*submatch,
5731     regsubs_T		*m)
5732 {
5733     int		result = FALSE;
5734     size_t	size = 0;
5735     int		flag = 0;
5736     int		go_to_nextline = FALSE;
5737     nfa_thread_T *t;
5738     nfa_list_T	list[2];
5739     int		listidx;
5740     nfa_list_T	*thislist;
5741     nfa_list_T	*nextlist;
5742     int		*listids = NULL;
5743     int		listids_len = 0;
5744     nfa_state_T *add_state;
5745     int		add_here;
5746     int		add_count;
5747     int		add_off = 0;
5748     int		toplevel = start->c == NFA_MOPEN;
5749     regsubs_T	*r;
5750 #ifdef NFA_REGEXP_DEBUG_LOG
5751     FILE	*debug;
5752 #endif
5753 
5754     // Some patterns may take a long time to match, especially when using
5755     // recursive_regmatch(). Allow interrupting them with CTRL-C.
5756     fast_breakcheck();
5757     if (got_int)
5758 	return FALSE;
5759 #ifdef FEAT_RELTIME
5760     if (nfa_did_time_out())
5761 	return FALSE;
5762 #endif
5763 
5764 #ifdef NFA_REGEXP_DEBUG_LOG
5765     debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
5766     if (debug == NULL)
5767     {
5768 	semsg("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG);
5769 	return FALSE;
5770     }
5771 #endif
5772     nfa_match = FALSE;
5773 
5774     // Allocate memory for the lists of nodes.
5775     size = (prog->nstate + 1) * sizeof(nfa_thread_T);
5776 
5777     list[0].t = alloc(size);
5778     list[0].len = prog->nstate + 1;
5779     list[1].t = alloc(size);
5780     list[1].len = prog->nstate + 1;
5781     if (list[0].t == NULL || list[1].t == NULL)
5782 	goto theend;
5783 
5784 #ifdef ENABLE_LOG
5785     log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
5786     if (log_fd != NULL)
5787     {
5788 	fprintf(log_fd, "**********************************\n");
5789 	nfa_set_code(start->c);
5790 	fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
5791 	abs(start->id), code);
5792 	fprintf(log_fd, "**********************************\n");
5793     }
5794     else
5795     {
5796 	emsg(_(e_log_open_failed));
5797 	log_fd = stderr;
5798     }
5799 #endif
5800 
5801     thislist = &list[0];
5802     thislist->n = 0;
5803     thislist->has_pim = FALSE;
5804     nextlist = &list[1];
5805     nextlist->n = 0;
5806     nextlist->has_pim = FALSE;
5807 #ifdef ENABLE_LOG
5808     fprintf(log_fd, "(---) STARTSTATE first\n");
5809 #endif
5810     thislist->id = rex.nfa_listid + 1;
5811 
5812     // Inline optimized code for addstate(thislist, start, m, 0) if we know
5813     // it's the first MOPEN.
5814     if (toplevel)
5815     {
5816 	if (REG_MULTI)
5817 	{
5818 	    m->norm.list.multi[0].start_lnum = rex.lnum;
5819 	    m->norm.list.multi[0].start_col = (colnr_T)(rex.input - rex.line);
5820 	}
5821 	else
5822 	    m->norm.list.line[0].start = rex.input;
5823 	m->norm.in_use = 1;
5824 	r = addstate(thislist, start->out, m, NULL, 0);
5825     }
5826     else
5827 	r = addstate(thislist, start, m, NULL, 0);
5828     if (r == NULL)
5829     {
5830 	nfa_match = NFA_TOO_EXPENSIVE;
5831 	goto theend;
5832     }
5833 
5834 #define	ADD_STATE_IF_MATCH(state)			\
5835     if (result) {					\
5836 	add_state = state->out;				\
5837 	add_off = clen;					\
5838     }
5839 
5840     /*
5841      * Run for each character.
5842      */
5843     for (;;)
5844     {
5845 	int	curc;
5846 	int	clen;
5847 
5848 	if (has_mbyte)
5849 	{
5850 	    curc = (*mb_ptr2char)(rex.input);
5851 	    clen = (*mb_ptr2len)(rex.input);
5852 	}
5853 	else
5854 	{
5855 	    curc = *rex.input;
5856 	    clen = 1;
5857 	}
5858 	if (curc == NUL)
5859 	{
5860 	    clen = 0;
5861 	    go_to_nextline = FALSE;
5862 	}
5863 
5864 	// swap lists
5865 	thislist = &list[flag];
5866 	nextlist = &list[flag ^= 1];
5867 	nextlist->n = 0;	    // clear nextlist
5868 	nextlist->has_pim = FALSE;
5869 	++rex.nfa_listid;
5870 	if (prog->re_engine == AUTOMATIC_ENGINE
5871 		&& (rex.nfa_listid >= NFA_MAX_STATES
5872 # ifdef FEAT_EVAL
5873 		    || nfa_fail_for_testing
5874 # endif
5875 		    ))
5876 	{
5877 	    // too many states, retry with old engine
5878 	    nfa_match = NFA_TOO_EXPENSIVE;
5879 	    goto theend;
5880 	}
5881 
5882 	thislist->id = rex.nfa_listid;
5883 	nextlist->id = rex.nfa_listid + 1;
5884 
5885 #ifdef ENABLE_LOG
5886 	fprintf(log_fd, "------------------------------------------\n");
5887 	fprintf(log_fd, ">>> Reginput is \"%s\"\n", rex.input);
5888 	fprintf(log_fd, ">>> Advanced one character... Current char is %c (code %d) \n", curc, (int)curc);
5889 	fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
5890 	{
5891 	    int i;
5892 
5893 	    for (i = 0; i < thislist->n; i++)
5894 		fprintf(log_fd, "%d  ", abs(thislist->t[i].state->id));
5895 	}
5896 	fprintf(log_fd, "\n");
5897 #endif
5898 
5899 #ifdef NFA_REGEXP_DEBUG_LOG
5900 	fprintf(debug, "\n-------------------\n");
5901 #endif
5902 	/*
5903 	 * If the state lists are empty we can stop.
5904 	 */
5905 	if (thislist->n == 0)
5906 	    break;
5907 
5908 	// compute nextlist
5909 	for (listidx = 0; listidx < thislist->n; ++listidx)
5910 	{
5911 	    // If the list gets very long there probably is something wrong.
5912 	    // At least allow interrupting with CTRL-C.
5913 	    fast_breakcheck();
5914 	    if (got_int)
5915 		break;
5916 #ifdef FEAT_RELTIME
5917 	    if (nfa_time_limit != NULL && ++nfa_time_count == 20)
5918 	    {
5919 		nfa_time_count = 0;
5920 		if (nfa_did_time_out())
5921 		    break;
5922 	    }
5923 #endif
5924 	    t = &thislist->t[listidx];
5925 
5926 #ifdef NFA_REGEXP_DEBUG_LOG
5927 	    nfa_set_code(t->state->c);
5928 	    fprintf(debug, "%s, ", code);
5929 #endif
5930 #ifdef ENABLE_LOG
5931 	    {
5932 		int col;
5933 
5934 		if (t->subs.norm.in_use <= 0)
5935 		    col = -1;
5936 		else if (REG_MULTI)
5937 		    col = t->subs.norm.list.multi[0].start_col;
5938 		else
5939 		    col = (int)(t->subs.norm.list.line[0].start - rex.line);
5940 		nfa_set_code(t->state->c);
5941 		fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n",
5942 			abs(t->state->id), (int)t->state->c, code, col,
5943 			pim_info(&t->pim));
5944 	    }
5945 #endif
5946 
5947 	    /*
5948 	     * Handle the possible codes of the current state.
5949 	     * The most important is NFA_MATCH.
5950 	     */
5951 	    add_state = NULL;
5952 	    add_here = FALSE;
5953 	    add_count = 0;
5954 	    switch (t->state->c)
5955 	    {
5956 	    case NFA_MATCH:
5957 	      {
5958 		// If the match is not at the start of the line, ends before a
5959 		// composing characters and rex.reg_icombine is not set, that
5960 		// is not really a match.
5961 		if (enc_utf8 && !rex.reg_icombine
5962 			     && rex.input != rex.line && utf_iscomposing(curc))
5963 		    break;
5964 
5965 		nfa_match = TRUE;
5966 		copy_sub(&submatch->norm, &t->subs.norm);
5967 #ifdef FEAT_SYN_HL
5968 		if (rex.nfa_has_zsubexpr)
5969 		    copy_sub(&submatch->synt, &t->subs.synt);
5970 #endif
5971 #ifdef ENABLE_LOG
5972 		log_subsexpr(&t->subs);
5973 #endif
5974 		// Found the left-most longest match, do not look at any other
5975 		// states at this position.  When the list of states is going
5976 		// to be empty quit without advancing, so that "rex.input" is
5977 		// correct.
5978 		if (nextlist->n == 0)
5979 		    clen = 0;
5980 		goto nextchar;
5981 	      }
5982 
5983 	    case NFA_END_INVISIBLE:
5984 	    case NFA_END_INVISIBLE_NEG:
5985 	    case NFA_END_PATTERN:
5986 		/*
5987 		 * This is only encountered after a NFA_START_INVISIBLE or
5988 		 * NFA_START_INVISIBLE_BEFORE node.
5989 		 * They surround a zero-width group, used with "\@=", "\&",
5990 		 * "\@!", "\@<=" and "\@<!".
5991 		 * If we got here, it means that the current "invisible" group
5992 		 * finished successfully, so return control to the parent
5993 		 * nfa_regmatch().  For a look-behind match only when it ends
5994 		 * in the position in "nfa_endp".
5995 		 * Submatches are stored in *m, and used in the parent call.
5996 		 */
5997 #ifdef ENABLE_LOG
5998 		if (nfa_endp != NULL)
5999 		{
6000 		    if (REG_MULTI)
6001 			fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
6002 				(int)rex.lnum,
6003 				(int)nfa_endp->se_u.pos.lnum,
6004 				(int)(rex.input - rex.line),
6005 				nfa_endp->se_u.pos.col);
6006 		    else
6007 			fprintf(log_fd, "Current col: %d, endp col: %d\n",
6008 				(int)(rex.input - rex.line),
6009 				(int)(nfa_endp->se_u.ptr - rex.input));
6010 		}
6011 #endif
6012 		// If "nfa_endp" is set it's only a match if it ends at
6013 		// "nfa_endp"
6014 		if (nfa_endp != NULL && (REG_MULTI
6015 			? (rex.lnum != nfa_endp->se_u.pos.lnum
6016 			    || (int)(rex.input - rex.line)
6017 						!= nfa_endp->se_u.pos.col)
6018 			: rex.input != nfa_endp->se_u.ptr))
6019 		    break;
6020 
6021 		// do not set submatches for \@!
6022 		if (t->state->c != NFA_END_INVISIBLE_NEG)
6023 		{
6024 		    copy_sub(&m->norm, &t->subs.norm);
6025 #ifdef FEAT_SYN_HL
6026 		    if (rex.nfa_has_zsubexpr)
6027 			copy_sub(&m->synt, &t->subs.synt);
6028 #endif
6029 		}
6030 #ifdef ENABLE_LOG
6031 		fprintf(log_fd, "Match found:\n");
6032 		log_subsexpr(m);
6033 #endif
6034 		nfa_match = TRUE;
6035 		// See comment above at "goto nextchar".
6036 		if (nextlist->n == 0)
6037 		    clen = 0;
6038 		goto nextchar;
6039 
6040 	    case NFA_START_INVISIBLE:
6041 	    case NFA_START_INVISIBLE_FIRST:
6042 	    case NFA_START_INVISIBLE_NEG:
6043 	    case NFA_START_INVISIBLE_NEG_FIRST:
6044 	    case NFA_START_INVISIBLE_BEFORE:
6045 	    case NFA_START_INVISIBLE_BEFORE_FIRST:
6046 	    case NFA_START_INVISIBLE_BEFORE_NEG:
6047 	    case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
6048 		{
6049 #ifdef ENABLE_LOG
6050 		    fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n",
6051 			    failure_chance(t->state->out, 0),
6052 			    failure_chance(t->state->out1->out, 0));
6053 #endif
6054 		    // Do it directly if there already is a PIM or when
6055 		    // nfa_postprocess() detected it will work better.
6056 		    if (t->pim.result != NFA_PIM_UNUSED
6057 			 || t->state->c == NFA_START_INVISIBLE_FIRST
6058 			 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6059 			 || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST
6060 			 || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
6061 		    {
6062 			int in_use = m->norm.in_use;
6063 
6064 			// Copy submatch info for the recursive call, opposite
6065 			// of what happens on success below.
6066 			copy_sub_off(&m->norm, &t->subs.norm);
6067 #ifdef FEAT_SYN_HL
6068 			if (rex.nfa_has_zsubexpr)
6069 			    copy_sub_off(&m->synt, &t->subs.synt);
6070 #endif
6071 
6072 			/*
6073 			 * First try matching the invisible match, then what
6074 			 * follows.
6075 			 */
6076 			result = recursive_regmatch(t->state, NULL, prog,
6077 					  submatch, m, &listids, &listids_len);
6078 			if (result == NFA_TOO_EXPENSIVE)
6079 			{
6080 			    nfa_match = result;
6081 			    goto theend;
6082 			}
6083 
6084 			// for \@! and \@<! it is a match when the result is
6085 			// FALSE
6086 			if (result != (t->state->c == NFA_START_INVISIBLE_NEG
6087 			       || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6088 			       || t->state->c
6089 					   == NFA_START_INVISIBLE_BEFORE_NEG
6090 			       || t->state->c
6091 				     == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
6092 			{
6093 			    // Copy submatch info from the recursive call
6094 			    copy_sub_off(&t->subs.norm, &m->norm);
6095 #ifdef FEAT_SYN_HL
6096 			    if (rex.nfa_has_zsubexpr)
6097 				copy_sub_off(&t->subs.synt, &m->synt);
6098 #endif
6099 			    // If the pattern has \ze and it matched in the
6100 			    // sub pattern, use it.
6101 			    copy_ze_off(&t->subs.norm, &m->norm);
6102 
6103 			    // t->state->out1 is the corresponding
6104 			    // END_INVISIBLE node; Add its out to the current
6105 			    // list (zero-width match).
6106 			    add_here = TRUE;
6107 			    add_state = t->state->out1->out;
6108 			}
6109 			m->norm.in_use = in_use;
6110 		    }
6111 		    else
6112 		    {
6113 			nfa_pim_T pim;
6114 
6115 			/*
6116 			 * First try matching what follows.  Only if a match
6117 			 * is found verify the invisible match matches.  Add a
6118 			 * nfa_pim_T to the following states, it contains info
6119 			 * about the invisible match.
6120 			 */
6121 			pim.state = t->state;
6122 			pim.result = NFA_PIM_TODO;
6123 			pim.subs.norm.in_use = 0;
6124 #ifdef FEAT_SYN_HL
6125 			pim.subs.synt.in_use = 0;
6126 #endif
6127 			if (REG_MULTI)
6128 			{
6129 			    pim.end.pos.col = (int)(rex.input - rex.line);
6130 			    pim.end.pos.lnum = rex.lnum;
6131 			}
6132 			else
6133 			    pim.end.ptr = rex.input;
6134 
6135 			// t->state->out1 is the corresponding END_INVISIBLE
6136 			// node; Add its out to the current list (zero-width
6137 			// match).
6138 			if (addstate_here(thislist, t->state->out1->out,
6139 					     &t->subs, &pim, &listidx) == NULL)
6140 			{
6141 			    nfa_match = NFA_TOO_EXPENSIVE;
6142 			    goto theend;
6143 			}
6144 		    }
6145 		}
6146 		break;
6147 
6148 	    case NFA_START_PATTERN:
6149 	      {
6150 		nfa_state_T *skip = NULL;
6151 #ifdef ENABLE_LOG
6152 		int	    skip_lid = 0;
6153 #endif
6154 
6155 		// There is no point in trying to match the pattern if the
6156 		// output state is not going to be added to the list.
6157 		if (state_in_list(nextlist, t->state->out1->out, &t->subs))
6158 		{
6159 		    skip = t->state->out1->out;
6160 #ifdef ENABLE_LOG
6161 		    skip_lid = nextlist->id;
6162 #endif
6163 		}
6164 		else if (state_in_list(nextlist,
6165 					  t->state->out1->out->out, &t->subs))
6166 		{
6167 		    skip = t->state->out1->out->out;
6168 #ifdef ENABLE_LOG
6169 		    skip_lid = nextlist->id;
6170 #endif
6171 		}
6172 		else if (state_in_list(thislist,
6173 					  t->state->out1->out->out, &t->subs))
6174 		{
6175 		    skip = t->state->out1->out->out;
6176 #ifdef ENABLE_LOG
6177 		    skip_lid = thislist->id;
6178 #endif
6179 		}
6180 		if (skip != NULL)
6181 		{
6182 #ifdef ENABLE_LOG
6183 		    nfa_set_code(skip->c);
6184 		    fprintf(log_fd, "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n",
6185 			    abs(skip->id), skip_lid, skip->c, code);
6186 #endif
6187 		    break;
6188 		}
6189 		// Copy submatch info to the recursive call, opposite of what
6190 		// happens afterwards.
6191 		copy_sub_off(&m->norm, &t->subs.norm);
6192 #ifdef FEAT_SYN_HL
6193 		if (rex.nfa_has_zsubexpr)
6194 		    copy_sub_off(&m->synt, &t->subs.synt);
6195 #endif
6196 
6197 		// First try matching the pattern.
6198 		result = recursive_regmatch(t->state, NULL, prog,
6199 					  submatch, m, &listids, &listids_len);
6200 		if (result == NFA_TOO_EXPENSIVE)
6201 		{
6202 		    nfa_match = result;
6203 		    goto theend;
6204 		}
6205 		if (result)
6206 		{
6207 		    int bytelen;
6208 
6209 #ifdef ENABLE_LOG
6210 		    fprintf(log_fd, "NFA_START_PATTERN matches:\n");
6211 		    log_subsexpr(m);
6212 #endif
6213 		    // Copy submatch info from the recursive call
6214 		    copy_sub_off(&t->subs.norm, &m->norm);
6215 #ifdef FEAT_SYN_HL
6216 		    if (rex.nfa_has_zsubexpr)
6217 			copy_sub_off(&t->subs.synt, &m->synt);
6218 #endif
6219 		    // Now we need to skip over the matched text and then
6220 		    // continue with what follows.
6221 		    if (REG_MULTI)
6222 			// TODO: multi-line match
6223 			bytelen = m->norm.list.multi[0].end_col
6224 						  - (int)(rex.input - rex.line);
6225 		    else
6226 			bytelen = (int)(m->norm.list.line[0].end - rex.input);
6227 
6228 #ifdef ENABLE_LOG
6229 		    fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
6230 #endif
6231 		    if (bytelen == 0)
6232 		    {
6233 			// empty match, output of corresponding
6234 			// NFA_END_PATTERN/NFA_SKIP to be used at current
6235 			// position
6236 			add_here = TRUE;
6237 			add_state = t->state->out1->out->out;
6238 		    }
6239 		    else if (bytelen <= clen)
6240 		    {
6241 			// match current character, output of corresponding
6242 			// NFA_END_PATTERN to be used at next position.
6243 			add_state = t->state->out1->out->out;
6244 			add_off = clen;
6245 		    }
6246 		    else
6247 		    {
6248 			// skip over the matched characters, set character
6249 			// count in NFA_SKIP
6250 			add_state = t->state->out1->out;
6251 			add_off = bytelen;
6252 			add_count = bytelen - clen;
6253 		    }
6254 		}
6255 		break;
6256 	      }
6257 
6258 	    case NFA_BOL:
6259 		if (rex.input == rex.line)
6260 		{
6261 		    add_here = TRUE;
6262 		    add_state = t->state->out;
6263 		}
6264 		break;
6265 
6266 	    case NFA_EOL:
6267 		if (curc == NUL)
6268 		{
6269 		    add_here = TRUE;
6270 		    add_state = t->state->out;
6271 		}
6272 		break;
6273 
6274 	    case NFA_BOW:
6275 		result = TRUE;
6276 
6277 		if (curc == NUL)
6278 		    result = FALSE;
6279 		else if (has_mbyte)
6280 		{
6281 		    int this_class;
6282 
6283 		    // Get class of current and previous char (if it exists).
6284 		    this_class = mb_get_class_buf(rex.input, rex.reg_buf);
6285 		    if (this_class <= 1)
6286 			result = FALSE;
6287 		    else if (reg_prev_class() == this_class)
6288 			result = FALSE;
6289 		}
6290 		else if (!vim_iswordc_buf(curc, rex.reg_buf)
6291 			   || (rex.input > rex.line
6292 				&& vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
6293 		    result = FALSE;
6294 		if (result)
6295 		{
6296 		    add_here = TRUE;
6297 		    add_state = t->state->out;
6298 		}
6299 		break;
6300 
6301 	    case NFA_EOW:
6302 		result = TRUE;
6303 		if (rex.input == rex.line)
6304 		    result = FALSE;
6305 		else if (has_mbyte)
6306 		{
6307 		    int this_class, prev_class;
6308 
6309 		    // Get class of current and previous char (if it exists).
6310 		    this_class = mb_get_class_buf(rex.input, rex.reg_buf);
6311 		    prev_class = reg_prev_class();
6312 		    if (this_class == prev_class
6313 					|| prev_class == 0 || prev_class == 1)
6314 			result = FALSE;
6315 		}
6316 		else if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
6317 			|| (rex.input[0] != NUL
6318 					&& vim_iswordc_buf(curc, rex.reg_buf)))
6319 		    result = FALSE;
6320 		if (result)
6321 		{
6322 		    add_here = TRUE;
6323 		    add_state = t->state->out;
6324 		}
6325 		break;
6326 
6327 	    case NFA_BOF:
6328 		if (rex.lnum == 0 && rex.input == rex.line
6329 				     && (!REG_MULTI || rex.reg_firstlnum == 1))
6330 		{
6331 		    add_here = TRUE;
6332 		    add_state = t->state->out;
6333 		}
6334 		break;
6335 
6336 	    case NFA_EOF:
6337 		if (rex.lnum == rex.reg_maxline && curc == NUL)
6338 		{
6339 		    add_here = TRUE;
6340 		    add_state = t->state->out;
6341 		}
6342 		break;
6343 
6344 	    case NFA_COMPOSING:
6345 	    {
6346 		int	    mc = curc;
6347 		int	    len = 0;
6348 		nfa_state_T *end;
6349 		nfa_state_T *sta;
6350 		int	    cchars[MAX_MCO];
6351 		int	    ccount = 0;
6352 		int	    j;
6353 
6354 		sta = t->state->out;
6355 		len = 0;
6356 		if (utf_iscomposing(sta->c))
6357 		{
6358 		    // Only match composing character(s), ignore base
6359 		    // character.  Used for ".{composing}" and "{composing}"
6360 		    // (no preceding character).
6361 		    len += mb_char2len(mc);
6362 		}
6363 		if (rex.reg_icombine && len == 0)
6364 		{
6365 		    // If \Z was present, then ignore composing characters.
6366 		    // When ignoring the base character this always matches.
6367 		    if (sta->c != curc)
6368 			result = FAIL;
6369 		    else
6370 			result = OK;
6371 		    while (sta->c != NFA_END_COMPOSING)
6372 			sta = sta->out;
6373 		}
6374 
6375 		// Check base character matches first, unless ignored.
6376 		else if (len > 0 || mc == sta->c)
6377 		{
6378 		    if (len == 0)
6379 		    {
6380 			len += mb_char2len(mc);
6381 			sta = sta->out;
6382 		    }
6383 
6384 		    // We don't care about the order of composing characters.
6385 		    // Get them into cchars[] first.
6386 		    while (len < clen)
6387 		    {
6388 			mc = mb_ptr2char(rex.input + len);
6389 			cchars[ccount++] = mc;
6390 			len += mb_char2len(mc);
6391 			if (ccount == MAX_MCO)
6392 			    break;
6393 		    }
6394 
6395 		    // Check that each composing char in the pattern matches a
6396 		    // composing char in the text.  We do not check if all
6397 		    // composing chars are matched.
6398 		    result = OK;
6399 		    while (sta->c != NFA_END_COMPOSING)
6400 		    {
6401 			for (j = 0; j < ccount; ++j)
6402 			    if (cchars[j] == sta->c)
6403 				break;
6404 			if (j == ccount)
6405 			{
6406 			    result = FAIL;
6407 			    break;
6408 			}
6409 			sta = sta->out;
6410 		    }
6411 		}
6412 		else
6413 		    result = FAIL;
6414 
6415 		end = t->state->out1;	    // NFA_END_COMPOSING
6416 		ADD_STATE_IF_MATCH(end);
6417 		break;
6418 	    }
6419 
6420 	    case NFA_NEWL:
6421 		if (curc == NUL && !rex.reg_line_lbr && REG_MULTI
6422 						 && rex.lnum <= rex.reg_maxline)
6423 		{
6424 		    go_to_nextline = TRUE;
6425 		    // Pass -1 for the offset, which means taking the position
6426 		    // at the start of the next line.
6427 		    add_state = t->state->out;
6428 		    add_off = -1;
6429 		}
6430 		else if (curc == '\n' && rex.reg_line_lbr)
6431 		{
6432 		    // match \n as if it is an ordinary character
6433 		    add_state = t->state->out;
6434 		    add_off = 1;
6435 		}
6436 		break;
6437 
6438 	    case NFA_START_COLL:
6439 	    case NFA_START_NEG_COLL:
6440 	      {
6441 		// What follows is a list of characters, until NFA_END_COLL.
6442 		// One of them must match or none of them must match.
6443 		nfa_state_T	*state;
6444 		int		result_if_matched;
6445 		int		c1, c2;
6446 
6447 		// Never match EOL. If it's part of the collection it is added
6448 		// as a separate state with an OR.
6449 		if (curc == NUL)
6450 		    break;
6451 
6452 		state = t->state->out;
6453 		result_if_matched = (t->state->c == NFA_START_COLL);
6454 		for (;;)
6455 		{
6456 		    if (state->c == NFA_END_COLL)
6457 		    {
6458 			result = !result_if_matched;
6459 			break;
6460 		    }
6461 		    if (state->c == NFA_RANGE_MIN)
6462 		    {
6463 			c1 = state->val;
6464 			state = state->out; // advance to NFA_RANGE_MAX
6465 			c2 = state->val;
6466 #ifdef ENABLE_LOG
6467 			fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
6468 				curc, c1, c2);
6469 #endif
6470 			if (curc >= c1 && curc <= c2)
6471 			{
6472 			    result = result_if_matched;
6473 			    break;
6474 			}
6475 			if (rex.reg_ic)
6476 			{
6477 			    int curc_low = MB_CASEFOLD(curc);
6478 			    int done = FALSE;
6479 
6480 			    for ( ; c1 <= c2; ++c1)
6481 				if (MB_CASEFOLD(c1) == curc_low)
6482 				{
6483 				    result = result_if_matched;
6484 				    done = TRUE;
6485 				    break;
6486 				}
6487 			    if (done)
6488 				break;
6489 			}
6490 		    }
6491 		    else if (state->c < 0 ? check_char_class(state->c, curc)
6492 			       : (curc == state->c
6493 				   || (rex.reg_ic && MB_CASEFOLD(curc)
6494 						    == MB_CASEFOLD(state->c))))
6495 		    {
6496 			result = result_if_matched;
6497 			break;
6498 		    }
6499 		    state = state->out;
6500 		}
6501 		if (result)
6502 		{
6503 		    // next state is in out of the NFA_END_COLL, out1 of
6504 		    // START points to the END state
6505 		    add_state = t->state->out1->out;
6506 		    add_off = clen;
6507 		}
6508 		break;
6509 	      }
6510 
6511 	    case NFA_ANY:
6512 		// Any char except '\0', (end of input) does not match.
6513 		if (curc > 0)
6514 		{
6515 		    add_state = t->state->out;
6516 		    add_off = clen;
6517 		}
6518 		break;
6519 
6520 	    case NFA_ANY_COMPOSING:
6521 		// On a composing character skip over it.  Otherwise do
6522 		// nothing.  Always matches.
6523 		if (enc_utf8 && utf_iscomposing(curc))
6524 		{
6525 		    add_off = clen;
6526 		}
6527 		else
6528 		{
6529 		    add_here = TRUE;
6530 		    add_off = 0;
6531 		}
6532 		add_state = t->state->out;
6533 		break;
6534 
6535 	    /*
6536 	     * Character classes like \a for alpha, \d for digit etc.
6537 	     */
6538 	    case NFA_IDENT:	//  \i
6539 		result = vim_isIDc(curc);
6540 		ADD_STATE_IF_MATCH(t->state);
6541 		break;
6542 
6543 	    case NFA_SIDENT:	//  \I
6544 		result = !VIM_ISDIGIT(curc) && vim_isIDc(curc);
6545 		ADD_STATE_IF_MATCH(t->state);
6546 		break;
6547 
6548 	    case NFA_KWORD:	//  \k
6549 		result = vim_iswordp_buf(rex.input, rex.reg_buf);
6550 		ADD_STATE_IF_MATCH(t->state);
6551 		break;
6552 
6553 	    case NFA_SKWORD:	//  \K
6554 		result = !VIM_ISDIGIT(curc)
6555 				     && vim_iswordp_buf(rex.input, rex.reg_buf);
6556 		ADD_STATE_IF_MATCH(t->state);
6557 		break;
6558 
6559 	    case NFA_FNAME:	//  \f
6560 		result = vim_isfilec(curc);
6561 		ADD_STATE_IF_MATCH(t->state);
6562 		break;
6563 
6564 	    case NFA_SFNAME:	//  \F
6565 		result = !VIM_ISDIGIT(curc) && vim_isfilec(curc);
6566 		ADD_STATE_IF_MATCH(t->state);
6567 		break;
6568 
6569 	    case NFA_PRINT:	//  \p
6570 		result = vim_isprintc(PTR2CHAR(rex.input));
6571 		ADD_STATE_IF_MATCH(t->state);
6572 		break;
6573 
6574 	    case NFA_SPRINT:	//  \P
6575 		result = !VIM_ISDIGIT(curc) && vim_isprintc(PTR2CHAR(rex.input));
6576 		ADD_STATE_IF_MATCH(t->state);
6577 		break;
6578 
6579 	    case NFA_WHITE:	//  \s
6580 		result = VIM_ISWHITE(curc);
6581 		ADD_STATE_IF_MATCH(t->state);
6582 		break;
6583 
6584 	    case NFA_NWHITE:	//  \S
6585 		result = curc != NUL && !VIM_ISWHITE(curc);
6586 		ADD_STATE_IF_MATCH(t->state);
6587 		break;
6588 
6589 	    case NFA_DIGIT:	//  \d
6590 		result = ri_digit(curc);
6591 		ADD_STATE_IF_MATCH(t->state);
6592 		break;
6593 
6594 	    case NFA_NDIGIT:	//  \D
6595 		result = curc != NUL && !ri_digit(curc);
6596 		ADD_STATE_IF_MATCH(t->state);
6597 		break;
6598 
6599 	    case NFA_HEX:	//  \x
6600 		result = ri_hex(curc);
6601 		ADD_STATE_IF_MATCH(t->state);
6602 		break;
6603 
6604 	    case NFA_NHEX:	//  \X
6605 		result = curc != NUL && !ri_hex(curc);
6606 		ADD_STATE_IF_MATCH(t->state);
6607 		break;
6608 
6609 	    case NFA_OCTAL:	//  \o
6610 		result = ri_octal(curc);
6611 		ADD_STATE_IF_MATCH(t->state);
6612 		break;
6613 
6614 	    case NFA_NOCTAL:	//  \O
6615 		result = curc != NUL && !ri_octal(curc);
6616 		ADD_STATE_IF_MATCH(t->state);
6617 		break;
6618 
6619 	    case NFA_WORD:	//  \w
6620 		result = ri_word(curc);
6621 		ADD_STATE_IF_MATCH(t->state);
6622 		break;
6623 
6624 	    case NFA_NWORD:	//  \W
6625 		result = curc != NUL && !ri_word(curc);
6626 		ADD_STATE_IF_MATCH(t->state);
6627 		break;
6628 
6629 	    case NFA_HEAD:	//  \h
6630 		result = ri_head(curc);
6631 		ADD_STATE_IF_MATCH(t->state);
6632 		break;
6633 
6634 	    case NFA_NHEAD:	//  \H
6635 		result = curc != NUL && !ri_head(curc);
6636 		ADD_STATE_IF_MATCH(t->state);
6637 		break;
6638 
6639 	    case NFA_ALPHA:	//  \a
6640 		result = ri_alpha(curc);
6641 		ADD_STATE_IF_MATCH(t->state);
6642 		break;
6643 
6644 	    case NFA_NALPHA:	//  \A
6645 		result = curc != NUL && !ri_alpha(curc);
6646 		ADD_STATE_IF_MATCH(t->state);
6647 		break;
6648 
6649 	    case NFA_LOWER:	//  \l
6650 		result = ri_lower(curc);
6651 		ADD_STATE_IF_MATCH(t->state);
6652 		break;
6653 
6654 	    case NFA_NLOWER:	//  \L
6655 		result = curc != NUL && !ri_lower(curc);
6656 		ADD_STATE_IF_MATCH(t->state);
6657 		break;
6658 
6659 	    case NFA_UPPER:	//  \u
6660 		result = ri_upper(curc);
6661 		ADD_STATE_IF_MATCH(t->state);
6662 		break;
6663 
6664 	    case NFA_NUPPER:	// \U
6665 		result = curc != NUL && !ri_upper(curc);
6666 		ADD_STATE_IF_MATCH(t->state);
6667 		break;
6668 
6669 	    case NFA_LOWER_IC:	// [a-z]
6670 		result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc));
6671 		ADD_STATE_IF_MATCH(t->state);
6672 		break;
6673 
6674 	    case NFA_NLOWER_IC:	// [^a-z]
6675 		result = curc != NUL
6676 			&& !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc)));
6677 		ADD_STATE_IF_MATCH(t->state);
6678 		break;
6679 
6680 	    case NFA_UPPER_IC:	// [A-Z]
6681 		result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc));
6682 		ADD_STATE_IF_MATCH(t->state);
6683 		break;
6684 
6685 	    case NFA_NUPPER_IC:	// ^[A-Z]
6686 		result = curc != NUL
6687 			&& !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc)));
6688 		ADD_STATE_IF_MATCH(t->state);
6689 		break;
6690 
6691 	    case NFA_BACKREF1:
6692 	    case NFA_BACKREF2:
6693 	    case NFA_BACKREF3:
6694 	    case NFA_BACKREF4:
6695 	    case NFA_BACKREF5:
6696 	    case NFA_BACKREF6:
6697 	    case NFA_BACKREF7:
6698 	    case NFA_BACKREF8:
6699 	    case NFA_BACKREF9:
6700 #ifdef FEAT_SYN_HL
6701 	    case NFA_ZREF1:
6702 	    case NFA_ZREF2:
6703 	    case NFA_ZREF3:
6704 	    case NFA_ZREF4:
6705 	    case NFA_ZREF5:
6706 	    case NFA_ZREF6:
6707 	    case NFA_ZREF7:
6708 	    case NFA_ZREF8:
6709 	    case NFA_ZREF9:
6710 #endif
6711 		// \1 .. \9  \z1 .. \z9
6712 	      {
6713 		int subidx;
6714 		int bytelen;
6715 
6716 		if (t->state->c <= NFA_BACKREF9)
6717 		{
6718 		    subidx = t->state->c - NFA_BACKREF1 + 1;
6719 		    result = match_backref(&t->subs.norm, subidx, &bytelen);
6720 		}
6721 #ifdef FEAT_SYN_HL
6722 		else
6723 		{
6724 		    subidx = t->state->c - NFA_ZREF1 + 1;
6725 		    result = match_zref(subidx, &bytelen);
6726 		}
6727 #endif
6728 
6729 		if (result)
6730 		{
6731 		    if (bytelen == 0)
6732 		    {
6733 			// empty match always works, output of NFA_SKIP to be
6734 			// used next
6735 			add_here = TRUE;
6736 			add_state = t->state->out->out;
6737 		    }
6738 		    else if (bytelen <= clen)
6739 		    {
6740 			// match current character, jump ahead to out of
6741 			// NFA_SKIP
6742 			add_state = t->state->out->out;
6743 			add_off = clen;
6744 		    }
6745 		    else
6746 		    {
6747 			// skip over the matched characters, set character
6748 			// count in NFA_SKIP
6749 			add_state = t->state->out;
6750 			add_off = bytelen;
6751 			add_count = bytelen - clen;
6752 		    }
6753 		}
6754 		break;
6755 	      }
6756 	    case NFA_SKIP:
6757 	      // character of previous matching \1 .. \9  or \@>
6758 	      if (t->count - clen <= 0)
6759 	      {
6760 		  // end of match, go to what follows
6761 		  add_state = t->state->out;
6762 		  add_off = clen;
6763 	      }
6764 	      else
6765 	      {
6766 		  // add state again with decremented count
6767 		  add_state = t->state;
6768 		  add_off = 0;
6769 		  add_count = t->count - clen;
6770 	      }
6771 	      break;
6772 
6773 	    case NFA_LNUM:
6774 	    case NFA_LNUM_GT:
6775 	    case NFA_LNUM_LT:
6776 		result = (REG_MULTI &&
6777 			nfa_re_num_cmp(t->state->val, t->state->c - NFA_LNUM,
6778 			    (long_u)(rex.lnum + rex.reg_firstlnum)));
6779 		if (result)
6780 		{
6781 		    add_here = TRUE;
6782 		    add_state = t->state->out;
6783 		}
6784 		break;
6785 
6786 	    case NFA_COL:
6787 	    case NFA_COL_GT:
6788 	    case NFA_COL_LT:
6789 		result = nfa_re_num_cmp(t->state->val, t->state->c - NFA_COL,
6790 			(long_u)(rex.input - rex.line) + 1);
6791 		if (result)
6792 		{
6793 		    add_here = TRUE;
6794 		    add_state = t->state->out;
6795 		}
6796 		break;
6797 
6798 	    case NFA_VCOL:
6799 	    case NFA_VCOL_GT:
6800 	    case NFA_VCOL_LT:
6801 		{
6802 		    int     op = t->state->c - NFA_VCOL;
6803 		    colnr_T col = (colnr_T)(rex.input - rex.line);
6804 		    win_T   *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
6805 
6806 		    // Bail out quickly when there can't be a match, avoid the
6807 		    // overhead of win_linetabsize() on long lines.
6808 		    if (op != 1 && col > t->state->val
6809 			    * (has_mbyte ? MB_MAXBYTES : 1))
6810 			break;
6811 		    result = FALSE;
6812 		    if (op == 1 && col - 1 > t->state->val && col > 100)
6813 		    {
6814 			int ts = wp->w_buffer->b_p_ts;
6815 
6816 			// Guess that a character won't use more columns than
6817 			// 'tabstop', with a minimum of 4.
6818 			if (ts < 4)
6819 			    ts = 4;
6820 			result = col > t->state->val * ts;
6821 		    }
6822 		    if (!result)
6823 			result = nfa_re_num_cmp(t->state->val, op,
6824 				(long_u)win_linetabsize(wp, rex.line, col) + 1);
6825 		    if (result)
6826 		    {
6827 			add_here = TRUE;
6828 			add_state = t->state->out;
6829 		    }
6830 		}
6831 		break;
6832 
6833 	    case NFA_MARK:
6834 	    case NFA_MARK_GT:
6835 	    case NFA_MARK_LT:
6836 	      {
6837 		size_t	col = rex.input - rex.line;
6838 		pos_T	*pos = getmark_buf(rex.reg_buf, t->state->val, FALSE);
6839 
6840 		// Line may have been freed, get it again.
6841 		if (REG_MULTI)
6842 		{
6843 		    rex.line = reg_getline(rex.lnum);
6844 		    rex.input = rex.line + col;
6845 		}
6846 
6847 		// Compare the mark position to the match position, if the mark
6848 		// exists and mark is set in reg_buf.
6849 		if (pos != NULL && pos->lnum > 0)
6850 		{
6851 		    colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
6852 							  && pos->col == MAXCOL
6853 				      ? (colnr_T)STRLEN(reg_getline(
6854 						pos->lnum - rex.reg_firstlnum))
6855 				      : pos->col;
6856 
6857 		    result = (pos->lnum == rex.lnum + rex.reg_firstlnum
6858 				? (pos_col == (colnr_T)(rex.input - rex.line)
6859 				    ? t->state->c == NFA_MARK
6860 				    : (pos_col < (colnr_T)(rex.input - rex.line)
6861 					? t->state->c == NFA_MARK_GT
6862 					: t->state->c == NFA_MARK_LT))
6863 				: (pos->lnum < rex.lnum + rex.reg_firstlnum
6864 				    ? t->state->c == NFA_MARK_GT
6865 				    : t->state->c == NFA_MARK_LT));
6866 		    if (result)
6867 		    {
6868 			add_here = TRUE;
6869 			add_state = t->state->out;
6870 		    }
6871 		}
6872 		break;
6873 	      }
6874 
6875 	    case NFA_CURSOR:
6876 		result = (rex.reg_win != NULL
6877 			&& (rex.lnum + rex.reg_firstlnum
6878 						 == rex.reg_win->w_cursor.lnum)
6879 			&& ((colnr_T)(rex.input - rex.line)
6880 						== rex.reg_win->w_cursor.col));
6881 		if (result)
6882 		{
6883 		    add_here = TRUE;
6884 		    add_state = t->state->out;
6885 		}
6886 		break;
6887 
6888 	    case NFA_VISUAL:
6889 		result = reg_match_visual();
6890 		if (result)
6891 		{
6892 		    add_here = TRUE;
6893 		    add_state = t->state->out;
6894 		}
6895 		break;
6896 
6897 	    case NFA_MOPEN1:
6898 	    case NFA_MOPEN2:
6899 	    case NFA_MOPEN3:
6900 	    case NFA_MOPEN4:
6901 	    case NFA_MOPEN5:
6902 	    case NFA_MOPEN6:
6903 	    case NFA_MOPEN7:
6904 	    case NFA_MOPEN8:
6905 	    case NFA_MOPEN9:
6906 #ifdef FEAT_SYN_HL
6907 	    case NFA_ZOPEN:
6908 	    case NFA_ZOPEN1:
6909 	    case NFA_ZOPEN2:
6910 	    case NFA_ZOPEN3:
6911 	    case NFA_ZOPEN4:
6912 	    case NFA_ZOPEN5:
6913 	    case NFA_ZOPEN6:
6914 	    case NFA_ZOPEN7:
6915 	    case NFA_ZOPEN8:
6916 	    case NFA_ZOPEN9:
6917 #endif
6918 	    case NFA_NOPEN:
6919 	    case NFA_ZSTART:
6920 		// These states are only added to be able to bail out when
6921 		// they are added again, nothing is to be done.
6922 		break;
6923 
6924 	    default:	// regular character
6925 	      {
6926 		int c = t->state->c;
6927 
6928 #ifdef DEBUG
6929 		if (c < 0)
6930 		    siemsg("INTERNAL: Negative state char: %ld", c);
6931 #endif
6932 		result = (c == curc);
6933 
6934 		if (!result && rex.reg_ic)
6935 		    result = MB_CASEFOLD(c) == MB_CASEFOLD(curc);
6936 		// If rex.reg_icombine is not set only skip over the character
6937 		// itself.  When it is set skip over composing characters.
6938 		if (result && enc_utf8 && !rex.reg_icombine)
6939 		    clen = utf_ptr2len(rex.input);
6940 		ADD_STATE_IF_MATCH(t->state);
6941 		break;
6942 	      }
6943 
6944 	    } // switch (t->state->c)
6945 
6946 	    if (add_state != NULL)
6947 	    {
6948 		nfa_pim_T *pim;
6949 		nfa_pim_T pim_copy;
6950 
6951 		if (t->pim.result == NFA_PIM_UNUSED)
6952 		    pim = NULL;
6953 		else
6954 		    pim = &t->pim;
6955 
6956 		// Handle the postponed invisible match if the match might end
6957 		// without advancing and before the end of the line.
6958 		if (pim != NULL && (clen == 0 || match_follows(add_state, 0)))
6959 		{
6960 		    if (pim->result == NFA_PIM_TODO)
6961 		    {
6962 #ifdef ENABLE_LOG
6963 			fprintf(log_fd, "\n");
6964 			fprintf(log_fd, "==================================\n");
6965 			fprintf(log_fd, "Postponed recursive nfa_regmatch()\n");
6966 			fprintf(log_fd, "\n");
6967 #endif
6968 			result = recursive_regmatch(pim->state, pim,
6969 				    prog, submatch, m, &listids, &listids_len);
6970 			pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH;
6971 			// for \@! and \@<! it is a match when the result is
6972 			// FALSE
6973 			if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
6974 			     || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6975 			     || pim->state->c
6976 					   == NFA_START_INVISIBLE_BEFORE_NEG
6977 			     || pim->state->c
6978 				     == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
6979 			{
6980 			    // Copy submatch info from the recursive call
6981 			    copy_sub_off(&pim->subs.norm, &m->norm);
6982 #ifdef FEAT_SYN_HL
6983 			    if (rex.nfa_has_zsubexpr)
6984 				copy_sub_off(&pim->subs.synt, &m->synt);
6985 #endif
6986 			}
6987 		    }
6988 		    else
6989 		    {
6990 			result = (pim->result == NFA_PIM_MATCH);
6991 #ifdef ENABLE_LOG
6992 			fprintf(log_fd, "\n");
6993 			fprintf(log_fd, "Using previous recursive nfa_regmatch() result, result == %d\n", pim->result);
6994 			fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
6995 			fprintf(log_fd, "\n");
6996 #endif
6997 		    }
6998 
6999 		    // for \@! and \@<! it is a match when result is FALSE
7000 		    if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
7001 			     || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
7002 			     || pim->state->c
7003 					   == NFA_START_INVISIBLE_BEFORE_NEG
7004 			     || pim->state->c
7005 				     == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
7006 		    {
7007 			// Copy submatch info from the recursive call
7008 			copy_sub_off(&t->subs.norm, &pim->subs.norm);
7009 #ifdef FEAT_SYN_HL
7010 			if (rex.nfa_has_zsubexpr)
7011 			    copy_sub_off(&t->subs.synt, &pim->subs.synt);
7012 #endif
7013 		    }
7014 		    else
7015 			// look-behind match failed, don't add the state
7016 			continue;
7017 
7018 		    // Postponed invisible match was handled, don't add it to
7019 		    // following states.
7020 		    pim = NULL;
7021 		}
7022 
7023 		// If "pim" points into l->t it will become invalid when
7024 		// adding the state causes the list to be reallocated.  Make a
7025 		// local copy to avoid that.
7026 		if (pim == &t->pim)
7027 		{
7028 		    copy_pim(&pim_copy, pim);
7029 		    pim = &pim_copy;
7030 		}
7031 
7032 		if (add_here)
7033 		    r = addstate_here(thislist, add_state, &t->subs,
7034 								pim, &listidx);
7035 		else
7036 		{
7037 		    r = addstate(nextlist, add_state, &t->subs, pim, add_off);
7038 		    if (add_count > 0)
7039 			nextlist->t[nextlist->n - 1].count = add_count;
7040 		}
7041 		if (r == NULL)
7042 		{
7043 		    nfa_match = NFA_TOO_EXPENSIVE;
7044 		    goto theend;
7045 		}
7046 	    }
7047 
7048 	} // for (thislist = thislist; thislist->state; thislist++)
7049 
7050 	// Look for the start of a match in the current position by adding the
7051 	// start state to the list of states.
7052 	// The first found match is the leftmost one, thus the order of states
7053 	// matters!
7054 	// Do not add the start state in recursive calls of nfa_regmatch(),
7055 	// because recursive calls should only start in the first position.
7056 	// Unless "nfa_endp" is not NULL, then we match the end position.
7057 	// Also don't start a match past the first line.
7058 	if (nfa_match == FALSE
7059 		&& ((toplevel
7060 			&& rex.lnum == 0
7061 			&& clen != 0
7062 			&& (rex.reg_maxcol == 0
7063 			    || (colnr_T)(rex.input - rex.line) < rex.reg_maxcol))
7064 		    || (nfa_endp != NULL
7065 			&& (REG_MULTI
7066 			    ? (rex.lnum < nfa_endp->se_u.pos.lnum
7067 			       || (rex.lnum == nfa_endp->se_u.pos.lnum
7068 				   && (int)(rex.input - rex.line)
7069 						    < nfa_endp->se_u.pos.col))
7070 			    : rex.input < nfa_endp->se_u.ptr))))
7071 	{
7072 #ifdef ENABLE_LOG
7073 	    fprintf(log_fd, "(---) STARTSTATE\n");
7074 #endif
7075 	    // Inline optimized code for addstate() if we know the state is
7076 	    // the first MOPEN.
7077 	    if (toplevel)
7078 	    {
7079 		int add = TRUE;
7080 		int c;
7081 
7082 		if (prog->regstart != NUL && clen != 0)
7083 		{
7084 		    if (nextlist->n == 0)
7085 		    {
7086 			colnr_T col = (colnr_T)(rex.input - rex.line) + clen;
7087 
7088 			// Nextlist is empty, we can skip ahead to the
7089 			// character that must appear at the start.
7090 			if (skip_to_start(prog->regstart, &col) == FAIL)
7091 			    break;
7092 #ifdef ENABLE_LOG
7093 			fprintf(log_fd, "  Skipping ahead %d bytes to regstart\n",
7094 				col - ((colnr_T)(rex.input - rex.line) + clen));
7095 #endif
7096 			rex.input = rex.line + col - clen;
7097 		    }
7098 		    else
7099 		    {
7100 			// Checking if the required start character matches is
7101 			// cheaper than adding a state that won't match.
7102 			c = PTR2CHAR(rex.input + clen);
7103 			if (c != prog->regstart && (!rex.reg_ic
7104 			     || MB_CASEFOLD(c) != MB_CASEFOLD(prog->regstart)))
7105 			{
7106 #ifdef ENABLE_LOG
7107 			    fprintf(log_fd, "  Skipping start state, regstart does not match\n");
7108 #endif
7109 			    add = FALSE;
7110 			}
7111 		    }
7112 		}
7113 
7114 		if (add)
7115 		{
7116 		    if (REG_MULTI)
7117 			m->norm.list.multi[0].start_col =
7118 					 (colnr_T)(rex.input - rex.line) + clen;
7119 		    else
7120 			m->norm.list.line[0].start = rex.input + clen;
7121 		    if (addstate(nextlist, start->out, m, NULL, clen) == NULL)
7122 		    {
7123 			nfa_match = NFA_TOO_EXPENSIVE;
7124 			goto theend;
7125 		    }
7126 		}
7127 	    }
7128 	    else
7129 	    {
7130 		if (addstate(nextlist, start, m, NULL, clen) == NULL)
7131 		{
7132 		    nfa_match = NFA_TOO_EXPENSIVE;
7133 		    goto theend;
7134 		}
7135 	    }
7136 	}
7137 
7138 #ifdef ENABLE_LOG
7139 	fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
7140 	{
7141 	    int i;
7142 
7143 	    for (i = 0; i < thislist->n; i++)
7144 		fprintf(log_fd, "%d  ", abs(thislist->t[i].state->id));
7145 	}
7146 	fprintf(log_fd, "\n");
7147 #endif
7148 
7149 nextchar:
7150 	// Advance to the next character, or advance to the next line, or
7151 	// finish.
7152 	if (clen != 0)
7153 	    rex.input += clen;
7154 	else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI
7155 					&& rex.lnum < nfa_endp->se_u.pos.lnum))
7156 	    reg_nextline();
7157 	else
7158 	    break;
7159 
7160 	// Allow interrupting with CTRL-C.
7161 	line_breakcheck();
7162 	if (got_int)
7163 	    break;
7164 #ifdef FEAT_RELTIME
7165 	// Check for timeout once in a twenty times to avoid overhead.
7166 	if (nfa_time_limit != NULL && ++nfa_time_count == 20)
7167 	{
7168 	    nfa_time_count = 0;
7169 	    if (nfa_did_time_out())
7170 		break;
7171 	}
7172 #endif
7173     }
7174 
7175 #ifdef ENABLE_LOG
7176     if (log_fd != stderr)
7177 	fclose(log_fd);
7178     log_fd = NULL;
7179 #endif
7180 
7181 theend:
7182     // Free memory
7183     vim_free(list[0].t);
7184     vim_free(list[1].t);
7185     vim_free(listids);
7186 #undef ADD_STATE_IF_MATCH
7187 #ifdef NFA_REGEXP_DEBUG_LOG
7188     fclose(debug);
7189 #endif
7190 
7191     return nfa_match;
7192 }
7193 
7194 /*
7195  * Try match of "prog" with at rex.line["col"].
7196  * Returns <= 0 for failure, number of lines contained in the match otherwise.
7197  */
7198     static long
nfa_regtry(nfa_regprog_T * prog,colnr_T col,proftime_T * tm UNUSED,int * timed_out UNUSED)7199 nfa_regtry(
7200     nfa_regprog_T   *prog,
7201     colnr_T	    col,
7202     proftime_T	    *tm UNUSED,	// timeout limit or NULL
7203     int		    *timed_out UNUSED)	// flag set on timeout or NULL
7204 {
7205     int		i;
7206     regsubs_T	subs, m;
7207     nfa_state_T	*start = prog->start;
7208     int		result;
7209 #ifdef ENABLE_LOG
7210     FILE	*f;
7211 #endif
7212 
7213     rex.input = rex.line + col;
7214 #ifdef FEAT_RELTIME
7215     nfa_time_limit = tm;
7216     nfa_timed_out = timed_out;
7217     nfa_time_count = 0;
7218 #endif
7219 
7220 #ifdef ENABLE_LOG
7221     f = fopen(NFA_REGEXP_RUN_LOG, "a");
7222     if (f != NULL)
7223     {
7224 	fprintf(f, "\n\n\t=======================================================\n");
7225 #ifdef DEBUG
7226 	fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
7227 #endif
7228 	fprintf(f, "\tInput text is \"%s\" \n", rex.input);
7229 	fprintf(f, "\t=======================================================\n\n");
7230 	nfa_print_state(f, start);
7231 	fprintf(f, "\n\n");
7232 	fclose(f);
7233     }
7234     else
7235 	emsg("Could not open temporary log file for writing");
7236 #endif
7237 
7238     clear_sub(&subs.norm);
7239     clear_sub(&m.norm);
7240 #ifdef FEAT_SYN_HL
7241     clear_sub(&subs.synt);
7242     clear_sub(&m.synt);
7243 #endif
7244 
7245     result = nfa_regmatch(prog, start, &subs, &m);
7246     if (result == FALSE)
7247 	return 0;
7248     else if (result == NFA_TOO_EXPENSIVE)
7249 	return result;
7250 
7251     cleanup_subexpr();
7252     if (REG_MULTI)
7253     {
7254 	for (i = 0; i < subs.norm.in_use; i++)
7255 	{
7256 	    rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
7257 	    rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;
7258 
7259 	    rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
7260 	    rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
7261 	}
7262 
7263 	if (rex.reg_startpos[0].lnum < 0)
7264 	{
7265 	    rex.reg_startpos[0].lnum = 0;
7266 	    rex.reg_startpos[0].col = col;
7267 	}
7268 	if (rex.reg_endpos[0].lnum < 0)
7269 	{
7270 	    // pattern has a \ze but it didn't match, use current end
7271 	    rex.reg_endpos[0].lnum = rex.lnum;
7272 	    rex.reg_endpos[0].col = (int)(rex.input - rex.line);
7273 	}
7274 	else
7275 	    // Use line number of "\ze".
7276 	    rex.lnum = rex.reg_endpos[0].lnum;
7277     }
7278     else
7279     {
7280 	for (i = 0; i < subs.norm.in_use; i++)
7281 	{
7282 	    rex.reg_startp[i] = subs.norm.list.line[i].start;
7283 	    rex.reg_endp[i] = subs.norm.list.line[i].end;
7284 	}
7285 
7286 	if (rex.reg_startp[0] == NULL)
7287 	    rex.reg_startp[0] = rex.line + col;
7288 	if (rex.reg_endp[0] == NULL)
7289 	    rex.reg_endp[0] = rex.input;
7290     }
7291 
7292 #ifdef FEAT_SYN_HL
7293     // Package any found \z(...\) matches for export. Default is none.
7294     unref_extmatch(re_extmatch_out);
7295     re_extmatch_out = NULL;
7296 
7297     if (prog->reghasz == REX_SET)
7298     {
7299 	cleanup_zsubexpr();
7300 	re_extmatch_out = make_extmatch();
7301 	if (re_extmatch_out == NULL)
7302 	    return 0;
7303 	// Loop over \z1, \z2, etc.  There is no \z0.
7304 	for (i = 1; i < subs.synt.in_use; i++)
7305 	{
7306 	    if (REG_MULTI)
7307 	    {
7308 		struct multipos *mpos = &subs.synt.list.multi[i];
7309 
7310 		// Only accept single line matches that are valid.
7311 		if (mpos->start_lnum >= 0
7312 			&& mpos->start_lnum == mpos->end_lnum
7313 			&& mpos->end_col >= mpos->start_col)
7314 		    re_extmatch_out->matches[i] =
7315 			vim_strnsave(reg_getline(mpos->start_lnum)
7316 							    + mpos->start_col,
7317 					     mpos->end_col - mpos->start_col);
7318 	    }
7319 	    else
7320 	    {
7321 		struct linepos *lpos = &subs.synt.list.line[i];
7322 
7323 		if (lpos->start != NULL && lpos->end != NULL)
7324 		    re_extmatch_out->matches[i] =
7325 			    vim_strnsave(lpos->start, lpos->end - lpos->start);
7326 	    }
7327 	}
7328     }
7329 #endif
7330 
7331     return 1 + rex.lnum;
7332 }
7333 
7334 /*
7335  * Match a regexp against a string ("line" points to the string) or multiple
7336  * lines (if "line" is NULL, use reg_getline()).
7337  *
7338  * Returns <= 0 for failure, number of lines contained in the match otherwise.
7339  */
7340     static long
nfa_regexec_both(char_u * line,colnr_T startcol,proftime_T * tm,int * timed_out)7341 nfa_regexec_both(
7342     char_u	*line,
7343     colnr_T	startcol,	// column to start looking for match
7344     proftime_T	*tm,		// timeout limit or NULL
7345     int		*timed_out)	// flag set on timeout or NULL
7346 {
7347     nfa_regprog_T   *prog;
7348     long	    retval = 0L;
7349     int		    i;
7350     colnr_T	    col = startcol;
7351 
7352     if (REG_MULTI)
7353     {
7354 	prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
7355 	line = reg_getline((linenr_T)0);    // relative to the cursor
7356 	rex.reg_startpos = rex.reg_mmatch->startpos;
7357 	rex.reg_endpos = rex.reg_mmatch->endpos;
7358     }
7359     else
7360     {
7361 	prog = (nfa_regprog_T *)rex.reg_match->regprog;
7362 	rex.reg_startp = rex.reg_match->startp;
7363 	rex.reg_endp = rex.reg_match->endp;
7364     }
7365 
7366     // Be paranoid...
7367     if (prog == NULL || line == NULL)
7368     {
7369 	iemsg(_(e_null_argument));
7370 	goto theend;
7371     }
7372 
7373     // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
7374     if (prog->regflags & RF_ICASE)
7375 	rex.reg_ic = TRUE;
7376     else if (prog->regflags & RF_NOICASE)
7377 	rex.reg_ic = FALSE;
7378 
7379     // If pattern contains "\Z" overrule value of rex.reg_icombine
7380     if (prog->regflags & RF_ICOMBINE)
7381 	rex.reg_icombine = TRUE;
7382 
7383     rex.line = line;
7384     rex.lnum = 0;    // relative to line
7385 
7386     rex.nfa_has_zend = prog->has_zend;
7387     rex.nfa_has_backref = prog->has_backref;
7388     rex.nfa_nsubexpr = prog->nsubexp;
7389     rex.nfa_listid = 1;
7390     rex.nfa_alt_listid = 2;
7391 #ifdef DEBUG
7392     nfa_regengine.expr = prog->pattern;
7393 #endif
7394 
7395     if (prog->reganch && col > 0)
7396 	return 0L;
7397 
7398     rex.need_clear_subexpr = TRUE;
7399 #ifdef FEAT_SYN_HL
7400     // Clear the external match subpointers if necessary.
7401     if (prog->reghasz == REX_SET)
7402     {
7403 	rex.nfa_has_zsubexpr = TRUE;
7404 	rex.need_clear_zsubexpr = TRUE;
7405     }
7406     else
7407     {
7408 	rex.nfa_has_zsubexpr = FALSE;
7409 	rex.need_clear_zsubexpr = FALSE;
7410     }
7411 #endif
7412 
7413     if (prog->regstart != NUL)
7414     {
7415 	// Skip ahead until a character we know the match must start with.
7416 	// When there is none there is no match.
7417 	if (skip_to_start(prog->regstart, &col) == FAIL)
7418 	    return 0L;
7419 
7420 	// If match_text is set it contains the full text that must match.
7421 	// Nothing else to try. Doesn't handle combining chars well.
7422 	if (prog->match_text != NULL && !rex.reg_icombine)
7423 	    return find_match_text(col, prog->regstart, prog->match_text);
7424     }
7425 
7426     // If the start column is past the maximum column: no need to try.
7427     if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
7428 	goto theend;
7429 
7430     // Set the "nstate" used by nfa_regcomp() to zero to trigger an error when
7431     // it's accidentally used during execution.
7432     nstate = 0;
7433     for (i = 0; i < prog->nstate; ++i)
7434     {
7435 	prog->state[i].id = i;
7436 	prog->state[i].lastlist[0] = 0;
7437 	prog->state[i].lastlist[1] = 0;
7438     }
7439 
7440     retval = nfa_regtry(prog, col, tm, timed_out);
7441 
7442 #ifdef DEBUG
7443     nfa_regengine.expr = NULL;
7444 #endif
7445 
7446 theend:
7447     if (retval > 0)
7448     {
7449 	// Make sure the end is never before the start.  Can happen when \zs and
7450 	// \ze are used.
7451 	if (REG_MULTI)
7452 	{
7453 	    lpos_T *start = &rex.reg_mmatch->startpos[0];
7454 	    lpos_T *end = &rex.reg_mmatch->endpos[0];
7455 
7456 	    if (end->lnum < start->lnum
7457 			|| (end->lnum == start->lnum && end->col < start->col))
7458 		rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
7459 	}
7460 	else
7461 	{
7462 	    if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
7463 		rex.reg_match->endp[0] = rex.reg_match->startp[0];
7464 	}
7465     }
7466 
7467     return retval;
7468 }
7469 
7470 /*
7471  * Compile a regular expression into internal code for the NFA matcher.
7472  * Returns the program in allocated space.  Returns NULL for an error.
7473  */
7474     static regprog_T *
nfa_regcomp(char_u * expr,int re_flags)7475 nfa_regcomp(char_u *expr, int re_flags)
7476 {
7477     nfa_regprog_T	*prog = NULL;
7478     size_t		prog_size;
7479     int			*postfix;
7480 
7481     if (expr == NULL)
7482 	return NULL;
7483 
7484 #ifdef DEBUG
7485     nfa_regengine.expr = expr;
7486 #endif
7487     nfa_re_flags = re_flags;
7488 
7489     init_class_tab();
7490 
7491     if (nfa_regcomp_start(expr, re_flags) == FAIL)
7492 	return NULL;
7493 
7494     // Build postfix form of the regexp. Needed to build the NFA
7495     // (and count its size).
7496     postfix = re2post();
7497     if (postfix == NULL)
7498 	goto fail;	    // Cascaded (syntax?) error
7499 
7500     /*
7501      * In order to build the NFA, we parse the input regexp twice:
7502      * 1. first pass to count size (so we can allocate space)
7503      * 2. second to emit code
7504      */
7505 #ifdef ENABLE_LOG
7506     {
7507 	FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
7508 
7509 	if (f != NULL)
7510 	{
7511 	    fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\"... hold on !\n", expr);
7512 	    fclose(f);
7513 	}
7514     }
7515 #endif
7516 
7517     /*
7518      * PASS 1
7519      * Count number of NFA states in "nstate". Do not build the NFA.
7520      */
7521     post2nfa(postfix, post_ptr, TRUE);
7522 
7523     // allocate the regprog with space for the compiled regexp
7524     prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * (nstate - 1);
7525     prog = alloc(prog_size);
7526     if (prog == NULL)
7527 	goto fail;
7528     state_ptr = prog->state;
7529     prog->re_in_use = FALSE;
7530 
7531     /*
7532      * PASS 2
7533      * Build the NFA
7534      */
7535     prog->start = post2nfa(postfix, post_ptr, FALSE);
7536     if (prog->start == NULL)
7537 	goto fail;
7538 
7539     prog->regflags = regflags;
7540     prog->engine = &nfa_regengine;
7541     prog->nstate = nstate;
7542     prog->has_zend = rex.nfa_has_zend;
7543     prog->has_backref = rex.nfa_has_backref;
7544     prog->nsubexp = regnpar;
7545 
7546     nfa_postprocess(prog);
7547 
7548     prog->reganch = nfa_get_reganch(prog->start, 0);
7549     prog->regstart = nfa_get_regstart(prog->start, 0);
7550     prog->match_text = nfa_get_match_text(prog->start);
7551 
7552 #ifdef ENABLE_LOG
7553     nfa_postfix_dump(expr, OK);
7554     nfa_dump(prog);
7555 #endif
7556 #ifdef FEAT_SYN_HL
7557     // Remember whether this pattern has any \z specials in it.
7558     prog->reghasz = re_has_z;
7559 #endif
7560     prog->pattern = vim_strsave(expr);
7561 #ifdef DEBUG
7562     nfa_regengine.expr = NULL;
7563 #endif
7564 
7565 out:
7566     VIM_CLEAR(post_start);
7567     post_ptr = post_end = NULL;
7568     state_ptr = NULL;
7569     return (regprog_T *)prog;
7570 
7571 fail:
7572     VIM_CLEAR(prog);
7573 #ifdef ENABLE_LOG
7574     nfa_postfix_dump(expr, FAIL);
7575 #endif
7576 #ifdef DEBUG
7577     nfa_regengine.expr = NULL;
7578 #endif
7579     goto out;
7580 }
7581 
7582 /*
7583  * Free a compiled regexp program, returned by nfa_regcomp().
7584  */
7585     static void
nfa_regfree(regprog_T * prog)7586 nfa_regfree(regprog_T *prog)
7587 {
7588     if (prog != NULL)
7589     {
7590 	vim_free(((nfa_regprog_T *)prog)->match_text);
7591 	vim_free(((nfa_regprog_T *)prog)->pattern);
7592 	vim_free(prog);
7593     }
7594 }
7595 
7596 /*
7597  * Match a regexp against a string.
7598  * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
7599  * Uses curbuf for line count and 'iskeyword'.
7600  * If "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
7601  *
7602  * Returns <= 0 for failure, number of lines contained in the match otherwise.
7603  */
7604     static int
nfa_regexec_nl(regmatch_T * rmp,char_u * line,colnr_T col,int line_lbr)7605 nfa_regexec_nl(
7606     regmatch_T	*rmp,
7607     char_u	*line,	// string to match against
7608     colnr_T	col,	// column to start looking for match
7609     int		line_lbr)
7610 {
7611     rex.reg_match = rmp;
7612     rex.reg_mmatch = NULL;
7613     rex.reg_maxline = 0;
7614     rex.reg_line_lbr = line_lbr;
7615     rex.reg_buf = curbuf;
7616     rex.reg_win = NULL;
7617     rex.reg_ic = rmp->rm_ic;
7618     rex.reg_icombine = FALSE;
7619     rex.reg_maxcol = 0;
7620     return nfa_regexec_both(line, col, NULL, NULL);
7621 }
7622 
7623 
7624 /*
7625  * Match a regexp against multiple lines.
7626  * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7627  * Uses curbuf for line count and 'iskeyword'.
7628  *
7629  * Return <= 0 if there is no match.  Return number of lines contained in the
7630  * match otherwise.
7631  *
7632  * Note: the body is the same as bt_regexec() except for nfa_regexec_both()
7633  *
7634  * ! Also NOTE : match may actually be in another line. e.g.:
7635  * when r.e. is \nc, cursor is at 'a' and the text buffer looks like
7636  *
7637  * +-------------------------+
7638  * |a                        |
7639  * |b                        |
7640  * |c                        |
7641  * |                         |
7642  * +-------------------------+
7643  *
7644  * then nfa_regexec_multi() returns 3. while the original
7645  * vim_regexec_multi() returns 0 and a second call at line 2 will return 2.
7646  *
7647  * FIXME if this behavior is not compatible.
7648  */
7649     static long
nfa_regexec_multi(regmmatch_T * rmp,win_T * win,buf_T * buf,linenr_T lnum,colnr_T col,proftime_T * tm,int * timed_out)7650 nfa_regexec_multi(
7651     regmmatch_T	*rmp,
7652     win_T	*win,		// window in which to search or NULL
7653     buf_T	*buf,		// buffer in which to search
7654     linenr_T	lnum,		// nr of line to start looking for match
7655     colnr_T	col,		// column to start looking for match
7656     proftime_T	*tm,		// timeout limit or NULL
7657     int		*timed_out)	// flag set on timeout or NULL
7658 {
7659     init_regexec_multi(rmp, win, buf, lnum);
7660     return nfa_regexec_both(NULL, col, tm, timed_out);
7661 }
7662 
7663 #ifdef DEBUG
7664 # undef ENABLE_LOG
7665 #endif
7666