1 /* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * NFA regular expression implementation.
4 *
5 * This file is included in "regexp.c".
6 */
7
8 /*
9 * Logging of NFA engine.
10 *
11 * The NFA engine can write four log files:
12 * - Error log: Contains NFA engine's fatal errors.
13 * - Dump log: Contains compiled NFA state machine's information.
14 * - Run log: Contains information of matching procedure.
15 * - Debug log: Contains detailed information of matching procedure. Can be
16 * disabled by undefining NFA_REGEXP_DEBUG_LOG.
17 * The first one can also be used without debug mode.
18 * The last three are enabled when compiled as debug mode and individually
19 * disabled by commenting them out.
20 * The log files can get quite big!
21 * Do disable all of this when compiling Vim for debugging, undefine DEBUG in
22 * regexp.c
23 */
24 #ifdef DEBUG
25 # define NFA_REGEXP_ERROR_LOG "nfa_regexp_error.log"
26 # define ENABLE_LOG
27 # define NFA_REGEXP_DUMP_LOG "nfa_regexp_dump.log"
28 # define NFA_REGEXP_RUN_LOG "nfa_regexp_run.log"
29 # define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log"
30 #endif
31
32 // Added to NFA_ANY - NFA_NUPPER_IC to include a NL.
33 #define NFA_ADD_NL 31
34
35 enum
36 {
37 NFA_SPLIT = -1024,
38 NFA_MATCH,
39 NFA_EMPTY, // matches 0-length
40
41 NFA_START_COLL, // [abc] start
42 NFA_END_COLL, // [abc] end
43 NFA_START_NEG_COLL, // [^abc] start
44 NFA_END_NEG_COLL, // [^abc] end (postfix only)
45 NFA_RANGE, // range of the two previous items
46 // (postfix only)
47 NFA_RANGE_MIN, // low end of a range
48 NFA_RANGE_MAX, // high end of a range
49
50 NFA_CONCAT, // concatenate two previous items (postfix
51 // only)
52 NFA_OR, // \| (postfix only)
53 NFA_STAR, // greedy * (postfix only)
54 NFA_STAR_NONGREEDY, // non-greedy * (postfix only)
55 NFA_QUEST, // greedy \? (postfix only)
56 NFA_QUEST_NONGREEDY, // non-greedy \? (postfix only)
57
58 NFA_BOL, // ^ Begin line
59 NFA_EOL, // $ End line
60 NFA_BOW, // \< Begin word
61 NFA_EOW, // \> End word
62 NFA_BOF, // \%^ Begin file
63 NFA_EOF, // \%$ End file
64 NFA_NEWL,
65 NFA_ZSTART, // Used for \zs
66 NFA_ZEND, // Used for \ze
67 NFA_NOPEN, // Start of subexpression marked with \%(
68 NFA_NCLOSE, // End of subexpr. marked with \%( ... \)
69 NFA_START_INVISIBLE,
70 NFA_START_INVISIBLE_FIRST,
71 NFA_START_INVISIBLE_NEG,
72 NFA_START_INVISIBLE_NEG_FIRST,
73 NFA_START_INVISIBLE_BEFORE,
74 NFA_START_INVISIBLE_BEFORE_FIRST,
75 NFA_START_INVISIBLE_BEFORE_NEG,
76 NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
77 NFA_START_PATTERN,
78 NFA_END_INVISIBLE,
79 NFA_END_INVISIBLE_NEG,
80 NFA_END_PATTERN,
81 NFA_COMPOSING, // Next nodes in NFA are part of the
82 // composing multibyte char
83 NFA_END_COMPOSING, // End of a composing char in the NFA
84 NFA_ANY_COMPOSING, // \%C: Any composing characters.
85 NFA_OPT_CHARS, // \%[abc]
86
87 // The following are used only in the postfix form, not in the NFA
88 NFA_PREV_ATOM_NO_WIDTH, // Used for \@=
89 NFA_PREV_ATOM_NO_WIDTH_NEG, // Used for \@!
90 NFA_PREV_ATOM_JUST_BEFORE, // Used for \@<=
91 NFA_PREV_ATOM_JUST_BEFORE_NEG, // Used for \@<!
92 NFA_PREV_ATOM_LIKE_PATTERN, // Used for \@>
93
94 NFA_BACKREF1, // \1
95 NFA_BACKREF2, // \2
96 NFA_BACKREF3, // \3
97 NFA_BACKREF4, // \4
98 NFA_BACKREF5, // \5
99 NFA_BACKREF6, // \6
100 NFA_BACKREF7, // \7
101 NFA_BACKREF8, // \8
102 NFA_BACKREF9, // \9
103 #ifdef FEAT_SYN_HL
104 NFA_ZREF1, // \z1
105 NFA_ZREF2, // \z2
106 NFA_ZREF3, // \z3
107 NFA_ZREF4, // \z4
108 NFA_ZREF5, // \z5
109 NFA_ZREF6, // \z6
110 NFA_ZREF7, // \z7
111 NFA_ZREF8, // \z8
112 NFA_ZREF9, // \z9
113 #endif
114 NFA_SKIP, // Skip characters
115
116 NFA_MOPEN,
117 NFA_MOPEN1,
118 NFA_MOPEN2,
119 NFA_MOPEN3,
120 NFA_MOPEN4,
121 NFA_MOPEN5,
122 NFA_MOPEN6,
123 NFA_MOPEN7,
124 NFA_MOPEN8,
125 NFA_MOPEN9,
126
127 NFA_MCLOSE,
128 NFA_MCLOSE1,
129 NFA_MCLOSE2,
130 NFA_MCLOSE3,
131 NFA_MCLOSE4,
132 NFA_MCLOSE5,
133 NFA_MCLOSE6,
134 NFA_MCLOSE7,
135 NFA_MCLOSE8,
136 NFA_MCLOSE9,
137
138 #ifdef FEAT_SYN_HL
139 NFA_ZOPEN,
140 NFA_ZOPEN1,
141 NFA_ZOPEN2,
142 NFA_ZOPEN3,
143 NFA_ZOPEN4,
144 NFA_ZOPEN5,
145 NFA_ZOPEN6,
146 NFA_ZOPEN7,
147 NFA_ZOPEN8,
148 NFA_ZOPEN9,
149
150 NFA_ZCLOSE,
151 NFA_ZCLOSE1,
152 NFA_ZCLOSE2,
153 NFA_ZCLOSE3,
154 NFA_ZCLOSE4,
155 NFA_ZCLOSE5,
156 NFA_ZCLOSE6,
157 NFA_ZCLOSE7,
158 NFA_ZCLOSE8,
159 NFA_ZCLOSE9,
160 #endif
161
162 // NFA_FIRST_NL
163 NFA_ANY, // Match any one character.
164 NFA_IDENT, // Match identifier char
165 NFA_SIDENT, // Match identifier char but no digit
166 NFA_KWORD, // Match keyword char
167 NFA_SKWORD, // Match word char but no digit
168 NFA_FNAME, // Match file name char
169 NFA_SFNAME, // Match file name char but no digit
170 NFA_PRINT, // Match printable char
171 NFA_SPRINT, // Match printable char but no digit
172 NFA_WHITE, // Match whitespace char
173 NFA_NWHITE, // Match non-whitespace char
174 NFA_DIGIT, // Match digit char
175 NFA_NDIGIT, // Match non-digit char
176 NFA_HEX, // Match hex char
177 NFA_NHEX, // Match non-hex char
178 NFA_OCTAL, // Match octal char
179 NFA_NOCTAL, // Match non-octal char
180 NFA_WORD, // Match word char
181 NFA_NWORD, // Match non-word char
182 NFA_HEAD, // Match head char
183 NFA_NHEAD, // Match non-head char
184 NFA_ALPHA, // Match alpha char
185 NFA_NALPHA, // Match non-alpha char
186 NFA_LOWER, // Match lowercase char
187 NFA_NLOWER, // Match non-lowercase char
188 NFA_UPPER, // Match uppercase char
189 NFA_NUPPER, // Match non-uppercase char
190 NFA_LOWER_IC, // Match [a-z]
191 NFA_NLOWER_IC, // Match [^a-z]
192 NFA_UPPER_IC, // Match [A-Z]
193 NFA_NUPPER_IC, // Match [^A-Z]
194
195 NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
196 NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,
197
198 NFA_CURSOR, // Match cursor pos
199 NFA_LNUM, // Match line number
200 NFA_LNUM_GT, // Match > line number
201 NFA_LNUM_LT, // Match < line number
202 NFA_COL, // Match cursor column
203 NFA_COL_GT, // Match > cursor column
204 NFA_COL_LT, // Match < cursor column
205 NFA_VCOL, // Match cursor virtual column
206 NFA_VCOL_GT, // Match > cursor virtual column
207 NFA_VCOL_LT, // Match < cursor virtual column
208 NFA_MARK, // Match mark
209 NFA_MARK_GT, // Match > mark
210 NFA_MARK_LT, // Match < mark
211 NFA_VISUAL, // Match Visual area
212
213 // Character classes [:alnum:] etc
214 NFA_CLASS_ALNUM,
215 NFA_CLASS_ALPHA,
216 NFA_CLASS_BLANK,
217 NFA_CLASS_CNTRL,
218 NFA_CLASS_DIGIT,
219 NFA_CLASS_GRAPH,
220 NFA_CLASS_LOWER,
221 NFA_CLASS_PRINT,
222 NFA_CLASS_PUNCT,
223 NFA_CLASS_SPACE,
224 NFA_CLASS_UPPER,
225 NFA_CLASS_XDIGIT,
226 NFA_CLASS_TAB,
227 NFA_CLASS_RETURN,
228 NFA_CLASS_BACKSPACE,
229 NFA_CLASS_ESCAPE,
230 NFA_CLASS_IDENT,
231 NFA_CLASS_KEYWORD,
232 NFA_CLASS_FNAME
233 };
234
235 // Keep in sync with classchars.
236 static int nfa_classcodes[] = {
237 NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
238 NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
239 NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
240 NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
241 NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
242 NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
243 NFA_UPPER, NFA_NUPPER
244 };
245
246 static char_u e_nul_found[] = N_("E865: (NFA) Regexp end encountered prematurely");
247 static char_u e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c");
248 static char_u e_ill_char_class[] = N_("E877: (NFA regexp) Invalid character class: %d");
249 static char_u e_value_too_large[] = N_("E951: \\% value too large");
250
251 // Variables only used in nfa_regcomp() and descendants.
252 static int nfa_re_flags; // re_flags passed to nfa_regcomp()
253 static int *post_start; // holds the postfix form of r.e.
254 static int *post_end;
255 static int *post_ptr;
256
257 // Set when the pattern should use the NFA engine.
258 // E.g. [[:upper:]] only allows 8bit characters for BT engine,
259 // while NFA engine handles multibyte characters correctly.
260 static int wants_nfa;
261
262 static int nstate; // Number of states in the NFA.
263 static int istate; // Index in the state vector, used in alloc_state()
264
265 // If not NULL match must end at this position
266 static save_se_T *nfa_endp = NULL;
267
268 // 0 for first call to nfa_regmatch(), 1 for recursive call.
269 static int nfa_ll_index = 0;
270
271 static int realloc_post_list(void);
272 static int nfa_reg(int paren);
273 #ifdef DEBUG
274 static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent);
275 #endif
276 static int match_follows(nfa_state_T *startstate, int depth);
277 static int failure_chance(nfa_state_T *state, int depth);
278
279 // helper functions used when doing re2post() ... regatom() parsing
280 #define EMIT(c) do { \
281 if (post_ptr >= post_end && realloc_post_list() == FAIL) \
282 return FAIL; \
283 *post_ptr++ = c; \
284 } while (0)
285
286 /*
287 * Initialize internal variables before NFA compilation.
288 * Return OK on success, FAIL otherwise.
289 */
290 static int
nfa_regcomp_start(char_u * expr,int re_flags)291 nfa_regcomp_start(
292 char_u *expr,
293 int re_flags) // see vim_regcomp()
294 {
295 size_t postfix_size;
296 int nstate_max;
297
298 nstate = 0;
299 istate = 0;
300 // A reasonable estimation for maximum size
301 nstate_max = (int)(STRLEN(expr) + 1) * 25;
302
303 // Some items blow up in size, such as [A-z]. Add more space for that.
304 // When it is still not enough realloc_post_list() will be used.
305 nstate_max += 1000;
306
307 // Size for postfix representation of expr.
308 postfix_size = sizeof(int) * nstate_max;
309
310 post_start = alloc(postfix_size);
311 if (post_start == NULL)
312 return FAIL;
313 post_ptr = post_start;
314 post_end = post_start + nstate_max;
315 wants_nfa = FALSE;
316 rex.nfa_has_zend = FALSE;
317 rex.nfa_has_backref = FALSE;
318
319 // shared with BT engine
320 regcomp_start(expr, re_flags);
321
322 return OK;
323 }
324
325 /*
326 * Figure out if the NFA state list starts with an anchor, must match at start
327 * of the line.
328 */
329 static int
nfa_get_reganch(nfa_state_T * start,int depth)330 nfa_get_reganch(nfa_state_T *start, int depth)
331 {
332 nfa_state_T *p = start;
333
334 if (depth > 4)
335 return 0;
336
337 while (p != NULL)
338 {
339 switch (p->c)
340 {
341 case NFA_BOL:
342 case NFA_BOF:
343 return 1; // yes!
344
345 case NFA_ZSTART:
346 case NFA_ZEND:
347 case NFA_CURSOR:
348 case NFA_VISUAL:
349
350 case NFA_MOPEN:
351 case NFA_MOPEN1:
352 case NFA_MOPEN2:
353 case NFA_MOPEN3:
354 case NFA_MOPEN4:
355 case NFA_MOPEN5:
356 case NFA_MOPEN6:
357 case NFA_MOPEN7:
358 case NFA_MOPEN8:
359 case NFA_MOPEN9:
360 case NFA_NOPEN:
361 #ifdef FEAT_SYN_HL
362 case NFA_ZOPEN:
363 case NFA_ZOPEN1:
364 case NFA_ZOPEN2:
365 case NFA_ZOPEN3:
366 case NFA_ZOPEN4:
367 case NFA_ZOPEN5:
368 case NFA_ZOPEN6:
369 case NFA_ZOPEN7:
370 case NFA_ZOPEN8:
371 case NFA_ZOPEN9:
372 #endif
373 p = p->out;
374 break;
375
376 case NFA_SPLIT:
377 return nfa_get_reganch(p->out, depth + 1)
378 && nfa_get_reganch(p->out1, depth + 1);
379
380 default:
381 return 0; // noooo
382 }
383 }
384 return 0;
385 }
386
387 /*
388 * Figure out if the NFA state list starts with a character which must match
389 * at start of the match.
390 */
391 static int
nfa_get_regstart(nfa_state_T * start,int depth)392 nfa_get_regstart(nfa_state_T *start, int depth)
393 {
394 nfa_state_T *p = start;
395
396 if (depth > 4)
397 return 0;
398
399 while (p != NULL)
400 {
401 switch (p->c)
402 {
403 // all kinds of zero-width matches
404 case NFA_BOL:
405 case NFA_BOF:
406 case NFA_BOW:
407 case NFA_EOW:
408 case NFA_ZSTART:
409 case NFA_ZEND:
410 case NFA_CURSOR:
411 case NFA_VISUAL:
412 case NFA_LNUM:
413 case NFA_LNUM_GT:
414 case NFA_LNUM_LT:
415 case NFA_COL:
416 case NFA_COL_GT:
417 case NFA_COL_LT:
418 case NFA_VCOL:
419 case NFA_VCOL_GT:
420 case NFA_VCOL_LT:
421 case NFA_MARK:
422 case NFA_MARK_GT:
423 case NFA_MARK_LT:
424
425 case NFA_MOPEN:
426 case NFA_MOPEN1:
427 case NFA_MOPEN2:
428 case NFA_MOPEN3:
429 case NFA_MOPEN4:
430 case NFA_MOPEN5:
431 case NFA_MOPEN6:
432 case NFA_MOPEN7:
433 case NFA_MOPEN8:
434 case NFA_MOPEN9:
435 case NFA_NOPEN:
436 #ifdef FEAT_SYN_HL
437 case NFA_ZOPEN:
438 case NFA_ZOPEN1:
439 case NFA_ZOPEN2:
440 case NFA_ZOPEN3:
441 case NFA_ZOPEN4:
442 case NFA_ZOPEN5:
443 case NFA_ZOPEN6:
444 case NFA_ZOPEN7:
445 case NFA_ZOPEN8:
446 case NFA_ZOPEN9:
447 #endif
448 p = p->out;
449 break;
450
451 case NFA_SPLIT:
452 {
453 int c1 = nfa_get_regstart(p->out, depth + 1);
454 int c2 = nfa_get_regstart(p->out1, depth + 1);
455
456 if (c1 == c2)
457 return c1; // yes!
458 return 0;
459 }
460
461 default:
462 if (p->c > 0)
463 return p->c; // yes!
464 return 0;
465 }
466 }
467 return 0;
468 }
469
470 /*
471 * Figure out if the NFA state list contains just literal text and nothing
472 * else. If so return a string in allocated memory with what must match after
473 * regstart. Otherwise return NULL.
474 */
475 static char_u *
nfa_get_match_text(nfa_state_T * start)476 nfa_get_match_text(nfa_state_T *start)
477 {
478 nfa_state_T *p = start;
479 int len = 0;
480 char_u *ret;
481 char_u *s;
482
483 if (p->c != NFA_MOPEN)
484 return NULL; // just in case
485 p = p->out;
486 while (p->c > 0)
487 {
488 len += MB_CHAR2LEN(p->c);
489 p = p->out;
490 }
491 if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH)
492 return NULL;
493
494 ret = alloc(len);
495 if (ret != NULL)
496 {
497 p = start->out->out; // skip first char, it goes into regstart
498 s = ret;
499 while (p->c > 0)
500 {
501 if (has_mbyte)
502 s += (*mb_char2bytes)(p->c, s);
503 else
504 *s++ = p->c;
505 p = p->out;
506 }
507 *s = NUL;
508 }
509 return ret;
510 }
511
512 /*
513 * Allocate more space for post_start. Called when
514 * running above the estimated number of states.
515 */
516 static int
realloc_post_list(void)517 realloc_post_list(void)
518 {
519 int nstate_max = (int)(post_end - post_start);
520 int new_max;
521 int *new_start;
522 int *old_start;
523
524 // For weird patterns the number of states can be very high. Increasing by
525 // 50% seems a reasonable compromise between memory use and speed.
526 new_max = nstate_max * 3 / 2;
527 new_start = ALLOC_MULT(int, new_max);
528 if (new_start == NULL)
529 return FAIL;
530 mch_memmove(new_start, post_start, nstate_max * sizeof(int));
531 old_start = post_start;
532 post_start = new_start;
533 post_ptr = new_start + (post_ptr - old_start);
534 post_end = post_start + new_max;
535 vim_free(old_start);
536 return OK;
537 }
538
539 /*
540 * Search between "start" and "end" and try to recognize a
541 * character class in expanded form. For example [0-9].
542 * On success, return the id the character class to be emitted.
543 * On failure, return 0 (=FAIL)
544 * Start points to the first char of the range, while end should point
545 * to the closing brace.
546 * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
547 * need to be interpreted as [a-zA-Z].
548 */
549 static int
nfa_recognize_char_class(char_u * start,char_u * end,int extra_newl)550 nfa_recognize_char_class(char_u *start, char_u *end, int extra_newl)
551 {
552 # define CLASS_not 0x80
553 # define CLASS_af 0x40
554 # define CLASS_AF 0x20
555 # define CLASS_az 0x10
556 # define CLASS_AZ 0x08
557 # define CLASS_o7 0x04
558 # define CLASS_o9 0x02
559 # define CLASS_underscore 0x01
560
561 int newl = FALSE;
562 char_u *p;
563 int config = 0;
564
565 if (extra_newl == TRUE)
566 newl = TRUE;
567
568 if (*end != ']')
569 return FAIL;
570 p = start;
571 if (*p == '^')
572 {
573 config |= CLASS_not;
574 p++;
575 }
576
577 while (p < end)
578 {
579 if (p + 2 < end && *(p + 1) == '-')
580 {
581 switch (*p)
582 {
583 case '0':
584 if (*(p + 2) == '9')
585 {
586 config |= CLASS_o9;
587 break;
588 }
589 if (*(p + 2) == '7')
590 {
591 config |= CLASS_o7;
592 break;
593 }
594 return FAIL;
595
596 case 'a':
597 if (*(p + 2) == 'z')
598 {
599 config |= CLASS_az;
600 break;
601 }
602 if (*(p + 2) == 'f')
603 {
604 config |= CLASS_af;
605 break;
606 }
607 return FAIL;
608
609 case 'A':
610 if (*(p + 2) == 'Z')
611 {
612 config |= CLASS_AZ;
613 break;
614 }
615 if (*(p + 2) == 'F')
616 {
617 config |= CLASS_AF;
618 break;
619 }
620 return FAIL;
621
622 default:
623 return FAIL;
624 }
625 p += 3;
626 }
627 else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n')
628 {
629 newl = TRUE;
630 p += 2;
631 }
632 else if (*p == '_')
633 {
634 config |= CLASS_underscore;
635 p ++;
636 }
637 else if (*p == '\n')
638 {
639 newl = TRUE;
640 p ++;
641 }
642 else
643 return FAIL;
644 } // while (p < end)
645
646 if (p != end)
647 return FAIL;
648
649 if (newl == TRUE)
650 extra_newl = NFA_ADD_NL;
651
652 switch (config)
653 {
654 case CLASS_o9:
655 return extra_newl + NFA_DIGIT;
656 case CLASS_not | CLASS_o9:
657 return extra_newl + NFA_NDIGIT;
658 case CLASS_af | CLASS_AF | CLASS_o9:
659 return extra_newl + NFA_HEX;
660 case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9:
661 return extra_newl + NFA_NHEX;
662 case CLASS_o7:
663 return extra_newl + NFA_OCTAL;
664 case CLASS_not | CLASS_o7:
665 return extra_newl + NFA_NOCTAL;
666 case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
667 return extra_newl + NFA_WORD;
668 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
669 return extra_newl + NFA_NWORD;
670 case CLASS_az | CLASS_AZ | CLASS_underscore:
671 return extra_newl + NFA_HEAD;
672 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore:
673 return extra_newl + NFA_NHEAD;
674 case CLASS_az | CLASS_AZ:
675 return extra_newl + NFA_ALPHA;
676 case CLASS_not | CLASS_az | CLASS_AZ:
677 return extra_newl + NFA_NALPHA;
678 case CLASS_az:
679 return extra_newl + NFA_LOWER_IC;
680 case CLASS_not | CLASS_az:
681 return extra_newl + NFA_NLOWER_IC;
682 case CLASS_AZ:
683 return extra_newl + NFA_UPPER_IC;
684 case CLASS_not | CLASS_AZ:
685 return extra_newl + NFA_NUPPER_IC;
686 }
687 return FAIL;
688 }
689
690 /*
691 * Produce the bytes for equivalence class "c".
692 * Currently only handles latin1, latin9 and utf-8.
693 * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
694 * equivalent to 'a OR b OR c'
695 *
696 * NOTE! When changing this function, also update reg_equi_class()
697 */
698 static int
nfa_emit_equi_class(int c)699 nfa_emit_equi_class(int c)
700 {
701 #define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT);
702
703 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
704 || STRCMP(p_enc, "iso-8859-15") == 0)
705 {
706 #ifdef EBCDIC
707 # define A_circumflex 0x62
708 # define A_diaeresis 0x63
709 # define A_grave 0x64
710 # define A_acute 0x65
711 # define A_virguilla 0x66
712 # define A_ring 0x67
713 # define C_cedilla 0x68
714 # define E_acute 0x71
715 # define E_circumflex 0x72
716 # define E_diaeresis 0x73
717 # define E_grave 0x74
718 # define I_acute 0x75
719 # define I_circumflex 0x76
720 # define I_diaeresis 0x77
721 # define I_grave 0x78
722 # define N_virguilla 0x69
723 # define O_circumflex 0xeb
724 # define O_diaeresis 0xec
725 # define O_grave 0xed
726 # define O_acute 0xee
727 # define O_virguilla 0xef
728 # define O_slash 0x80
729 # define U_circumflex 0xfb
730 # define U_diaeresis 0xfc
731 # define U_grave 0xfd
732 # define U_acute 0xfe
733 # define Y_acute 0xba
734 # define a_grave 0x42
735 # define a_acute 0x43
736 # define a_circumflex 0x44
737 # define a_virguilla 0x45
738 # define a_diaeresis 0x46
739 # define a_ring 0x47
740 # define c_cedilla 0x48
741 # define e_grave 0x51
742 # define e_acute 0x52
743 # define e_circumflex 0x53
744 # define e_diaeresis 0x54
745 # define i_grave 0x55
746 # define i_acute 0x56
747 # define i_circumflex 0x57
748 # define i_diaeresis 0x58
749 # define n_virguilla 0x49
750 # define o_grave 0xcb
751 # define o_acute 0xcc
752 # define o_circumflex 0xcd
753 # define o_virguilla 0xce
754 # define o_diaeresis 0xcf
755 # define o_slash 0x70
756 # define u_grave 0xdb
757 # define u_acute 0xdc
758 # define u_circumflex 0xdd
759 # define u_diaeresis 0xde
760 # define y_acute 0x8d
761 # define y_diaeresis 0xdf
762 #else
763 # define A_grave 0xc0
764 # define A_acute 0xc1
765 # define A_circumflex 0xc2
766 # define A_virguilla 0xc3
767 # define A_diaeresis 0xc4
768 # define A_ring 0xc5
769 # define C_cedilla 0xc7
770 # define E_grave 0xc8
771 # define E_acute 0xc9
772 # define E_circumflex 0xca
773 # define E_diaeresis 0xcb
774 # define I_grave 0xcc
775 # define I_acute 0xcd
776 # define I_circumflex 0xce
777 # define I_diaeresis 0xcf
778 # define N_virguilla 0xd1
779 # define O_grave 0xd2
780 # define O_acute 0xd3
781 # define O_circumflex 0xd4
782 # define O_virguilla 0xd5
783 # define O_diaeresis 0xd6
784 # define O_slash 0xd8
785 # define U_grave 0xd9
786 # define U_acute 0xda
787 # define U_circumflex 0xdb
788 # define U_diaeresis 0xdc
789 # define Y_acute 0xdd
790 # define a_grave 0xe0
791 # define a_acute 0xe1
792 # define a_circumflex 0xe2
793 # define a_virguilla 0xe3
794 # define a_diaeresis 0xe4
795 # define a_ring 0xe5
796 # define c_cedilla 0xe7
797 # define e_grave 0xe8
798 # define e_acute 0xe9
799 # define e_circumflex 0xea
800 # define e_diaeresis 0xeb
801 # define i_grave 0xec
802 # define i_acute 0xed
803 # define i_circumflex 0xee
804 # define i_diaeresis 0xef
805 # define n_virguilla 0xf1
806 # define o_grave 0xf2
807 # define o_acute 0xf3
808 # define o_circumflex 0xf4
809 # define o_virguilla 0xf5
810 # define o_diaeresis 0xf6
811 # define o_slash 0xf8
812 # define u_grave 0xf9
813 # define u_acute 0xfa
814 # define u_circumflex 0xfb
815 # define u_diaeresis 0xfc
816 # define y_acute 0xfd
817 # define y_diaeresis 0xff
818 #endif
819 switch (c)
820 {
821 case 'A': case A_grave: case A_acute: case A_circumflex:
822 case A_virguilla: case A_diaeresis: case A_ring:
823 case 0x100: case 0x102: case 0x104: case 0x1cd:
824 case 0x1de: case 0x1e0: case 0x1fa: case 0x200:
825 case 0x202: case 0x226: case 0x23a: case 0x1e00:
826 case 0x1ea0: case 0x1ea2: case 0x1ea4: case 0x1ea6:
827 case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
828 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
829 EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
830 EMIT2(A_circumflex) EMIT2(A_virguilla)
831 EMIT2(A_diaeresis) EMIT2(A_ring)
832 EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
833 EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
834 EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
835 EMIT2(0x226) EMIT2(0x23a) EMIT2(0x1e00)
836 EMIT2(0x1ea0) EMIT2(0x1ea2) EMIT2(0x1ea4)
837 EMIT2(0x1ea6) EMIT2(0x1ea8) EMIT2(0x1eaa)
838 EMIT2(0x1eac) EMIT2(0x1eae) EMIT2(0x1eb0)
839 EMIT2(0x1eb2) EMIT2(0x1eb6) EMIT2(0x1eb4)
840 return OK;
841
842 case 'B': case 0x181: case 0x243: case 0x1e02:
843 case 0x1e04: case 0x1e06:
844 EMIT2('B')
845 EMIT2(0x181) EMIT2(0x243) EMIT2(0x1e02)
846 EMIT2(0x1e04) EMIT2(0x1e06)
847 return OK;
848
849 case 'C': case C_cedilla: case 0x106: case 0x108:
850 case 0x10a: case 0x10c: case 0x187: case 0x23b:
851 case 0x1e08: case 0xa792:
852 EMIT2('C') EMIT2(C_cedilla)
853 EMIT2(0x106) EMIT2(0x108) EMIT2(0x10a)
854 EMIT2(0x10c) EMIT2(0x187) EMIT2(0x23b)
855 EMIT2(0x1e08) EMIT2(0xa792)
856 return OK;
857
858 case 'D': case 0x10e: case 0x110: case 0x18a:
859 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
860 case 0x1e12:
861 EMIT2('D') EMIT2(0x10e) EMIT2(0x110) EMIT2(0x18a)
862 EMIT2(0x1e0a) EMIT2(0x1e0c) EMIT2(0x1e0e)
863 EMIT2(0x1e10) EMIT2(0x1e12)
864 return OK;
865
866 case 'E': case E_grave: case E_acute: case E_circumflex:
867 case E_diaeresis: case 0x112: case 0x114: case 0x116:
868 case 0x118: case 0x11a: case 0x204: case 0x206:
869 case 0x228: case 0x246: case 0x1e14: case 0x1e16:
870 case 0x1e18: case 0x1e1a: case 0x1e1c: case 0x1eb8:
871 case 0x1eba: case 0x1ebc: case 0x1ebe: case 0x1ec0:
872 case 0x1ec2: case 0x1ec4: case 0x1ec6:
873 EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
874 EMIT2(E_circumflex) EMIT2(E_diaeresis)
875 EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
876 EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
877 EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
878 EMIT2(0x1e14) EMIT2(0x1e16) EMIT2(0x1e18)
879 EMIT2(0x1e1a) EMIT2(0x1e1c) EMIT2(0x1eb8)
880 EMIT2(0x1eba) EMIT2(0x1ebc) EMIT2(0x1ebe)
881 EMIT2(0x1ec0) EMIT2(0x1ec2) EMIT2(0x1ec4)
882 EMIT2(0x1ec6)
883 return OK;
884
885 case 'F': case 0x191: case 0x1e1e: case 0xa798:
886 EMIT2('F') EMIT2(0x191) EMIT2(0x1e1e) EMIT2(0xa798)
887 return OK;
888
889 case 'G': case 0x11c: case 0x11e: case 0x120:
890 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
891 case 0x1f4: case 0x1e20: case 0xa7a0:
892 EMIT2('G') EMIT2(0x11c) EMIT2(0x11e) EMIT2(0x120)
893 EMIT2(0x122) EMIT2(0x193) EMIT2(0x1e4)
894 EMIT2(0x1e6) EMIT2(0x1f4) EMIT2(0x1e20)
895 EMIT2(0xa7a0)
896 return OK;
897
898 case 'H': case 0x124: case 0x126: case 0x21e:
899 case 0x1e22: case 0x1e24: case 0x1e26: case 0x1e28:
900 case 0x1e2a: case 0x2c67:
901 EMIT2('H') EMIT2(0x124) EMIT2(0x126) EMIT2(0x21e)
902 EMIT2(0x1e22) EMIT2(0x1e24) EMIT2(0x1e26)
903 EMIT2(0x1e28) EMIT2(0x1e2a) EMIT2(0x2c67)
904 return OK;
905
906 case 'I': case I_grave: case I_acute: case I_circumflex:
907 case I_diaeresis: case 0x128: case 0x12a: case 0x12c:
908 case 0x12e: case 0x130: case 0x197: case 0x1cf:
909 case 0x208: case 0x20a: case 0x1e2c: case 0x1e2e:
910 case 0x1ec8: case 0x1eca:
911 EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
912 EMIT2(I_circumflex) EMIT2(I_diaeresis)
913 EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
914 EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
915 EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
916 EMIT2(0x1e2c) EMIT2(0x1e2e) EMIT2(0x1ec8)
917 EMIT2(0x1eca)
918 return OK;
919
920 case 'J': case 0x134: case 0x248:
921 EMIT2('J') EMIT2(0x134) EMIT2(0x248)
922 return OK;
923
924 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
925 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
926 EMIT2('K') EMIT2(0x136) EMIT2(0x198) EMIT2(0x1e8)
927 EMIT2(0x1e30) EMIT2(0x1e32) EMIT2(0x1e34)
928 EMIT2(0x2c69) EMIT2(0xa740)
929 return OK;
930
931 case 'L': case 0x139: case 0x13b: case 0x13d:
932 case 0x13f: case 0x141: case 0x23d: case 0x1e36:
933 case 0x1e38: case 0x1e3a: case 0x1e3c: case 0x2c60:
934 EMIT2('L') EMIT2(0x139) EMIT2(0x13b)
935 EMIT2(0x13d) EMIT2(0x13f) EMIT2(0x141)
936 EMIT2(0x23d) EMIT2(0x1e36) EMIT2(0x1e38)
937 EMIT2(0x1e3a) EMIT2(0x1e3c) EMIT2(0x2c60)
938 return OK;
939
940 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
941 EMIT2('M') EMIT2(0x1e3e) EMIT2(0x1e40)
942 EMIT2(0x1e42)
943 return OK;
944
945 case 'N': case N_virguilla:
946 case 0x143: case 0x145: case 0x147: case 0x1f8:
947 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
948 case 0xa7a4:
949 EMIT2('N') EMIT2(N_virguilla)
950 EMIT2(0x143) EMIT2(0x145) EMIT2(0x147)
951 EMIT2(0x1f8) EMIT2(0x1e44) EMIT2(0x1e46)
952 EMIT2(0x1e48) EMIT2(0x1e4a) EMIT2(0xa7a4)
953 return OK;
954
955 case 'O': case O_grave: case O_acute: case O_circumflex:
956 case O_virguilla: case O_diaeresis: case O_slash:
957 case 0x14c: case 0x14e: case 0x150: case 0x19f:
958 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec:
959 case 0x1fe: case 0x20c: case 0x20e: case 0x22a:
960 case 0x22c: case 0x22e: case 0x230: case 0x1e4c:
961 case 0x1e4e: case 0x1e50: case 0x1e52: case 0x1ecc:
962 case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
963 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc:
964 case 0x1ede: case 0x1ee0: case 0x1ee2:
965 EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
966 EMIT2(O_circumflex) EMIT2(O_virguilla)
967 EMIT2(O_diaeresis) EMIT2(O_slash)
968 EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
969 EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
970 EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
971 EMIT2(0x20c) EMIT2(0x20e) EMIT2(0x22a)
972 EMIT2(0x22c) EMIT2(0x22e) EMIT2(0x230)
973 EMIT2(0x1e4c) EMIT2(0x1e4e) EMIT2(0x1e50)
974 EMIT2(0x1e52) EMIT2(0x1ecc) EMIT2(0x1ece)
975 EMIT2(0x1ed0) EMIT2(0x1ed2) EMIT2(0x1ed4)
976 EMIT2(0x1ed6) EMIT2(0x1ed8) EMIT2(0x1eda)
977 EMIT2(0x1edc) EMIT2(0x1ede) EMIT2(0x1ee0)
978 EMIT2(0x1ee2)
979 return OK;
980
981 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
982 EMIT2('P') EMIT2(0x1a4) EMIT2(0x1e54) EMIT2(0x1e56)
983 EMIT2(0x2c63)
984 return OK;
985
986 case 'Q': case 0x24a:
987 EMIT2('Q') EMIT2(0x24a)
988 return OK;
989
990 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
991 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
992 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
993 EMIT2('R') EMIT2(0x154) EMIT2(0x156) EMIT2(0x158)
994 EMIT2(0x210) EMIT2(0x212) EMIT2(0x24c) EMIT2(0x1e58)
995 EMIT2(0x1e5a) EMIT2(0x1e5c) EMIT2(0x1e5e) EMIT2(0x2c64)
996 EMIT2(0xa7a6)
997 return OK;
998
999 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
1000 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
1001 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
1002 EMIT2('S') EMIT2(0x15a) EMIT2(0x15c) EMIT2(0x15e)
1003 EMIT2(0x160) EMIT2(0x218) EMIT2(0x1e60) EMIT2(0x1e62)
1004 EMIT2(0x1e64) EMIT2(0x1e66) EMIT2(0x1e68) EMIT2(0x2c7e)
1005 EMIT2(0xa7a8)
1006 return OK;
1007
1008 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
1009 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
1010 case 0x1e6e: case 0x1e70:
1011 EMIT2('T') EMIT2(0x162) EMIT2(0x164) EMIT2(0x166)
1012 EMIT2(0x1ac) EMIT2(0x1ae) EMIT2(0x23e) EMIT2(0x21a)
1013 EMIT2(0x1e6a) EMIT2(0x1e6c) EMIT2(0x1e6e) EMIT2(0x1e70)
1014 return OK;
1015
1016 case 'U': case U_grave: case U_acute: case U_diaeresis:
1017 case U_circumflex: case 0x168: case 0x16a: case 0x16c:
1018 case 0x16e: case 0x170: case 0x172: case 0x1af:
1019 case 0x1d3: case 0x1d5: case 0x1d7: case 0x1d9:
1020 case 0x1db: case 0x214: case 0x216: case 0x244:
1021 case 0x1e72: case 0x1e74: case 0x1e76: case 0x1e78:
1022 case 0x1e7a: case 0x1ee4: case 0x1ee6: case 0x1ee8:
1023 case 0x1eea: case 0x1eec: case 0x1eee: case 0x1ef0:
1024 EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
1025 EMIT2(U_diaeresis) EMIT2(U_circumflex)
1026 EMIT2(0x168) EMIT2(0x16a)
1027 EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
1028 EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
1029 EMIT2(0x1d5) EMIT2(0x1d7) EMIT2(0x1d9)
1030 EMIT2(0x1db) EMIT2(0x214) EMIT2(0x216)
1031 EMIT2(0x244) EMIT2(0x1e72) EMIT2(0x1e74)
1032 EMIT2(0x1e76) EMIT2(0x1e78) EMIT2(0x1e7a)
1033 EMIT2(0x1ee4) EMIT2(0x1ee6) EMIT2(0x1ee8)
1034 EMIT2(0x1eea) EMIT2(0x1eec) EMIT2(0x1eee)
1035 EMIT2(0x1ef0)
1036 return OK;
1037
1038 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
1039 EMIT2('V') EMIT2(0x1b2) EMIT2(0x1e7c) EMIT2(0x1e7e)
1040 return OK;
1041
1042 case 'W': case 0x174: case 0x1e80: case 0x1e82: case 0x1e84:
1043 case 0x1e86: case 0x1e88:
1044 EMIT2('W') EMIT2(0x174) EMIT2(0x1e80) EMIT2(0x1e82)
1045 EMIT2(0x1e84) EMIT2(0x1e86) EMIT2(0x1e88)
1046 return OK;
1047
1048 case 'X': case 0x1e8a: case 0x1e8c:
1049 EMIT2('X') EMIT2(0x1e8a) EMIT2(0x1e8c)
1050 return OK;
1051
1052 case 'Y': case Y_acute: case 0x176: case 0x178:
1053 case 0x1b3: case 0x232: case 0x24e: case 0x1e8e:
1054 case 0x1ef2: case 0x1ef4: case 0x1ef6: case 0x1ef8:
1055 EMIT2('Y') EMIT2(Y_acute)
1056 EMIT2(0x176) EMIT2(0x178) EMIT2(0x1b3)
1057 EMIT2(0x232) EMIT2(0x24e) EMIT2(0x1e8e)
1058 EMIT2(0x1ef2) EMIT2(0x1ef4) EMIT2(0x1ef6)
1059 EMIT2(0x1ef8)
1060 return OK;
1061
1062 case 'Z': case 0x179: case 0x17b: case 0x17d:
1063 case 0x1b5: case 0x1e90: case 0x1e92: case 0x1e94:
1064 case 0x2c6b:
1065 EMIT2('Z') EMIT2(0x179) EMIT2(0x17b) EMIT2(0x17d)
1066 EMIT2(0x1b5) EMIT2(0x1e90) EMIT2(0x1e92)
1067 EMIT2(0x1e94) EMIT2(0x2c6b)
1068 return OK;
1069
1070 case 'a': case a_grave: case a_acute: case a_circumflex:
1071 case a_virguilla: case a_diaeresis: case a_ring:
1072 case 0x101: case 0x103: case 0x105: case 0x1ce:
1073 case 0x1df: case 0x1e1: case 0x1fb: case 0x201:
1074 case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
1075 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
1076 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
1077 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
1078 case 0x1eb7: case 0x2c65:
1079 EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)
1080 EMIT2(a_circumflex) EMIT2(a_virguilla)
1081 EMIT2(a_diaeresis) EMIT2(a_ring)
1082 EMIT2(0x101) EMIT2(0x103) EMIT2(0x105)
1083 EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1)
1084 EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203)
1085 EMIT2(0x227) EMIT2(0x1d8f) EMIT2(0x1e01)
1086 EMIT2(0x1e9a) EMIT2(0x1ea1) EMIT2(0x1ea3)
1087 EMIT2(0x1ea5) EMIT2(0x1ea7) EMIT2(0x1ea9)
1088 EMIT2(0x1eab) EMIT2(0x1ead) EMIT2(0x1eaf)
1089 EMIT2(0x1eb1) EMIT2(0x1eb3) EMIT2(0x1eb5)
1090 EMIT2(0x1eb7) EMIT2(0x2c65)
1091 return OK;
1092
1093 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
1094 case 0x1e03: case 0x1e05: case 0x1e07:
1095 EMIT2('b') EMIT2(0x180) EMIT2(0x253) EMIT2(0x1d6c)
1096 EMIT2(0x1d80) EMIT2(0x1e03) EMIT2(0x1e05) EMIT2(0x1e07)
1097 return OK;
1098
1099 case 'c': case c_cedilla: case 0x107: case 0x109: case 0x10b:
1100 case 0x10d: case 0x188: case 0x23c: case 0x1e09: case 0xa793:
1101 case 0xa794:
1102 EMIT2('c') EMIT2(c_cedilla)
1103 EMIT2(0x107) EMIT2(0x109) EMIT2(0x10b)
1104 EMIT2(0x10d) EMIT2(0x188) EMIT2(0x23c)
1105 EMIT2(0x1e09) EMIT2(0xa793) EMIT2(0xa794)
1106 return OK;
1107
1108 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
1109 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d: case 0x1e0f:
1110 case 0x1e11: case 0x1e13:
1111 EMIT2('d') EMIT2(0x10f) EMIT2(0x111)
1112 EMIT2(0x257) EMIT2(0x1d6d) EMIT2(0x1d81)
1113 EMIT2(0x1d91) EMIT2(0x1e0b) EMIT2(0x1e0d)
1114 EMIT2(0x1e0f) EMIT2(0x1e11) EMIT2(0x1e13)
1115 return OK;
1116
1117 case 'e': case e_grave: case e_acute: case e_circumflex:
1118 case e_diaeresis: case 0x113: case 0x115: case 0x117:
1119 case 0x119: case 0x11b: case 0x205: case 0x207:
1120 case 0x229: case 0x247: case 0x1d92: case 0x1e15:
1121 case 0x1e17: case 0x1e19: case 0x1e1b: case 0x1e1d:
1122 case 0x1eb9: case 0x1ebb: case 0x1ebd: case 0x1ebf:
1123 case 0x1ec1: case 0x1ec3: case 0x1ec5: case 0x1ec7:
1124 EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)
1125 EMIT2(e_circumflex) EMIT2(e_diaeresis)
1126 EMIT2(0x113) EMIT2(0x115)
1127 EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b)
1128 EMIT2(0x205) EMIT2(0x207) EMIT2(0x229)
1129 EMIT2(0x247) EMIT2(0x1d92) EMIT2(0x1e15)
1130 EMIT2(0x1e17) EMIT2(0x1e19) EMIT2(0x1e1b)
1131 EMIT2(0x1e1d) EMIT2(0x1eb9) EMIT2(0x1ebb)
1132 EMIT2(0x1ebd) EMIT2(0x1ebf) EMIT2(0x1ec1)
1133 EMIT2(0x1ec3) EMIT2(0x1ec5) EMIT2(0x1ec7)
1134 return OK;
1135
1136 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
1137 case 0x1e1f: case 0xa799:
1138 EMIT2('f') EMIT2(0x192) EMIT2(0x1d6e) EMIT2(0x1d82)
1139 EMIT2(0x1e1f) EMIT2(0xa799)
1140 return OK;
1141
1142 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
1143 case 0x1e5: case 0x1e7: case 0x1f5: case 0x260: case 0x1d83:
1144 case 0x1e21: case 0xa7a1:
1145 EMIT2('g') EMIT2(0x11d) EMIT2(0x11f) EMIT2(0x121)
1146 EMIT2(0x123) EMIT2(0x1e5) EMIT2(0x1e7)
1147 EMIT2(0x1f5) EMIT2(0x260) EMIT2(0x1d83)
1148 EMIT2(0x1e21) EMIT2(0xa7a1)
1149 return OK;
1150
1151 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
1152 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
1153 case 0x1e96: case 0x2c68: case 0xa795:
1154 EMIT2('h') EMIT2(0x125) EMIT2(0x127) EMIT2(0x21f)
1155 EMIT2(0x1e23) EMIT2(0x1e25) EMIT2(0x1e27)
1156 EMIT2(0x1e29) EMIT2(0x1e2b) EMIT2(0x1e96)
1157 EMIT2(0x2c68) EMIT2(0xa795)
1158 return OK;
1159
1160 case 'i': case i_grave: case i_acute: case i_circumflex:
1161 case i_diaeresis: case 0x129: case 0x12b: case 0x12d:
1162 case 0x12f: case 0x1d0: case 0x209: case 0x20b:
1163 case 0x268: case 0x1d96: case 0x1e2d: case 0x1e2f:
1164 case 0x1ec9: case 0x1ecb:
1165 EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)
1166 EMIT2(i_circumflex) EMIT2(i_diaeresis)
1167 EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d)
1168 EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209)
1169 EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96)
1170 EMIT2(0x1e2d) EMIT2(0x1e2f) EMIT2(0x1ec9)
1171 EMIT2(0x1ecb) EMIT2(0x1ecb)
1172 return OK;
1173
1174 case 'j': case 0x135: case 0x1f0: case 0x249:
1175 EMIT2('j') EMIT2(0x135) EMIT2(0x1f0) EMIT2(0x249)
1176 return OK;
1177
1178 case 'k': case 0x137: case 0x199: case 0x1e9: case 0x1d84:
1179 case 0x1e31: case 0x1e33: case 0x1e35: case 0x2c6a: case 0xa741:
1180 EMIT2('k') EMIT2(0x137) EMIT2(0x199) EMIT2(0x1e9)
1181 EMIT2(0x1d84) EMIT2(0x1e31) EMIT2(0x1e33)
1182 EMIT2(0x1e35) EMIT2(0x2c6a) EMIT2(0xa741)
1183 return OK;
1184
1185 case 'l': case 0x13a: case 0x13c: case 0x13e: case 0x140:
1186 case 0x142: case 0x19a: case 0x1e37: case 0x1e39: case 0x1e3b:
1187 case 0x1e3d: case 0x2c61:
1188 EMIT2('l') EMIT2(0x13a) EMIT2(0x13c)
1189 EMIT2(0x13e) EMIT2(0x140) EMIT2(0x142)
1190 EMIT2(0x19a) EMIT2(0x1e37) EMIT2(0x1e39)
1191 EMIT2(0x1e3b) EMIT2(0x1e3d) EMIT2(0x2c61)
1192 return OK;
1193
1194 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
1195 EMIT2('m') EMIT2(0x1d6f) EMIT2(0x1e3f)
1196 EMIT2(0x1e41) EMIT2(0x1e43)
1197 return OK;
1198
1199 case 'n': case n_virguilla: case 0x144: case 0x146: case 0x148:
1200 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87: case 0x1e45:
1201 case 0x1e47: case 0x1e49: case 0x1e4b: case 0xa7a5:
1202 EMIT2('n') EMIT2(n_virguilla)
1203 EMIT2(0x144) EMIT2(0x146) EMIT2(0x148)
1204 EMIT2(0x149) EMIT2(0x1f9) EMIT2(0x1d70)
1205 EMIT2(0x1d87) EMIT2(0x1e45) EMIT2(0x1e47)
1206 EMIT2(0x1e49) EMIT2(0x1e4b) EMIT2(0xa7a5)
1207 return OK;
1208
1209 case 'o': case o_grave: case o_acute: case o_circumflex:
1210 case o_virguilla: case o_diaeresis: case o_slash:
1211 case 0x14d: case 0x14f: case 0x151: case 0x1a1:
1212 case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
1213 case 0x20d: case 0x20f: case 0x22b: case 0x22d:
1214 case 0x22f: case 0x231: case 0x275: case 0x1e4d:
1215 case 0x1e4f: case 0x1e51: case 0x1e53: case 0x1ecd:
1216 case 0x1ecf: case 0x1ed1: case 0x1ed3: case 0x1ed5:
1217 case 0x1ed7: case 0x1ed9: case 0x1edb: case 0x1edd:
1218 case 0x1edf: case 0x1ee1: case 0x1ee3:
1219 EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)
1220 EMIT2(o_circumflex) EMIT2(o_virguilla)
1221 EMIT2(o_diaeresis) EMIT2(o_slash)
1222 EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151)
1223 EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb)
1224 EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d)
1225 EMIT2(0x20f) EMIT2(0x22b) EMIT2(0x22d)
1226 EMIT2(0x22f) EMIT2(0x231) EMIT2(0x275)
1227 EMIT2(0x1e4d) EMIT2(0x1e4f) EMIT2(0x1e51)
1228 EMIT2(0x1e53) EMIT2(0x1ecd) EMIT2(0x1ecf)
1229 EMIT2(0x1ed1) EMIT2(0x1ed3) EMIT2(0x1ed5)
1230 EMIT2(0x1ed7) EMIT2(0x1ed9) EMIT2(0x1edb)
1231 EMIT2(0x1edd) EMIT2(0x1edf) EMIT2(0x1ee1)
1232 EMIT2(0x1ee3)
1233 return OK;
1234
1235 case 'p': case 0x1a5: case 0x1d71: case 0x1d7d: case 0x1d88:
1236 case 0x1e55: case 0x1e57:
1237 EMIT2('p') EMIT2(0x1a5) EMIT2(0x1d71) EMIT2(0x1d7d)
1238 EMIT2(0x1d88) EMIT2(0x1e55) EMIT2(0x1e57)
1239 return OK;
1240
1241 case 'q': case 0x24b: case 0x2a0:
1242 EMIT2('q') EMIT2(0x24b) EMIT2(0x2a0)
1243 return OK;
1244
1245 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
1246 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
1247 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
1248 case 0xa7a7:
1249 EMIT2('r') EMIT2(0x155) EMIT2(0x157) EMIT2(0x159)
1250 EMIT2(0x211) EMIT2(0x213) EMIT2(0x24d) EMIT2(0x27d)
1251 EMIT2(0x1d72) EMIT2(0x1d73) EMIT2(0x1d89) EMIT2(0x1e59)
1252 EMIT2(0x1e5b) EMIT2(0x1e5d) EMIT2(0x1e5f) EMIT2(0xa7a7)
1253 return OK;
1254
1255 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
1256 case 0x219: case 0x23f: case 0x1d74: case 0x1d8a: case 0x1e61:
1257 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
1258 EMIT2('s') EMIT2(0x15b) EMIT2(0x15d) EMIT2(0x15f)
1259 EMIT2(0x161) EMIT2(0x219) EMIT2(0x23f) EMIT2(0x1d74)
1260 EMIT2(0x1d8a) EMIT2(0x1e61) EMIT2(0x1e63) EMIT2(0x1e65)
1261 EMIT2(0x1e67) EMIT2(0x1e69) EMIT2(0xa7a9)
1262 return OK;
1263
1264 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
1265 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
1266 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
1267 EMIT2('t') EMIT2(0x163) EMIT2(0x165) EMIT2(0x167)
1268 EMIT2(0x1ab) EMIT2(0x1ad) EMIT2(0x21b) EMIT2(0x288)
1269 EMIT2(0x1d75) EMIT2(0x1e6b) EMIT2(0x1e6d) EMIT2(0x1e6f)
1270 EMIT2(0x1e71) EMIT2(0x1e97) EMIT2(0x2c66)
1271 return OK;
1272
1273 case 'u': case u_grave: case u_acute: case u_circumflex:
1274 case u_diaeresis: case 0x169: case 0x16b: case 0x16d:
1275 case 0x16f: case 0x171: case 0x173: case 0x1b0: case 0x1d4:
1276 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc: case 0x215:
1277 case 0x217: case 0x289: case 0x1d7e: case 0x1d99: case 0x1e73:
1278 case 0x1e75: case 0x1e77: case 0x1e79: case 0x1e7b:
1279 case 0x1ee5: case 0x1ee7: case 0x1ee9: case 0x1eeb:
1280 case 0x1eed: case 0x1eef: case 0x1ef1:
1281 EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)
1282 EMIT2(u_circumflex) EMIT2(u_diaeresis)
1283 EMIT2(0x169) EMIT2(0x16b)
1284 EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171)
1285 EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8)
1286 EMIT2(0x215) EMIT2(0x217) EMIT2(0x1b0)
1287 EMIT2(0x1d4) EMIT2(0x1da) EMIT2(0x1dc)
1288 EMIT2(0x289) EMIT2(0x1e73) EMIT2(0x1d7e)
1289 EMIT2(0x1d99) EMIT2(0x1e75) EMIT2(0x1e77)
1290 EMIT2(0x1e79) EMIT2(0x1e7b) EMIT2(0x1ee5)
1291 EMIT2(0x1ee7) EMIT2(0x1ee9) EMIT2(0x1eeb)
1292 EMIT2(0x1eed) EMIT2(0x1eef) EMIT2(0x1ef1)
1293 return OK;
1294
1295 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
1296 EMIT2('v') EMIT2(0x28b) EMIT2(0x1d8c) EMIT2(0x1e7d)
1297 EMIT2(0x1e7f)
1298 return OK;
1299
1300 case 'w': case 0x175: case 0x1e81: case 0x1e83: case 0x1e85:
1301 case 0x1e87: case 0x1e89: case 0x1e98:
1302 EMIT2('w') EMIT2(0x175) EMIT2(0x1e81) EMIT2(0x1e83)
1303 EMIT2(0x1e85) EMIT2(0x1e87) EMIT2(0x1e89) EMIT2(0x1e98)
1304 return OK;
1305
1306 case 'x': case 0x1e8b: case 0x1e8d:
1307 EMIT2('x') EMIT2(0x1e8b) EMIT2(0x1e8d)
1308 return OK;
1309
1310 case 'y': case y_acute: case y_diaeresis: case 0x177:
1311 case 0x1b4: case 0x233: case 0x24f: case 0x1e8f:
1312 case 0x1e99: case 0x1ef3: case 0x1ef5: case 0x1ef7:
1313 case 0x1ef9:
1314 EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)
1315 EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f)
1316 EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3)
1317 EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9)
1318 return OK;
1319
1320 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
1321 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
1322 case 0x1e95: case 0x2c6c:
1323 EMIT2('z') EMIT2(0x17a) EMIT2(0x17c) EMIT2(0x17e)
1324 EMIT2(0x1b6) EMIT2(0x1d76) EMIT2(0x1d8e) EMIT2(0x1e91)
1325 EMIT2(0x1e93) EMIT2(0x1e95) EMIT2(0x2c6c)
1326 return OK;
1327
1328 // default: character itself
1329 }
1330 }
1331
1332 EMIT2(c);
1333 return OK;
1334 #undef EMIT2
1335 #undef EMIT2
1336 }
1337
1338 /*
1339 * Code to parse regular expression.
1340 *
1341 * We try to reuse parsing functions in regexp.c to
1342 * minimize surprise and keep the syntax consistent.
1343 */
1344
1345 /*
1346 * Parse the lowest level.
1347 *
1348 * An atom can be one of a long list of items. Many atoms match one character
1349 * in the text. It is often an ordinary character or a character class.
1350 * Braces can be used to make a pattern into an atom. The "\z(\)" construct
1351 * is only for syntax highlighting.
1352 *
1353 * atom ::= ordinary-atom
1354 * or \( pattern \)
1355 * or \%( pattern \)
1356 * or \z( pattern \)
1357 */
1358 static int
nfa_regatom(void)1359 nfa_regatom(void)
1360 {
1361 int c;
1362 int charclass;
1363 int equiclass;
1364 int collclass;
1365 int got_coll_char;
1366 char_u *p;
1367 char_u *endp;
1368 char_u *old_regparse = regparse;
1369 int extra = 0;
1370 int emit_range;
1371 int negated;
1372 int result;
1373 int startc = -1;
1374 int save_prev_at_start = prev_at_start;
1375
1376 c = getchr();
1377 switch (c)
1378 {
1379 case NUL:
1380 EMSG_RET_FAIL(_(e_nul_found));
1381
1382 case Magic('^'):
1383 EMIT(NFA_BOL);
1384 break;
1385
1386 case Magic('$'):
1387 EMIT(NFA_EOL);
1388 #if defined(FEAT_SYN_HL) || defined(PROTO)
1389 had_eol = TRUE;
1390 #endif
1391 break;
1392
1393 case Magic('<'):
1394 EMIT(NFA_BOW);
1395 break;
1396
1397 case Magic('>'):
1398 EMIT(NFA_EOW);
1399 break;
1400
1401 case Magic('_'):
1402 c = no_Magic(getchr());
1403 if (c == NUL)
1404 EMSG_RET_FAIL(_(e_nul_found));
1405
1406 if (c == '^') // "\_^" is start-of-line
1407 {
1408 EMIT(NFA_BOL);
1409 break;
1410 }
1411 if (c == '$') // "\_$" is end-of-line
1412 {
1413 EMIT(NFA_EOL);
1414 #if defined(FEAT_SYN_HL) || defined(PROTO)
1415 had_eol = TRUE;
1416 #endif
1417 break;
1418 }
1419
1420 extra = NFA_ADD_NL;
1421
1422 // "\_[" is collection plus newline
1423 if (c == '[')
1424 goto collection;
1425
1426 // "\_x" is character class plus newline
1427 // FALLTHROUGH
1428
1429 /*
1430 * Character classes.
1431 */
1432 case Magic('.'):
1433 case Magic('i'):
1434 case Magic('I'):
1435 case Magic('k'):
1436 case Magic('K'):
1437 case Magic('f'):
1438 case Magic('F'):
1439 case Magic('p'):
1440 case Magic('P'):
1441 case Magic('s'):
1442 case Magic('S'):
1443 case Magic('d'):
1444 case Magic('D'):
1445 case Magic('x'):
1446 case Magic('X'):
1447 case Magic('o'):
1448 case Magic('O'):
1449 case Magic('w'):
1450 case Magic('W'):
1451 case Magic('h'):
1452 case Magic('H'):
1453 case Magic('a'):
1454 case Magic('A'):
1455 case Magic('l'):
1456 case Magic('L'):
1457 case Magic('u'):
1458 case Magic('U'):
1459 p = vim_strchr(classchars, no_Magic(c));
1460 if (p == NULL)
1461 {
1462 if (extra == NFA_ADD_NL)
1463 {
1464 semsg(_(e_ill_char_class), c);
1465 rc_did_emsg = TRUE;
1466 return FAIL;
1467 }
1468 siemsg("INTERNAL: Unknown character class char: %d", c);
1469 return FAIL;
1470 }
1471
1472 // When '.' is followed by a composing char ignore the dot, so that
1473 // the composing char is matched here.
1474 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1475 {
1476 old_regparse = regparse;
1477 c = getchr();
1478 goto nfa_do_multibyte;
1479 }
1480 EMIT(nfa_classcodes[p - classchars]);
1481 if (extra == NFA_ADD_NL)
1482 {
1483 EMIT(NFA_NEWL);
1484 EMIT(NFA_OR);
1485 regflags |= RF_HASNL;
1486 }
1487 break;
1488
1489 case Magic('n'):
1490 if (reg_string)
1491 // In a string "\n" matches a newline character.
1492 EMIT(NL);
1493 else
1494 {
1495 // In buffer text "\n" matches the end of a line.
1496 EMIT(NFA_NEWL);
1497 regflags |= RF_HASNL;
1498 }
1499 break;
1500
1501 case Magic('('):
1502 if (nfa_reg(REG_PAREN) == FAIL)
1503 return FAIL; // cascaded error
1504 break;
1505
1506 case Magic('|'):
1507 case Magic('&'):
1508 case Magic(')'):
1509 semsg(_(e_misplaced), no_Magic(c));
1510 return FAIL;
1511
1512 case Magic('='):
1513 case Magic('?'):
1514 case Magic('+'):
1515 case Magic('@'):
1516 case Magic('*'):
1517 case Magic('{'):
1518 // these should follow an atom, not form an atom
1519 semsg(_(e_misplaced), no_Magic(c));
1520 return FAIL;
1521
1522 case Magic('~'):
1523 {
1524 char_u *lp;
1525
1526 // Previous substitute pattern.
1527 // Generated as "\%(pattern\)".
1528 if (reg_prev_sub == NULL)
1529 {
1530 emsg(_(e_no_previous_substitute_regular_expression));
1531 return FAIL;
1532 }
1533 for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp))
1534 {
1535 EMIT(PTR2CHAR(lp));
1536 if (lp != reg_prev_sub)
1537 EMIT(NFA_CONCAT);
1538 }
1539 EMIT(NFA_NOPEN);
1540 break;
1541 }
1542
1543 case Magic('1'):
1544 case Magic('2'):
1545 case Magic('3'):
1546 case Magic('4'):
1547 case Magic('5'):
1548 case Magic('6'):
1549 case Magic('7'):
1550 case Magic('8'):
1551 case Magic('9'):
1552 {
1553 int refnum = no_Magic(c) - '1';
1554
1555 if (!seen_endbrace(refnum + 1))
1556 return FAIL;
1557 EMIT(NFA_BACKREF1 + refnum);
1558 rex.nfa_has_backref = TRUE;
1559 }
1560 break;
1561
1562 case Magic('z'):
1563 c = no_Magic(getchr());
1564 switch (c)
1565 {
1566 case 's':
1567 EMIT(NFA_ZSTART);
1568 if (re_mult_next("\\zs") == FAIL)
1569 return FAIL;
1570 break;
1571 case 'e':
1572 EMIT(NFA_ZEND);
1573 rex.nfa_has_zend = TRUE;
1574 if (re_mult_next("\\ze") == FAIL)
1575 return FAIL;
1576 break;
1577 #ifdef FEAT_SYN_HL
1578 case '1':
1579 case '2':
1580 case '3':
1581 case '4':
1582 case '5':
1583 case '6':
1584 case '7':
1585 case '8':
1586 case '9':
1587 // \z1...\z9
1588 if ((reg_do_extmatch & REX_USE) == 0)
1589 EMSG_RET_FAIL(_(e_z1_not_allowed));
1590 EMIT(NFA_ZREF1 + (no_Magic(c) - '1'));
1591 // No need to set rex.nfa_has_backref, the sub-matches don't
1592 // change when \z1 .. \z9 matches or not.
1593 re_has_z = REX_USE;
1594 break;
1595 case '(':
1596 // \z(
1597 if ((reg_do_extmatch & REX_SET) == 0)
1598 EMSG_RET_FAIL(_(e_z_not_allowed));
1599 if (nfa_reg(REG_ZPAREN) == FAIL)
1600 return FAIL; // cascaded error
1601 re_has_z = REX_SET;
1602 break;
1603 #endif
1604 default:
1605 semsg(_("E867: (NFA) Unknown operator '\\z%c'"),
1606 no_Magic(c));
1607 return FAIL;
1608 }
1609 break;
1610
1611 case Magic('%'):
1612 c = no_Magic(getchr());
1613 switch (c)
1614 {
1615 // () without a back reference
1616 case '(':
1617 if (nfa_reg(REG_NPAREN) == FAIL)
1618 return FAIL;
1619 EMIT(NFA_NOPEN);
1620 break;
1621
1622 case 'd': // %d123 decimal
1623 case 'o': // %o123 octal
1624 case 'x': // %xab hex 2
1625 case 'u': // %uabcd hex 4
1626 case 'U': // %U1234abcd hex 8
1627 {
1628 long nr;
1629
1630 switch (c)
1631 {
1632 case 'd': nr = getdecchrs(); break;
1633 case 'o': nr = getoctchrs(); break;
1634 case 'x': nr = gethexchrs(2); break;
1635 case 'u': nr = gethexchrs(4); break;
1636 case 'U': nr = gethexchrs(8); break;
1637 default: nr = -1; break;
1638 }
1639
1640 if (nr < 0 || nr > INT_MAX)
1641 EMSG2_RET_FAIL(
1642 _("E678: Invalid character after %s%%[dxouU]"),
1643 reg_magic == MAGIC_ALL);
1644 // A NUL is stored in the text as NL
1645 // TODO: what if a composing character follows?
1646 EMIT(nr == 0 ? 0x0a : nr);
1647 }
1648 break;
1649
1650 // Catch \%^ and \%$ regardless of where they appear in the
1651 // pattern -- regardless of whether or not it makes sense.
1652 case '^':
1653 EMIT(NFA_BOF);
1654 break;
1655
1656 case '$':
1657 EMIT(NFA_EOF);
1658 break;
1659
1660 case '#':
1661 EMIT(NFA_CURSOR);
1662 break;
1663
1664 case 'V':
1665 EMIT(NFA_VISUAL);
1666 break;
1667
1668 case 'C':
1669 EMIT(NFA_ANY_COMPOSING);
1670 break;
1671
1672 case '[':
1673 {
1674 int n;
1675
1676 // \%[abc]
1677 for (n = 0; (c = peekchr()) != ']'; ++n)
1678 {
1679 if (c == NUL)
1680 EMSG2_RET_FAIL(_(e_missing_sb),
1681 reg_magic == MAGIC_ALL);
1682 // recursive call!
1683 if (nfa_regatom() == FAIL)
1684 return FAIL;
1685 }
1686 getchr(); // get the ]
1687 if (n == 0)
1688 EMSG2_RET_FAIL(_(e_empty_sb),
1689 reg_magic == MAGIC_ALL);
1690 EMIT(NFA_OPT_CHARS);
1691 EMIT(n);
1692
1693 // Emit as "\%(\%[abc]\)" to be able to handle
1694 // "\%[abc]*" which would cause the empty string to be
1695 // matched an unlimited number of times. NFA_NOPEN is
1696 // added only once at a position, while NFA_SPLIT is
1697 // added multiple times. This is more efficient than
1698 // not allowing NFA_SPLIT multiple times, it is used
1699 // a lot.
1700 EMIT(NFA_NOPEN);
1701 break;
1702 }
1703
1704 default:
1705 {
1706 long_u n = 0;
1707 int cmp = c;
1708 int cur = FALSE;
1709
1710 if (c == '<' || c == '>')
1711 c = getchr();
1712 if (no_Magic(c) == '.')
1713 {
1714 cur = TRUE;
1715 c = getchr();
1716 }
1717 while (VIM_ISDIGIT(c))
1718 {
1719 long_u tmp;
1720
1721 if (cur)
1722 semsg(_(e_regexp_number_after_dot_pos_search),
1723 no_Magic(c));
1724 tmp = n * 10 + (c - '0');
1725
1726 if (tmp < n)
1727 {
1728 // overflow.
1729 emsg(_(e_value_too_large));
1730 return FAIL;
1731 }
1732 n = tmp;
1733 c = getchr();
1734 }
1735 if (c == 'l' || c == 'c' || c == 'v')
1736 {
1737 long_u limit = INT_MAX;
1738
1739 if (c == 'l')
1740 {
1741 if (cur)
1742 n = curwin->w_cursor.lnum;
1743 // \%{n}l \%{n}<l \%{n}>l
1744 EMIT(cmp == '<' ? NFA_LNUM_LT :
1745 cmp == '>' ? NFA_LNUM_GT : NFA_LNUM);
1746 if (save_prev_at_start)
1747 at_start = TRUE;
1748 }
1749 else if (c == 'c')
1750 {
1751 if (cur)
1752 {
1753 n = curwin->w_cursor.col;
1754 n++;
1755 }
1756 // \%{n}c \%{n}<c \%{n}>c
1757 EMIT(cmp == '<' ? NFA_COL_LT :
1758 cmp == '>' ? NFA_COL_GT : NFA_COL);
1759 }
1760 else
1761 {
1762 if (cur)
1763 {
1764 colnr_T vcol = 0;
1765
1766 getvvcol(curwin, &curwin->w_cursor,
1767 NULL, NULL, &vcol);
1768 n = ++vcol;
1769 }
1770 // \%{n}v \%{n}<v \%{n}>v
1771 EMIT(cmp == '<' ? NFA_VCOL_LT :
1772 cmp == '>' ? NFA_VCOL_GT : NFA_VCOL);
1773 limit = INT_MAX / MB_MAXBYTES;
1774 }
1775 if (n >= limit)
1776 {
1777 emsg(_(e_value_too_large));
1778 return FAIL;
1779 }
1780 EMIT((int)n);
1781 break;
1782 }
1783 else if (c == '\'' && n == 0)
1784 {
1785 // \%'m \%<'m \%>'m
1786 EMIT(cmp == '<' ? NFA_MARK_LT :
1787 cmp == '>' ? NFA_MARK_GT : NFA_MARK);
1788 EMIT(getchr());
1789 break;
1790 }
1791 }
1792 semsg(_("E867: (NFA) Unknown operator '\\%%%c'"),
1793 no_Magic(c));
1794 return FAIL;
1795 }
1796 break;
1797
1798 case Magic('['):
1799 collection:
1800 /*
1801 * [abc] uses NFA_START_COLL - NFA_END_COLL
1802 * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
1803 * Each character is produced as a regular state, using
1804 * NFA_CONCAT to bind them together.
1805 * Besides normal characters there can be:
1806 * - character classes NFA_CLASS_*
1807 * - ranges, two characters followed by NFA_RANGE.
1808 */
1809
1810 p = regparse;
1811 endp = skip_anyof(p);
1812 if (*endp == ']')
1813 {
1814 /*
1815 * Try to reverse engineer character classes. For example,
1816 * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
1817 * and perform the necessary substitutions in the NFA.
1818 */
1819 result = nfa_recognize_char_class(regparse, endp,
1820 extra == NFA_ADD_NL);
1821 if (result != FAIL)
1822 {
1823 if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL)
1824 {
1825 EMIT(result - NFA_ADD_NL);
1826 EMIT(NFA_NEWL);
1827 EMIT(NFA_OR);
1828 }
1829 else
1830 EMIT(result);
1831 regparse = endp;
1832 MB_PTR_ADV(regparse);
1833 return OK;
1834 }
1835 /*
1836 * Failed to recognize a character class. Use the simple
1837 * version that turns [abc] into 'a' OR 'b' OR 'c'
1838 */
1839 startc = -1;
1840 negated = FALSE;
1841 if (*regparse == '^') // negated range
1842 {
1843 negated = TRUE;
1844 MB_PTR_ADV(regparse);
1845 EMIT(NFA_START_NEG_COLL);
1846 }
1847 else
1848 EMIT(NFA_START_COLL);
1849 if (*regparse == '-')
1850 {
1851 startc = '-';
1852 EMIT(startc);
1853 EMIT(NFA_CONCAT);
1854 MB_PTR_ADV(regparse);
1855 }
1856 // Emit the OR branches for each character in the []
1857 emit_range = FALSE;
1858 while (regparse < endp)
1859 {
1860 int oldstartc = startc;
1861
1862 startc = -1;
1863 got_coll_char = FALSE;
1864 if (*regparse == '[')
1865 {
1866 // Check for [: :], [= =], [. .]
1867 equiclass = collclass = 0;
1868 charclass = get_char_class(®parse);
1869 if (charclass == CLASS_NONE)
1870 {
1871 equiclass = get_equi_class(®parse);
1872 if (equiclass == 0)
1873 collclass = get_coll_element(®parse);
1874 }
1875
1876 // Character class like [:alpha:]
1877 if (charclass != CLASS_NONE)
1878 {
1879 switch (charclass)
1880 {
1881 case CLASS_ALNUM:
1882 EMIT(NFA_CLASS_ALNUM);
1883 break;
1884 case CLASS_ALPHA:
1885 EMIT(NFA_CLASS_ALPHA);
1886 break;
1887 case CLASS_BLANK:
1888 EMIT(NFA_CLASS_BLANK);
1889 break;
1890 case CLASS_CNTRL:
1891 EMIT(NFA_CLASS_CNTRL);
1892 break;
1893 case CLASS_DIGIT:
1894 EMIT(NFA_CLASS_DIGIT);
1895 break;
1896 case CLASS_GRAPH:
1897 EMIT(NFA_CLASS_GRAPH);
1898 break;
1899 case CLASS_LOWER:
1900 wants_nfa = TRUE;
1901 EMIT(NFA_CLASS_LOWER);
1902 break;
1903 case CLASS_PRINT:
1904 EMIT(NFA_CLASS_PRINT);
1905 break;
1906 case CLASS_PUNCT:
1907 EMIT(NFA_CLASS_PUNCT);
1908 break;
1909 case CLASS_SPACE:
1910 EMIT(NFA_CLASS_SPACE);
1911 break;
1912 case CLASS_UPPER:
1913 wants_nfa = TRUE;
1914 EMIT(NFA_CLASS_UPPER);
1915 break;
1916 case CLASS_XDIGIT:
1917 EMIT(NFA_CLASS_XDIGIT);
1918 break;
1919 case CLASS_TAB:
1920 EMIT(NFA_CLASS_TAB);
1921 break;
1922 case CLASS_RETURN:
1923 EMIT(NFA_CLASS_RETURN);
1924 break;
1925 case CLASS_BACKSPACE:
1926 EMIT(NFA_CLASS_BACKSPACE);
1927 break;
1928 case CLASS_ESCAPE:
1929 EMIT(NFA_CLASS_ESCAPE);
1930 break;
1931 case CLASS_IDENT:
1932 EMIT(NFA_CLASS_IDENT);
1933 break;
1934 case CLASS_KEYWORD:
1935 EMIT(NFA_CLASS_KEYWORD);
1936 break;
1937 case CLASS_FNAME:
1938 EMIT(NFA_CLASS_FNAME);
1939 break;
1940 }
1941 EMIT(NFA_CONCAT);
1942 continue;
1943 }
1944 // Try equivalence class [=a=] and the like
1945 if (equiclass != 0)
1946 {
1947 result = nfa_emit_equi_class(equiclass);
1948 if (result == FAIL)
1949 {
1950 // should never happen
1951 EMSG_RET_FAIL(_("E868: Error building NFA with equivalence class!"));
1952 }
1953 continue;
1954 }
1955 // Try collating class like [. .]
1956 if (collclass != 0)
1957 {
1958 startc = collclass; // allow [.a.]-x as a range
1959 // Will emit the proper atom at the end of the
1960 // while loop.
1961 }
1962 }
1963 // Try a range like 'a-x' or '\t-z'. Also allows '-' as a
1964 // start character.
1965 if (*regparse == '-' && oldstartc != -1)
1966 {
1967 emit_range = TRUE;
1968 startc = oldstartc;
1969 MB_PTR_ADV(regparse);
1970 continue; // reading the end of the range
1971 }
1972
1973 // Now handle simple and escaped characters.
1974 // Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1975 // accepts "\t", "\e", etc., but only when the 'l' flag in
1976 // 'cpoptions' is not included.
1977 // Posix doesn't recognize backslash at all.
1978 if (*regparse == '\\'
1979 && !reg_cpo_bsl
1980 && regparse + 1 <= endp
1981 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1982 || (!reg_cpo_lit
1983 && vim_strchr(REGEXP_ABBR, regparse[1])
1984 != NULL)
1985 )
1986 )
1987 {
1988 MB_PTR_ADV(regparse);
1989
1990 if (*regparse == 'n')
1991 startc = (reg_string || emit_range
1992 || regparse[1] == '-') ? NL : NFA_NEWL;
1993 else if (*regparse == 'd'
1994 || *regparse == 'o'
1995 || *regparse == 'x'
1996 || *regparse == 'u'
1997 || *regparse == 'U'
1998 )
1999 {
2000 // TODO(RE) This needs more testing
2001 startc = coll_get_char();
2002 got_coll_char = TRUE;
2003 MB_PTR_BACK(old_regparse, regparse);
2004 }
2005 else
2006 {
2007 // \r,\t,\e,\b
2008 startc = backslash_trans(*regparse);
2009 }
2010 }
2011
2012 // Normal printable char
2013 if (startc == -1)
2014 startc = PTR2CHAR(regparse);
2015
2016 // Previous char was '-', so this char is end of range.
2017 if (emit_range)
2018 {
2019 int endc = startc;
2020
2021 startc = oldstartc;
2022 if (startc > endc)
2023 EMSG_RET_FAIL(_(e_reverse_range));
2024
2025 if (endc > startc + 2)
2026 {
2027 // Emit a range instead of the sequence of
2028 // individual characters.
2029 if (startc == 0)
2030 // \x00 is translated to \x0a, start at \x01.
2031 EMIT(1);
2032 else
2033 --post_ptr; // remove NFA_CONCAT
2034 EMIT(endc);
2035 EMIT(NFA_RANGE);
2036 EMIT(NFA_CONCAT);
2037 }
2038 else if (has_mbyte && ((*mb_char2len)(startc) > 1
2039 || (*mb_char2len)(endc) > 1))
2040 {
2041 // Emit the characters in the range.
2042 // "startc" was already emitted, so skip it.
2043 //
2044 for (c = startc + 1; c <= endc; c++)
2045 {
2046 EMIT(c);
2047 EMIT(NFA_CONCAT);
2048 }
2049 }
2050 else
2051 {
2052 #ifdef EBCDIC
2053 int alpha_only = FALSE;
2054
2055 // for alphabetical range skip the gaps
2056 // 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'.
2057 if (isalpha(startc) && isalpha(endc))
2058 alpha_only = TRUE;
2059 #endif
2060 // Emit the range. "startc" was already emitted, so
2061 // skip it.
2062 for (c = startc + 1; c <= endc; c++)
2063 #ifdef EBCDIC
2064 if (!alpha_only || isalpha(startc))
2065 #endif
2066 {
2067 EMIT(c);
2068 EMIT(NFA_CONCAT);
2069 }
2070 }
2071 emit_range = FALSE;
2072 startc = -1;
2073 }
2074 else
2075 {
2076 // This char (startc) is not part of a range. Just
2077 // emit it.
2078 // Normally, simply emit startc. But if we get char
2079 // code=0 from a collating char, then replace it with
2080 // 0x0a.
2081 // This is needed to completely mimic the behaviour of
2082 // the backtracking engine.
2083 if (startc == NFA_NEWL)
2084 {
2085 // Line break can't be matched as part of the
2086 // collection, add an OR below. But not for negated
2087 // range.
2088 if (!negated)
2089 extra = NFA_ADD_NL;
2090 }
2091 else
2092 {
2093 if (got_coll_char == TRUE && startc == 0)
2094 EMIT(0x0a);
2095 else
2096 EMIT(startc);
2097 EMIT(NFA_CONCAT);
2098 }
2099 }
2100
2101 MB_PTR_ADV(regparse);
2102 } // while (p < endp)
2103
2104 MB_PTR_BACK(old_regparse, regparse);
2105 if (*regparse == '-') // if last, '-' is just a char
2106 {
2107 EMIT('-');
2108 EMIT(NFA_CONCAT);
2109 }
2110
2111 // skip the trailing ]
2112 regparse = endp;
2113 MB_PTR_ADV(regparse);
2114
2115 // Mark end of the collection.
2116 if (negated == TRUE)
2117 EMIT(NFA_END_NEG_COLL);
2118 else
2119 EMIT(NFA_END_COLL);
2120
2121 // \_[] also matches \n but it's not negated
2122 if (extra == NFA_ADD_NL)
2123 {
2124 EMIT(reg_string ? NL : NFA_NEWL);
2125 EMIT(NFA_OR);
2126 }
2127
2128 return OK;
2129 } // if exists closing ]
2130
2131 if (reg_strict)
2132 EMSG_RET_FAIL(_(e_missingbracket));
2133 // FALLTHROUGH
2134
2135 default:
2136 {
2137 int plen;
2138
2139 nfa_do_multibyte:
2140 // plen is length of current char with composing chars
2141 if (enc_utf8 && ((*mb_char2len)(c)
2142 != (plen = utfc_ptr2len(old_regparse))
2143 || utf_iscomposing(c)))
2144 {
2145 int i = 0;
2146
2147 // A base character plus composing characters, or just one
2148 // or more composing characters.
2149 // This requires creating a separate atom as if enclosing
2150 // the characters in (), where NFA_COMPOSING is the ( and
2151 // NFA_END_COMPOSING is the ). Note that right now we are
2152 // building the postfix form, not the NFA itself;
2153 // a composing char could be: a, b, c, NFA_COMPOSING
2154 // where 'b' and 'c' are chars with codes > 256.
2155 for (;;)
2156 {
2157 EMIT(c);
2158 if (i > 0)
2159 EMIT(NFA_CONCAT);
2160 if ((i += utf_char2len(c)) >= plen)
2161 break;
2162 c = utf_ptr2char(old_regparse + i);
2163 }
2164 EMIT(NFA_COMPOSING);
2165 regparse = old_regparse + plen;
2166 }
2167 else
2168 {
2169 c = no_Magic(c);
2170 EMIT(c);
2171 }
2172 return OK;
2173 }
2174 }
2175
2176 return OK;
2177 }
2178
2179 /*
2180 * Parse something followed by possible [*+=].
2181 *
2182 * A piece is an atom, possibly followed by a multi, an indication of how many
2183 * times the atom can be matched. Example: "a*" matches any sequence of "a"
2184 * characters: "", "a", "aa", etc.
2185 *
2186 * piece ::= atom
2187 * or atom multi
2188 */
2189 static int
nfa_regpiece(void)2190 nfa_regpiece(void)
2191 {
2192 int i;
2193 int op;
2194 int ret;
2195 long minval, maxval;
2196 int greedy = TRUE; // Braces are prefixed with '-' ?
2197 parse_state_T old_state;
2198 parse_state_T new_state;
2199 long c2;
2200 int old_post_pos;
2201 int my_post_start;
2202 int quest;
2203
2204 // Save the current parse state, so that we can use it if <atom>{m,n} is
2205 // next.
2206 save_parse_state(&old_state);
2207
2208 // store current pos in the postfix form, for \{m,n} involving 0s
2209 my_post_start = (int)(post_ptr - post_start);
2210
2211 ret = nfa_regatom();
2212 if (ret == FAIL)
2213 return FAIL; // cascaded error
2214
2215 op = peekchr();
2216 if (re_multi_type(op) == NOT_MULTI)
2217 return OK;
2218
2219 skipchr();
2220 switch (op)
2221 {
2222 case Magic('*'):
2223 EMIT(NFA_STAR);
2224 break;
2225
2226 case Magic('+'):
2227 /*
2228 * Trick: Normally, (a*)\+ would match the whole input "aaa". The
2229 * first and only submatch would be "aaa". But the backtracking
2230 * engine interprets the plus as "try matching one more time", and
2231 * a* matches a second time at the end of the input, the empty
2232 * string.
2233 * The submatch will be the empty string.
2234 *
2235 * In order to be consistent with the old engine, we replace
2236 * <atom>+ with <atom><atom>*
2237 */
2238 restore_parse_state(&old_state);
2239 curchr = -1;
2240 if (nfa_regatom() == FAIL)
2241 return FAIL;
2242 EMIT(NFA_STAR);
2243 EMIT(NFA_CONCAT);
2244 skipchr(); // skip the \+
2245 break;
2246
2247 case Magic('@'):
2248 c2 = getdecchrs();
2249 op = no_Magic(getchr());
2250 i = 0;
2251 switch(op)
2252 {
2253 case '=':
2254 // \@=
2255 i = NFA_PREV_ATOM_NO_WIDTH;
2256 break;
2257 case '!':
2258 // \@!
2259 i = NFA_PREV_ATOM_NO_WIDTH_NEG;
2260 break;
2261 case '<':
2262 op = no_Magic(getchr());
2263 if (op == '=')
2264 // \@<=
2265 i = NFA_PREV_ATOM_JUST_BEFORE;
2266 else if (op == '!')
2267 // \@<!
2268 i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
2269 break;
2270 case '>':
2271 // \@>
2272 i = NFA_PREV_ATOM_LIKE_PATTERN;
2273 break;
2274 }
2275 if (i == 0)
2276 {
2277 semsg(_("E869: (NFA) Unknown operator '\\@%c'"), op);
2278 return FAIL;
2279 }
2280 EMIT(i);
2281 if (i == NFA_PREV_ATOM_JUST_BEFORE
2282 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
2283 EMIT(c2);
2284 break;
2285
2286 case Magic('?'):
2287 case Magic('='):
2288 EMIT(NFA_QUEST);
2289 break;
2290
2291 case Magic('{'):
2292 // a{2,5} will expand to 'aaa?a?a?'
2293 // a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
2294 // version of '?'
2295 // \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
2296 // parenthesis have the same id
2297
2298 greedy = TRUE;
2299 c2 = peekchr();
2300 if (c2 == '-' || c2 == Magic('-'))
2301 {
2302 skipchr();
2303 greedy = FALSE;
2304 }
2305 if (!read_limits(&minval, &maxval))
2306 EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits"));
2307
2308 // <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to
2309 // <atom>*
2310 if (minval == 0 && maxval == MAX_LIMIT)
2311 {
2312 if (greedy) // { { (match the braces)
2313 // \{}, \{0,}
2314 EMIT(NFA_STAR);
2315 else // { { (match the braces)
2316 // \{-}, \{-0,}
2317 EMIT(NFA_STAR_NONGREEDY);
2318 break;
2319 }
2320
2321 // Special case: x{0} or x{-0}
2322 if (maxval == 0)
2323 {
2324 // Ignore result of previous call to nfa_regatom()
2325 post_ptr = post_start + my_post_start;
2326 // NFA_EMPTY is 0-length and works everywhere
2327 EMIT(NFA_EMPTY);
2328 return OK;
2329 }
2330
2331 // The engine is very inefficient (uses too many states) when the
2332 // maximum is much larger than the minimum and when the maximum is
2333 // large. However, when maxval is MAX_LIMIT, it is okay, as this
2334 // will emit NFA_STAR.
2335 // Bail out if we can use the other engine, but only, when the
2336 // pattern does not need the NFA engine like (e.g. [[:upper:]]\{2,\}
2337 // does not work with with characters > 8 bit with the BT engine)
2338 if ((nfa_re_flags & RE_AUTO)
2339 && (maxval > 500 || maxval > minval + 200)
2340 && (maxval != MAX_LIMIT && minval < 200)
2341 && !wants_nfa)
2342 return FAIL;
2343
2344 // Ignore previous call to nfa_regatom()
2345 post_ptr = post_start + my_post_start;
2346 // Save parse state after the repeated atom and the \{}
2347 save_parse_state(&new_state);
2348
2349 quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY);
2350 for (i = 0; i < maxval; i++)
2351 {
2352 // Goto beginning of the repeated atom
2353 restore_parse_state(&old_state);
2354 old_post_pos = (int)(post_ptr - post_start);
2355 if (nfa_regatom() == FAIL)
2356 return FAIL;
2357 // after "minval" times, atoms are optional
2358 if (i + 1 > minval)
2359 {
2360 if (maxval == MAX_LIMIT)
2361 {
2362 if (greedy)
2363 EMIT(NFA_STAR);
2364 else
2365 EMIT(NFA_STAR_NONGREEDY);
2366 }
2367 else
2368 EMIT(quest);
2369 }
2370 if (old_post_pos != my_post_start)
2371 EMIT(NFA_CONCAT);
2372 if (i + 1 > minval && maxval == MAX_LIMIT)
2373 break;
2374 }
2375
2376 // Go to just after the repeated atom and the \{}
2377 restore_parse_state(&new_state);
2378 curchr = -1;
2379
2380 break;
2381
2382
2383 default:
2384 break;
2385 } // end switch
2386
2387 if (re_multi_type(peekchr()) != NOT_MULTI)
2388 // Can't have a multi follow a multi.
2389 EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi"));
2390
2391 return OK;
2392 }
2393
2394 /*
2395 * Parse one or more pieces, concatenated. It matches a match for the
2396 * first piece, followed by a match for the second piece, etc. Example:
2397 * "f[0-9]b", first matches "f", then a digit and then "b".
2398 *
2399 * concat ::= piece
2400 * or piece piece
2401 * or piece piece piece
2402 * etc.
2403 */
2404 static int
nfa_regconcat(void)2405 nfa_regconcat(void)
2406 {
2407 int cont = TRUE;
2408 int first = TRUE;
2409
2410 while (cont)
2411 {
2412 switch (peekchr())
2413 {
2414 case NUL:
2415 case Magic('|'):
2416 case Magic('&'):
2417 case Magic(')'):
2418 cont = FALSE;
2419 break;
2420
2421 case Magic('Z'):
2422 regflags |= RF_ICOMBINE;
2423 skipchr_keepstart();
2424 break;
2425 case Magic('c'):
2426 regflags |= RF_ICASE;
2427 skipchr_keepstart();
2428 break;
2429 case Magic('C'):
2430 regflags |= RF_NOICASE;
2431 skipchr_keepstart();
2432 break;
2433 case Magic('v'):
2434 reg_magic = MAGIC_ALL;
2435 skipchr_keepstart();
2436 curchr = -1;
2437 break;
2438 case Magic('m'):
2439 reg_magic = MAGIC_ON;
2440 skipchr_keepstart();
2441 curchr = -1;
2442 break;
2443 case Magic('M'):
2444 reg_magic = MAGIC_OFF;
2445 skipchr_keepstart();
2446 curchr = -1;
2447 break;
2448 case Magic('V'):
2449 reg_magic = MAGIC_NONE;
2450 skipchr_keepstart();
2451 curchr = -1;
2452 break;
2453
2454 default:
2455 if (nfa_regpiece() == FAIL)
2456 return FAIL;
2457 if (first == FALSE)
2458 EMIT(NFA_CONCAT);
2459 else
2460 first = FALSE;
2461 break;
2462 }
2463 }
2464
2465 return OK;
2466 }
2467
2468 /*
2469 * Parse a branch, one or more concats, separated by "\&". It matches the
2470 * last concat, but only if all the preceding concats also match at the same
2471 * position. Examples:
2472 * "foobeep\&..." matches "foo" in "foobeep".
2473 * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob"
2474 *
2475 * branch ::= concat
2476 * or concat \& concat
2477 * or concat \& concat \& concat
2478 * etc.
2479 */
2480 static int
nfa_regbranch(void)2481 nfa_regbranch(void)
2482 {
2483 int old_post_pos;
2484
2485 old_post_pos = (int)(post_ptr - post_start);
2486
2487 // First branch, possibly the only one
2488 if (nfa_regconcat() == FAIL)
2489 return FAIL;
2490
2491 // Try next concats
2492 while (peekchr() == Magic('&'))
2493 {
2494 skipchr();
2495 // if concat is empty do emit a node
2496 if (old_post_pos == (int)(post_ptr - post_start))
2497 EMIT(NFA_EMPTY);
2498 EMIT(NFA_NOPEN);
2499 EMIT(NFA_PREV_ATOM_NO_WIDTH);
2500 old_post_pos = (int)(post_ptr - post_start);
2501 if (nfa_regconcat() == FAIL)
2502 return FAIL;
2503 // if concat is empty do emit a node
2504 if (old_post_pos == (int)(post_ptr - post_start))
2505 EMIT(NFA_EMPTY);
2506 EMIT(NFA_CONCAT);
2507 }
2508
2509 // if a branch is empty, emit one node for it
2510 if (old_post_pos == (int)(post_ptr - post_start))
2511 EMIT(NFA_EMPTY);
2512
2513 return OK;
2514 }
2515
2516 /*
2517 * Parse a pattern, one or more branches, separated by "\|". It matches
2518 * anything that matches one of the branches. Example: "foo\|beep" matches
2519 * "foo" and matches "beep". If more than one branch matches, the first one
2520 * is used.
2521 *
2522 * pattern ::= branch
2523 * or branch \| branch
2524 * or branch \| branch \| branch
2525 * etc.
2526 */
2527 static int
nfa_reg(int paren)2528 nfa_reg(
2529 int paren) // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
2530 {
2531 int parno = 0;
2532
2533 if (paren == REG_PAREN)
2534 {
2535 if (regnpar >= NSUBEXP) // Too many `('
2536 EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('"));
2537 parno = regnpar++;
2538 }
2539 #ifdef FEAT_SYN_HL
2540 else if (paren == REG_ZPAREN)
2541 {
2542 // Make a ZOPEN node.
2543 if (regnzpar >= NSUBEXP)
2544 EMSG_RET_FAIL(_("E879: (NFA regexp) Too many \\z("));
2545 parno = regnzpar++;
2546 }
2547 #endif
2548
2549 if (nfa_regbranch() == FAIL)
2550 return FAIL; // cascaded error
2551
2552 while (peekchr() == Magic('|'))
2553 {
2554 skipchr();
2555 if (nfa_regbranch() == FAIL)
2556 return FAIL; // cascaded error
2557 EMIT(NFA_OR);
2558 }
2559
2560 // Check for proper termination.
2561 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2562 {
2563 if (paren == REG_NPAREN)
2564 EMSG2_RET_FAIL(_(e_unmatched_str_percent_open),
2565 reg_magic == MAGIC_ALL);
2566 else
2567 EMSG2_RET_FAIL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
2568 }
2569 else if (paren == REG_NOPAREN && peekchr() != NUL)
2570 {
2571 if (peekchr() == Magic(')'))
2572 EMSG2_RET_FAIL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
2573 else
2574 EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error"));
2575 }
2576 /*
2577 * Here we set the flag allowing back references to this set of
2578 * parentheses.
2579 */
2580 if (paren == REG_PAREN)
2581 {
2582 had_endbrace[parno] = TRUE; // have seen the close paren
2583 EMIT(NFA_MOPEN + parno);
2584 }
2585 #ifdef FEAT_SYN_HL
2586 else if (paren == REG_ZPAREN)
2587 EMIT(NFA_ZOPEN + parno);
2588 #endif
2589
2590 return OK;
2591 }
2592
2593 #ifdef DEBUG
2594 static char_u code[50];
2595
2596 static void
nfa_set_code(int c)2597 nfa_set_code(int c)
2598 {
2599 int addnl = FALSE;
2600
2601 if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
2602 {
2603 addnl = TRUE;
2604 c -= NFA_ADD_NL;
2605 }
2606
2607 STRCPY(code, "");
2608 switch (c)
2609 {
2610 case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
2611 case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
2612 case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
2613 case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
2614 case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
2615 case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
2616
2617 case NFA_BACKREF1: STRCPY(code, "NFA_BACKREF1"); break;
2618 case NFA_BACKREF2: STRCPY(code, "NFA_BACKREF2"); break;
2619 case NFA_BACKREF3: STRCPY(code, "NFA_BACKREF3"); break;
2620 case NFA_BACKREF4: STRCPY(code, "NFA_BACKREF4"); break;
2621 case NFA_BACKREF5: STRCPY(code, "NFA_BACKREF5"); break;
2622 case NFA_BACKREF6: STRCPY(code, "NFA_BACKREF6"); break;
2623 case NFA_BACKREF7: STRCPY(code, "NFA_BACKREF7"); break;
2624 case NFA_BACKREF8: STRCPY(code, "NFA_BACKREF8"); break;
2625 case NFA_BACKREF9: STRCPY(code, "NFA_BACKREF9"); break;
2626 #ifdef FEAT_SYN_HL
2627 case NFA_ZREF1: STRCPY(code, "NFA_ZREF1"); break;
2628 case NFA_ZREF2: STRCPY(code, "NFA_ZREF2"); break;
2629 case NFA_ZREF3: STRCPY(code, "NFA_ZREF3"); break;
2630 case NFA_ZREF4: STRCPY(code, "NFA_ZREF4"); break;
2631 case NFA_ZREF5: STRCPY(code, "NFA_ZREF5"); break;
2632 case NFA_ZREF6: STRCPY(code, "NFA_ZREF6"); break;
2633 case NFA_ZREF7: STRCPY(code, "NFA_ZREF7"); break;
2634 case NFA_ZREF8: STRCPY(code, "NFA_ZREF8"); break;
2635 case NFA_ZREF9: STRCPY(code, "NFA_ZREF9"); break;
2636 #endif
2637 case NFA_SKIP: STRCPY(code, "NFA_SKIP"); break;
2638
2639 case NFA_PREV_ATOM_NO_WIDTH:
2640 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
2641 case NFA_PREV_ATOM_NO_WIDTH_NEG:
2642 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
2643 case NFA_PREV_ATOM_JUST_BEFORE:
2644 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
2645 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2646 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
2647 case NFA_PREV_ATOM_LIKE_PATTERN:
2648 STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
2649
2650 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
2651 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
2652 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
2653 case NFA_START_INVISIBLE_FIRST:
2654 STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
2655 case NFA_START_INVISIBLE_NEG:
2656 STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
2657 case NFA_START_INVISIBLE_NEG_FIRST:
2658 STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
2659 case NFA_START_INVISIBLE_BEFORE:
2660 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
2661 case NFA_START_INVISIBLE_BEFORE_FIRST:
2662 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
2663 case NFA_START_INVISIBLE_BEFORE_NEG:
2664 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
2665 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
2666 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
2667 case NFA_START_PATTERN: STRCPY(code, "NFA_START_PATTERN"); break;
2668 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
2669 case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
2670 case NFA_END_PATTERN: STRCPY(code, "NFA_END_PATTERN"); break;
2671
2672 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
2673 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
2674 case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break;
2675
2676 case NFA_MOPEN:
2677 case NFA_MOPEN1:
2678 case NFA_MOPEN2:
2679 case NFA_MOPEN3:
2680 case NFA_MOPEN4:
2681 case NFA_MOPEN5:
2682 case NFA_MOPEN6:
2683 case NFA_MOPEN7:
2684 case NFA_MOPEN8:
2685 case NFA_MOPEN9:
2686 STRCPY(code, "NFA_MOPEN(x)");
2687 code[10] = c - NFA_MOPEN + '0';
2688 break;
2689 case NFA_MCLOSE:
2690 case NFA_MCLOSE1:
2691 case NFA_MCLOSE2:
2692 case NFA_MCLOSE3:
2693 case NFA_MCLOSE4:
2694 case NFA_MCLOSE5:
2695 case NFA_MCLOSE6:
2696 case NFA_MCLOSE7:
2697 case NFA_MCLOSE8:
2698 case NFA_MCLOSE9:
2699 STRCPY(code, "NFA_MCLOSE(x)");
2700 code[11] = c - NFA_MCLOSE + '0';
2701 break;
2702 #ifdef FEAT_SYN_HL
2703 case NFA_ZOPEN:
2704 case NFA_ZOPEN1:
2705 case NFA_ZOPEN2:
2706 case NFA_ZOPEN3:
2707 case NFA_ZOPEN4:
2708 case NFA_ZOPEN5:
2709 case NFA_ZOPEN6:
2710 case NFA_ZOPEN7:
2711 case NFA_ZOPEN8:
2712 case NFA_ZOPEN9:
2713 STRCPY(code, "NFA_ZOPEN(x)");
2714 code[10] = c - NFA_ZOPEN + '0';
2715 break;
2716 case NFA_ZCLOSE:
2717 case NFA_ZCLOSE1:
2718 case NFA_ZCLOSE2:
2719 case NFA_ZCLOSE3:
2720 case NFA_ZCLOSE4:
2721 case NFA_ZCLOSE5:
2722 case NFA_ZCLOSE6:
2723 case NFA_ZCLOSE7:
2724 case NFA_ZCLOSE8:
2725 case NFA_ZCLOSE9:
2726 STRCPY(code, "NFA_ZCLOSE(x)");
2727 code[11] = c - NFA_ZCLOSE + '0';
2728 break;
2729 #endif
2730 case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
2731 case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
2732 case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
2733 case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
2734 case NFA_EOF: STRCPY(code, "NFA_EOF "); break;
2735 case NFA_BOF: STRCPY(code, "NFA_BOF "); break;
2736 case NFA_LNUM: STRCPY(code, "NFA_LNUM "); break;
2737 case NFA_LNUM_GT: STRCPY(code, "NFA_LNUM_GT "); break;
2738 case NFA_LNUM_LT: STRCPY(code, "NFA_LNUM_LT "); break;
2739 case NFA_COL: STRCPY(code, "NFA_COL "); break;
2740 case NFA_COL_GT: STRCPY(code, "NFA_COL_GT "); break;
2741 case NFA_COL_LT: STRCPY(code, "NFA_COL_LT "); break;
2742 case NFA_VCOL: STRCPY(code, "NFA_VCOL "); break;
2743 case NFA_VCOL_GT: STRCPY(code, "NFA_VCOL_GT "); break;
2744 case NFA_VCOL_LT: STRCPY(code, "NFA_VCOL_LT "); break;
2745 case NFA_MARK: STRCPY(code, "NFA_MARK "); break;
2746 case NFA_MARK_GT: STRCPY(code, "NFA_MARK_GT "); break;
2747 case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break;
2748 case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break;
2749 case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break;
2750 case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;
2751
2752 case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
2753 case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
2754 case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
2755 case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
2756 case NFA_EMPTY: STRCPY(code, "NFA_EMPTY"); break;
2757 case NFA_OR: STRCPY(code, "NFA_OR"); break;
2758
2759 case NFA_START_COLL: STRCPY(code, "NFA_START_COLL"); break;
2760 case NFA_END_COLL: STRCPY(code, "NFA_END_COLL"); break;
2761 case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break;
2762 case NFA_END_NEG_COLL: STRCPY(code, "NFA_END_NEG_COLL"); break;
2763 case NFA_RANGE: STRCPY(code, "NFA_RANGE"); break;
2764 case NFA_RANGE_MIN: STRCPY(code, "NFA_RANGE_MIN"); break;
2765 case NFA_RANGE_MAX: STRCPY(code, "NFA_RANGE_MAX"); break;
2766
2767 case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
2768 case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
2769 case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
2770 case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
2771 case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
2772 case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
2773 case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
2774 case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
2775 case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
2776 case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
2777 case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
2778 case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
2779 case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
2780 case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
2781 case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
2782 case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
2783 case NFA_CLASS_IDENT: STRCPY(code, "NFA_CLASS_IDENT"); break;
2784 case NFA_CLASS_KEYWORD: STRCPY(code, "NFA_CLASS_KEYWORD"); break;
2785 case NFA_CLASS_FNAME: STRCPY(code, "NFA_CLASS_FNAME"); break;
2786
2787 case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
2788 case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
2789 case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break;
2790 case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
2791 case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break;
2792 case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
2793 case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break;
2794 case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
2795 case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break;
2796 case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
2797 case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break;
2798 case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
2799 case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break;
2800 case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
2801 case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
2802 case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
2803 case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break;
2804 case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
2805 case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
2806 case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
2807 case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
2808 case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
2809 case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break;
2810 case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
2811 case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
2812 case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
2813 case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
2814 case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break;
2815 case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
2816 case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break;
2817 case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;
2818
2819 default:
2820 STRCPY(code, "CHAR(x)");
2821 code[5] = c;
2822 }
2823
2824 if (addnl == TRUE)
2825 STRCAT(code, " + NEWLINE ");
2826
2827 }
2828
2829 #ifdef ENABLE_LOG
2830 static FILE *log_fd;
2831 static char_u e_log_open_failed[] = N_("Could not open temporary log file for writing, displaying on stderr... ");
2832
2833 /*
2834 * Print the postfix notation of the current regexp.
2835 */
2836 static void
nfa_postfix_dump(char_u * expr,int retval)2837 nfa_postfix_dump(char_u *expr, int retval)
2838 {
2839 int *p;
2840 FILE *f;
2841
2842 f = fopen(NFA_REGEXP_DUMP_LOG, "a");
2843 if (f != NULL)
2844 {
2845 fprintf(f, "\n-------------------------\n");
2846 if (retval == FAIL)
2847 fprintf(f, ">>> NFA engine failed... \n");
2848 else if (retval == OK)
2849 fprintf(f, ">>> NFA engine succeeded !\n");
2850 fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
2851 for (p = post_start; *p && p < post_ptr; p++)
2852 {
2853 nfa_set_code(*p);
2854 fprintf(f, "%s, ", code);
2855 }
2856 fprintf(f, "\"\nPostfix notation (int): ");
2857 for (p = post_start; *p && p < post_ptr; p++)
2858 fprintf(f, "%d ", *p);
2859 fprintf(f, "\n\n");
2860 fclose(f);
2861 }
2862 }
2863
2864 /*
2865 * Print the NFA starting with a root node "state".
2866 */
2867 static void
nfa_print_state(FILE * debugf,nfa_state_T * state)2868 nfa_print_state(FILE *debugf, nfa_state_T *state)
2869 {
2870 garray_T indent;
2871
2872 ga_init2(&indent, 1, 64);
2873 ga_append(&indent, '\0');
2874 nfa_print_state2(debugf, state, &indent);
2875 ga_clear(&indent);
2876 }
2877
2878 static void
nfa_print_state2(FILE * debugf,nfa_state_T * state,garray_T * indent)2879 nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent)
2880 {
2881 char_u *p;
2882
2883 if (state == NULL)
2884 return;
2885
2886 fprintf(debugf, "(%2d)", abs(state->id));
2887
2888 // Output indent
2889 p = (char_u *)indent->ga_data;
2890 if (indent->ga_len >= 3)
2891 {
2892 int last = indent->ga_len - 3;
2893 char_u save[2];
2894
2895 STRNCPY(save, &p[last], 2);
2896 STRNCPY(&p[last], "+-", 2);
2897 fprintf(debugf, " %s", p);
2898 STRNCPY(&p[last], save, 2);
2899 }
2900 else
2901 fprintf(debugf, " %s", p);
2902
2903 nfa_set_code(state->c);
2904 fprintf(debugf, "%s (%d) (id=%d) val=%d\n",
2905 code,
2906 state->c,
2907 abs(state->id),
2908 state->val);
2909 if (state->id < 0)
2910 return;
2911
2912 state->id = abs(state->id) * -1;
2913
2914 // grow indent for state->out
2915 indent->ga_len -= 1;
2916 if (state->out1)
2917 ga_concat(indent, (char_u *)"| ");
2918 else
2919 ga_concat(indent, (char_u *)" ");
2920 ga_append(indent, '\0');
2921
2922 nfa_print_state2(debugf, state->out, indent);
2923
2924 // replace last part of indent for state->out1
2925 indent->ga_len -= 3;
2926 ga_concat(indent, (char_u *)" ");
2927 ga_append(indent, '\0');
2928
2929 nfa_print_state2(debugf, state->out1, indent);
2930
2931 // shrink indent
2932 indent->ga_len -= 3;
2933 ga_append(indent, '\0');
2934 }
2935
2936 /*
2937 * Print the NFA state machine.
2938 */
2939 static void
nfa_dump(nfa_regprog_T * prog)2940 nfa_dump(nfa_regprog_T *prog)
2941 {
2942 FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
2943
2944 if (debugf != NULL)
2945 {
2946 nfa_print_state(debugf, prog->start);
2947
2948 if (prog->reganch)
2949 fprintf(debugf, "reganch: %d\n", prog->reganch);
2950 if (prog->regstart != NUL)
2951 fprintf(debugf, "regstart: %c (decimal: %d)\n",
2952 prog->regstart, prog->regstart);
2953 if (prog->match_text != NULL)
2954 fprintf(debugf, "match_text: \"%s\"\n", prog->match_text);
2955
2956 fclose(debugf);
2957 }
2958 }
2959 #endif // ENABLE_LOG
2960 #endif // DEBUG
2961
2962 /*
2963 * Parse r.e. @expr and convert it into postfix form.
2964 * Return the postfix string on success, NULL otherwise.
2965 */
2966 static int *
re2post(void)2967 re2post(void)
2968 {
2969 if (nfa_reg(REG_NOPAREN) == FAIL)
2970 return NULL;
2971 EMIT(NFA_MOPEN);
2972 return post_start;
2973 }
2974
2975 // NB. Some of the code below is inspired by Russ's.
2976
2977 /*
2978 * Represents an NFA state plus zero or one or two arrows exiting.
2979 * if c == MATCH, no arrows out; matching state.
2980 * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
2981 * If c < 256, labeled arrow with character c to out.
2982 */
2983
2984 static nfa_state_T *state_ptr; // points to nfa_prog->state
2985
2986 /*
2987 * Allocate and initialize nfa_state_T.
2988 */
2989 static nfa_state_T *
alloc_state(int c,nfa_state_T * out,nfa_state_T * out1)2990 alloc_state(int c, nfa_state_T *out, nfa_state_T *out1)
2991 {
2992 nfa_state_T *s;
2993
2994 if (istate >= nstate)
2995 return NULL;
2996
2997 s = &state_ptr[istate++];
2998
2999 s->c = c;
3000 s->out = out;
3001 s->out1 = out1;
3002 s->val = 0;
3003
3004 s->id = istate;
3005 s->lastlist[0] = 0;
3006 s->lastlist[1] = 0;
3007
3008 return s;
3009 }
3010
3011 /*
3012 * A partially built NFA without the matching state filled in.
3013 * Frag_T.start points at the start state.
3014 * Frag_T.out is a list of places that need to be set to the
3015 * next state for this fragment.
3016 */
3017
3018 // Since the out pointers in the list are always
3019 // uninitialized, we use the pointers themselves
3020 // as storage for the Ptrlists.
3021 typedef union Ptrlist Ptrlist;
3022 union Ptrlist
3023 {
3024 Ptrlist *next;
3025 nfa_state_T *s;
3026 };
3027
3028 struct Frag
3029 {
3030 nfa_state_T *start;
3031 Ptrlist *out;
3032 };
3033 typedef struct Frag Frag_T;
3034
3035 /*
3036 * Initialize a Frag_T struct and return it.
3037 */
3038 static Frag_T
frag(nfa_state_T * start,Ptrlist * out)3039 frag(nfa_state_T *start, Ptrlist *out)
3040 {
3041 Frag_T n;
3042
3043 n.start = start;
3044 n.out = out;
3045 return n;
3046 }
3047
3048 /*
3049 * Create singleton list containing just outp.
3050 */
3051 static Ptrlist *
list1(nfa_state_T ** outp)3052 list1(
3053 nfa_state_T **outp)
3054 {
3055 Ptrlist *l;
3056
3057 l = (Ptrlist *)outp;
3058 l->next = NULL;
3059 return l;
3060 }
3061
3062 /*
3063 * Patch the list of states at out to point to start.
3064 */
3065 static void
patch(Ptrlist * l,nfa_state_T * s)3066 patch(Ptrlist *l, nfa_state_T *s)
3067 {
3068 Ptrlist *next;
3069
3070 for (; l; l = next)
3071 {
3072 next = l->next;
3073 l->s = s;
3074 }
3075 }
3076
3077
3078 /*
3079 * Join the two lists l1 and l2, returning the combination.
3080 */
3081 static Ptrlist *
append(Ptrlist * l1,Ptrlist * l2)3082 append(Ptrlist *l1, Ptrlist *l2)
3083 {
3084 Ptrlist *oldl1;
3085
3086 oldl1 = l1;
3087 while (l1->next)
3088 l1 = l1->next;
3089 l1->next = l2;
3090 return oldl1;
3091 }
3092
3093 /*
3094 * Stack used for transforming postfix form into NFA.
3095 */
3096 static Frag_T empty;
3097
3098 static void
st_error(int * postfix UNUSED,int * end UNUSED,int * p UNUSED)3099 st_error(int *postfix UNUSED, int *end UNUSED, int *p UNUSED)
3100 {
3101 #ifdef NFA_REGEXP_ERROR_LOG
3102 FILE *df;
3103 int *p2;
3104
3105 df = fopen(NFA_REGEXP_ERROR_LOG, "a");
3106 if (df)
3107 {
3108 fprintf(df, "Error popping the stack!\n");
3109 # ifdef DEBUG
3110 fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
3111 # endif
3112 fprintf(df, "Postfix form is: ");
3113 # ifdef DEBUG
3114 for (p2 = postfix; p2 < end; p2++)
3115 {
3116 nfa_set_code(*p2);
3117 fprintf(df, "%s, ", code);
3118 }
3119 nfa_set_code(*p);
3120 fprintf(df, "\nCurrent position is: ");
3121 for (p2 = postfix; p2 <= p; p2 ++)
3122 {
3123 nfa_set_code(*p2);
3124 fprintf(df, "%s, ", code);
3125 }
3126 # else
3127 for (p2 = postfix; p2 < end; p2++)
3128 fprintf(df, "%d, ", *p2);
3129 fprintf(df, "\nCurrent position is: ");
3130 for (p2 = postfix; p2 <= p; p2 ++)
3131 fprintf(df, "%d, ", *p2);
3132 # endif
3133 fprintf(df, "\n--------------------------\n");
3134 fclose(df);
3135 }
3136 #endif
3137 emsg(_("E874: (NFA) Could not pop the stack!"));
3138 }
3139
3140 /*
3141 * Push an item onto the stack.
3142 */
3143 static void
st_push(Frag_T s,Frag_T ** p,Frag_T * stack_end)3144 st_push(Frag_T s, Frag_T **p, Frag_T *stack_end)
3145 {
3146 Frag_T *stackp = *p;
3147
3148 if (stackp >= stack_end)
3149 return;
3150 *stackp = s;
3151 *p = *p + 1;
3152 }
3153
3154 /*
3155 * Pop an item from the stack.
3156 */
3157 static Frag_T
st_pop(Frag_T ** p,Frag_T * stack)3158 st_pop(Frag_T **p, Frag_T *stack)
3159 {
3160 Frag_T *stackp;
3161
3162 *p = *p - 1;
3163 stackp = *p;
3164 if (stackp < stack)
3165 return empty;
3166 return **p;
3167 }
3168
3169 /*
3170 * Estimate the maximum byte length of anything matching "state".
3171 * When unknown or unlimited return -1.
3172 */
3173 static int
nfa_max_width(nfa_state_T * startstate,int depth)3174 nfa_max_width(nfa_state_T *startstate, int depth)
3175 {
3176 int l, r;
3177 nfa_state_T *state = startstate;
3178 int len = 0;
3179
3180 // detect looping in a NFA_SPLIT
3181 if (depth > 4)
3182 return -1;
3183
3184 while (state != NULL)
3185 {
3186 switch (state->c)
3187 {
3188 case NFA_END_INVISIBLE:
3189 case NFA_END_INVISIBLE_NEG:
3190 // the end, return what we have
3191 return len;
3192
3193 case NFA_SPLIT:
3194 // two alternatives, use the maximum
3195 l = nfa_max_width(state->out, depth + 1);
3196 r = nfa_max_width(state->out1, depth + 1);
3197 if (l < 0 || r < 0)
3198 return -1;
3199 return len + (l > r ? l : r);
3200
3201 case NFA_ANY:
3202 case NFA_START_COLL:
3203 case NFA_START_NEG_COLL:
3204 // matches some character, including composing chars
3205 if (enc_utf8)
3206 len += MB_MAXBYTES;
3207 else if (has_mbyte)
3208 len += 2;
3209 else
3210 ++len;
3211 if (state->c != NFA_ANY)
3212 {
3213 // skip over the characters
3214 state = state->out1->out;
3215 continue;
3216 }
3217 break;
3218
3219 case NFA_DIGIT:
3220 case NFA_WHITE:
3221 case NFA_HEX:
3222 case NFA_OCTAL:
3223 // ascii
3224 ++len;
3225 break;
3226
3227 case NFA_IDENT:
3228 case NFA_SIDENT:
3229 case NFA_KWORD:
3230 case NFA_SKWORD:
3231 case NFA_FNAME:
3232 case NFA_SFNAME:
3233 case NFA_PRINT:
3234 case NFA_SPRINT:
3235 case NFA_NWHITE:
3236 case NFA_NDIGIT:
3237 case NFA_NHEX:
3238 case NFA_NOCTAL:
3239 case NFA_WORD:
3240 case NFA_NWORD:
3241 case NFA_HEAD:
3242 case NFA_NHEAD:
3243 case NFA_ALPHA:
3244 case NFA_NALPHA:
3245 case NFA_LOWER:
3246 case NFA_NLOWER:
3247 case NFA_UPPER:
3248 case NFA_NUPPER:
3249 case NFA_LOWER_IC:
3250 case NFA_NLOWER_IC:
3251 case NFA_UPPER_IC:
3252 case NFA_NUPPER_IC:
3253 case NFA_ANY_COMPOSING:
3254 // possibly non-ascii
3255 if (has_mbyte)
3256 len += 3;
3257 else
3258 ++len;
3259 break;
3260
3261 case NFA_START_INVISIBLE:
3262 case NFA_START_INVISIBLE_NEG:
3263 case NFA_START_INVISIBLE_BEFORE:
3264 case NFA_START_INVISIBLE_BEFORE_NEG:
3265 // zero-width, out1 points to the END state
3266 state = state->out1->out;
3267 continue;
3268
3269 case NFA_BACKREF1:
3270 case NFA_BACKREF2:
3271 case NFA_BACKREF3:
3272 case NFA_BACKREF4:
3273 case NFA_BACKREF5:
3274 case NFA_BACKREF6:
3275 case NFA_BACKREF7:
3276 case NFA_BACKREF8:
3277 case NFA_BACKREF9:
3278 #ifdef FEAT_SYN_HL
3279 case NFA_ZREF1:
3280 case NFA_ZREF2:
3281 case NFA_ZREF3:
3282 case NFA_ZREF4:
3283 case NFA_ZREF5:
3284 case NFA_ZREF6:
3285 case NFA_ZREF7:
3286 case NFA_ZREF8:
3287 case NFA_ZREF9:
3288 #endif
3289 case NFA_NEWL:
3290 case NFA_SKIP:
3291 // unknown width
3292 return -1;
3293
3294 case NFA_BOL:
3295 case NFA_EOL:
3296 case NFA_BOF:
3297 case NFA_EOF:
3298 case NFA_BOW:
3299 case NFA_EOW:
3300 case NFA_MOPEN:
3301 case NFA_MOPEN1:
3302 case NFA_MOPEN2:
3303 case NFA_MOPEN3:
3304 case NFA_MOPEN4:
3305 case NFA_MOPEN5:
3306 case NFA_MOPEN6:
3307 case NFA_MOPEN7:
3308 case NFA_MOPEN8:
3309 case NFA_MOPEN9:
3310 #ifdef FEAT_SYN_HL
3311 case NFA_ZOPEN:
3312 case NFA_ZOPEN1:
3313 case NFA_ZOPEN2:
3314 case NFA_ZOPEN3:
3315 case NFA_ZOPEN4:
3316 case NFA_ZOPEN5:
3317 case NFA_ZOPEN6:
3318 case NFA_ZOPEN7:
3319 case NFA_ZOPEN8:
3320 case NFA_ZOPEN9:
3321 case NFA_ZCLOSE:
3322 case NFA_ZCLOSE1:
3323 case NFA_ZCLOSE2:
3324 case NFA_ZCLOSE3:
3325 case NFA_ZCLOSE4:
3326 case NFA_ZCLOSE5:
3327 case NFA_ZCLOSE6:
3328 case NFA_ZCLOSE7:
3329 case NFA_ZCLOSE8:
3330 case NFA_ZCLOSE9:
3331 #endif
3332 case NFA_MCLOSE:
3333 case NFA_MCLOSE1:
3334 case NFA_MCLOSE2:
3335 case NFA_MCLOSE3:
3336 case NFA_MCLOSE4:
3337 case NFA_MCLOSE5:
3338 case NFA_MCLOSE6:
3339 case NFA_MCLOSE7:
3340 case NFA_MCLOSE8:
3341 case NFA_MCLOSE9:
3342 case NFA_NOPEN:
3343 case NFA_NCLOSE:
3344
3345 case NFA_LNUM_GT:
3346 case NFA_LNUM_LT:
3347 case NFA_COL_GT:
3348 case NFA_COL_LT:
3349 case NFA_VCOL_GT:
3350 case NFA_VCOL_LT:
3351 case NFA_MARK_GT:
3352 case NFA_MARK_LT:
3353 case NFA_VISUAL:
3354 case NFA_LNUM:
3355 case NFA_CURSOR:
3356 case NFA_COL:
3357 case NFA_VCOL:
3358 case NFA_MARK:
3359
3360 case NFA_ZSTART:
3361 case NFA_ZEND:
3362 case NFA_OPT_CHARS:
3363 case NFA_EMPTY:
3364 case NFA_START_PATTERN:
3365 case NFA_END_PATTERN:
3366 case NFA_COMPOSING:
3367 case NFA_END_COMPOSING:
3368 // zero-width
3369 break;
3370
3371 default:
3372 if (state->c < 0)
3373 // don't know what this is
3374 return -1;
3375 // normal character
3376 len += MB_CHAR2LEN(state->c);
3377 break;
3378 }
3379
3380 // normal way to continue
3381 state = state->out;
3382 }
3383
3384 // unrecognized, "cannot happen"
3385 return -1;
3386 }
3387
3388 /*
3389 * Convert a postfix form into its equivalent NFA.
3390 * Return the NFA start state on success, NULL otherwise.
3391 */
3392 static nfa_state_T *
post2nfa(int * postfix,int * end,int nfa_calc_size)3393 post2nfa(int *postfix, int *end, int nfa_calc_size)
3394 {
3395 int *p;
3396 int mopen;
3397 int mclose;
3398 Frag_T *stack = NULL;
3399 Frag_T *stackp = NULL;
3400 Frag_T *stack_end = NULL;
3401 Frag_T e1;
3402 Frag_T e2;
3403 Frag_T e;
3404 nfa_state_T *s;
3405 nfa_state_T *s1;
3406 nfa_state_T *matchstate;
3407 nfa_state_T *ret = NULL;
3408
3409 if (postfix == NULL)
3410 return NULL;
3411
3412 #define PUSH(s) st_push((s), &stackp, stack_end)
3413 #define POP() st_pop(&stackp, stack); \
3414 if (stackp < stack) \
3415 { \
3416 st_error(postfix, end, p); \
3417 vim_free(stack); \
3418 return NULL; \
3419 }
3420
3421 if (nfa_calc_size == FALSE)
3422 {
3423 // Allocate space for the stack. Max states on the stack: "nstate".
3424 stack = ALLOC_MULT(Frag_T, nstate + 1);
3425 if (stack == NULL)
3426 return NULL;
3427 stackp = stack;
3428 stack_end = stack + (nstate + 1);
3429 }
3430
3431 for (p = postfix; p < end; ++p)
3432 {
3433 switch (*p)
3434 {
3435 case NFA_CONCAT:
3436 // Concatenation.
3437 // Pay attention: this operator does not exist in the r.e. itself
3438 // (it is implicit, really). It is added when r.e. is translated
3439 // to postfix form in re2post().
3440 if (nfa_calc_size == TRUE)
3441 {
3442 // nstate += 0;
3443 break;
3444 }
3445 e2 = POP();
3446 e1 = POP();
3447 patch(e1.out, e2.start);
3448 PUSH(frag(e1.start, e2.out));
3449 break;
3450
3451 case NFA_OR:
3452 // Alternation
3453 if (nfa_calc_size == TRUE)
3454 {
3455 nstate++;
3456 break;
3457 }
3458 e2 = POP();
3459 e1 = POP();
3460 s = alloc_state(NFA_SPLIT, e1.start, e2.start);
3461 if (s == NULL)
3462 goto theend;
3463 PUSH(frag(s, append(e1.out, e2.out)));
3464 break;
3465
3466 case NFA_STAR:
3467 // Zero or more, prefer more
3468 if (nfa_calc_size == TRUE)
3469 {
3470 nstate++;
3471 break;
3472 }
3473 e = POP();
3474 s = alloc_state(NFA_SPLIT, e.start, NULL);
3475 if (s == NULL)
3476 goto theend;
3477 patch(e.out, s);
3478 PUSH(frag(s, list1(&s->out1)));
3479 break;
3480
3481 case NFA_STAR_NONGREEDY:
3482 // Zero or more, prefer zero
3483 if (nfa_calc_size == TRUE)
3484 {
3485 nstate++;
3486 break;
3487 }
3488 e = POP();
3489 s = alloc_state(NFA_SPLIT, NULL, e.start);
3490 if (s == NULL)
3491 goto theend;
3492 patch(e.out, s);
3493 PUSH(frag(s, list1(&s->out)));
3494 break;
3495
3496 case NFA_QUEST:
3497 // one or zero atoms=> greedy match
3498 if (nfa_calc_size == TRUE)
3499 {
3500 nstate++;
3501 break;
3502 }
3503 e = POP();
3504 s = alloc_state(NFA_SPLIT, e.start, NULL);
3505 if (s == NULL)
3506 goto theend;
3507 PUSH(frag(s, append(e.out, list1(&s->out1))));
3508 break;
3509
3510 case NFA_QUEST_NONGREEDY:
3511 // zero or one atoms => non-greedy match
3512 if (nfa_calc_size == TRUE)
3513 {
3514 nstate++;
3515 break;
3516 }
3517 e = POP();
3518 s = alloc_state(NFA_SPLIT, NULL, e.start);
3519 if (s == NULL)
3520 goto theend;
3521 PUSH(frag(s, append(e.out, list1(&s->out))));
3522 break;
3523
3524 case NFA_END_COLL:
3525 case NFA_END_NEG_COLL:
3526 // On the stack is the sequence starting with NFA_START_COLL or
3527 // NFA_START_NEG_COLL and all possible characters. Patch it to
3528 // add the output to the start.
3529 if (nfa_calc_size == TRUE)
3530 {
3531 nstate++;
3532 break;
3533 }
3534 e = POP();
3535 s = alloc_state(NFA_END_COLL, NULL, NULL);
3536 if (s == NULL)
3537 goto theend;
3538 patch(e.out, s);
3539 e.start->out1 = s;
3540 PUSH(frag(e.start, list1(&s->out)));
3541 break;
3542
3543 case NFA_RANGE:
3544 // Before this are two characters, the low and high end of a
3545 // range. Turn them into two states with MIN and MAX.
3546 if (nfa_calc_size == TRUE)
3547 {
3548 // nstate += 0;
3549 break;
3550 }
3551 e2 = POP();
3552 e1 = POP();
3553 e2.start->val = e2.start->c;
3554 e2.start->c = NFA_RANGE_MAX;
3555 e1.start->val = e1.start->c;
3556 e1.start->c = NFA_RANGE_MIN;
3557 patch(e1.out, e2.start);
3558 PUSH(frag(e1.start, e2.out));
3559 break;
3560
3561 case NFA_EMPTY:
3562 // 0-length, used in a repetition with max/min count of 0
3563 if (nfa_calc_size == TRUE)
3564 {
3565 nstate++;
3566 break;
3567 }
3568 s = alloc_state(NFA_EMPTY, NULL, NULL);
3569 if (s == NULL)
3570 goto theend;
3571 PUSH(frag(s, list1(&s->out)));
3572 break;
3573
3574 case NFA_OPT_CHARS:
3575 {
3576 int n;
3577
3578 // \%[abc] implemented as:
3579 // NFA_SPLIT
3580 // +-CHAR(a)
3581 // | +-NFA_SPLIT
3582 // | +-CHAR(b)
3583 // | | +-NFA_SPLIT
3584 // | | +-CHAR(c)
3585 // | | | +-next
3586 // | | +- next
3587 // | +- next
3588 // +- next
3589 n = *++p; // get number of characters
3590 if (nfa_calc_size == TRUE)
3591 {
3592 nstate += n;
3593 break;
3594 }
3595 s = NULL; // avoid compiler warning
3596 e1.out = NULL; // stores list with out1's
3597 s1 = NULL; // previous NFA_SPLIT to connect to
3598 while (n-- > 0)
3599 {
3600 e = POP(); // get character
3601 s = alloc_state(NFA_SPLIT, e.start, NULL);
3602 if (s == NULL)
3603 goto theend;
3604 if (e1.out == NULL)
3605 e1 = e;
3606 patch(e.out, s1);
3607 append(e1.out, list1(&s->out1));
3608 s1 = s;
3609 }
3610 PUSH(frag(s, e1.out));
3611 break;
3612 }
3613
3614 case NFA_PREV_ATOM_NO_WIDTH:
3615 case NFA_PREV_ATOM_NO_WIDTH_NEG:
3616 case NFA_PREV_ATOM_JUST_BEFORE:
3617 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3618 case NFA_PREV_ATOM_LIKE_PATTERN:
3619 {
3620 int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
3621 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
3622 int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
3623 int start_state;
3624 int end_state;
3625 int n = 0;
3626 nfa_state_T *zend;
3627 nfa_state_T *skip;
3628
3629 switch (*p)
3630 {
3631 case NFA_PREV_ATOM_NO_WIDTH:
3632 start_state = NFA_START_INVISIBLE;
3633 end_state = NFA_END_INVISIBLE;
3634 break;
3635 case NFA_PREV_ATOM_NO_WIDTH_NEG:
3636 start_state = NFA_START_INVISIBLE_NEG;
3637 end_state = NFA_END_INVISIBLE_NEG;
3638 break;
3639 case NFA_PREV_ATOM_JUST_BEFORE:
3640 start_state = NFA_START_INVISIBLE_BEFORE;
3641 end_state = NFA_END_INVISIBLE;
3642 break;
3643 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3644 start_state = NFA_START_INVISIBLE_BEFORE_NEG;
3645 end_state = NFA_END_INVISIBLE_NEG;
3646 break;
3647 default: // NFA_PREV_ATOM_LIKE_PATTERN:
3648 start_state = NFA_START_PATTERN;
3649 end_state = NFA_END_PATTERN;
3650 break;
3651 }
3652
3653 if (before)
3654 n = *++p; // get the count
3655
3656 // The \@= operator: match the preceding atom with zero width.
3657 // The \@! operator: no match for the preceding atom.
3658 // The \@<= operator: match for the preceding atom.
3659 // The \@<! operator: no match for the preceding atom.
3660 // Surrounds the preceding atom with START_INVISIBLE and
3661 // END_INVISIBLE, similarly to MOPEN.
3662
3663 if (nfa_calc_size == TRUE)
3664 {
3665 nstate += pattern ? 4 : 2;
3666 break;
3667 }
3668 e = POP();
3669 s1 = alloc_state(end_state, NULL, NULL);
3670 if (s1 == NULL)
3671 goto theend;
3672
3673 s = alloc_state(start_state, e.start, s1);
3674 if (s == NULL)
3675 goto theend;
3676 if (pattern)
3677 {
3678 // NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows.
3679 skip = alloc_state(NFA_SKIP, NULL, NULL);
3680 if (skip == NULL)
3681 goto theend;
3682 zend = alloc_state(NFA_ZEND, s1, NULL);
3683 if (zend == NULL)
3684 goto theend;
3685 s1->out= skip;
3686 patch(e.out, zend);
3687 PUSH(frag(s, list1(&skip->out)));
3688 }
3689 else
3690 {
3691 patch(e.out, s1);
3692 PUSH(frag(s, list1(&s1->out)));
3693 if (before)
3694 {
3695 if (n <= 0)
3696 // See if we can guess the maximum width, it avoids a
3697 // lot of pointless tries.
3698 n = nfa_max_width(e.start, 0);
3699 s->val = n; // store the count
3700 }
3701 }
3702 break;
3703 }
3704
3705 case NFA_COMPOSING: // char with composing char
3706 #if 0
3707 // TODO
3708 if (regflags & RF_ICOMBINE)
3709 {
3710 // use the base character only
3711 }
3712 #endif
3713 // FALLTHROUGH
3714
3715 case NFA_MOPEN: // \( \) Submatch
3716 case NFA_MOPEN1:
3717 case NFA_MOPEN2:
3718 case NFA_MOPEN3:
3719 case NFA_MOPEN4:
3720 case NFA_MOPEN5:
3721 case NFA_MOPEN6:
3722 case NFA_MOPEN7:
3723 case NFA_MOPEN8:
3724 case NFA_MOPEN9:
3725 #ifdef FEAT_SYN_HL
3726 case NFA_ZOPEN: // \z( \) Submatch
3727 case NFA_ZOPEN1:
3728 case NFA_ZOPEN2:
3729 case NFA_ZOPEN3:
3730 case NFA_ZOPEN4:
3731 case NFA_ZOPEN5:
3732 case NFA_ZOPEN6:
3733 case NFA_ZOPEN7:
3734 case NFA_ZOPEN8:
3735 case NFA_ZOPEN9:
3736 #endif
3737 case NFA_NOPEN: // \%( \) "Invisible Submatch"
3738 if (nfa_calc_size == TRUE)
3739 {
3740 nstate += 2;
3741 break;
3742 }
3743
3744 mopen = *p;
3745 switch (*p)
3746 {
3747 case NFA_NOPEN: mclose = NFA_NCLOSE; break;
3748 #ifdef FEAT_SYN_HL
3749 case NFA_ZOPEN: mclose = NFA_ZCLOSE; break;
3750 case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break;
3751 case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break;
3752 case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break;
3753 case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break;
3754 case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break;
3755 case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break;
3756 case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break;
3757 case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break;
3758 case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break;
3759 #endif
3760 case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break;
3761 default:
3762 // NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9
3763 mclose = *p + NSUBEXP;
3764 break;
3765 }
3766
3767 // Allow "NFA_MOPEN" as a valid postfix representation for
3768 // the empty regexp "". In this case, the NFA will be
3769 // NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
3770 // empty groups of parenthesis, and empty mbyte chars
3771 if (stackp == stack)
3772 {
3773 s = alloc_state(mopen, NULL, NULL);
3774 if (s == NULL)
3775 goto theend;
3776 s1 = alloc_state(mclose, NULL, NULL);
3777 if (s1 == NULL)
3778 goto theend;
3779 patch(list1(&s->out), s1);
3780 PUSH(frag(s, list1(&s1->out)));
3781 break;
3782 }
3783
3784 // At least one node was emitted before NFA_MOPEN, so
3785 // at least one node will be between NFA_MOPEN and NFA_MCLOSE
3786 e = POP();
3787 s = alloc_state(mopen, e.start, NULL); // `('
3788 if (s == NULL)
3789 goto theend;
3790
3791 s1 = alloc_state(mclose, NULL, NULL); // `)'
3792 if (s1 == NULL)
3793 goto theend;
3794 patch(e.out, s1);
3795
3796 if (mopen == NFA_COMPOSING)
3797 // COMPOSING->out1 = END_COMPOSING
3798 patch(list1(&s->out1), s1);
3799
3800 PUSH(frag(s, list1(&s1->out)));
3801 break;
3802
3803 case NFA_BACKREF1:
3804 case NFA_BACKREF2:
3805 case NFA_BACKREF3:
3806 case NFA_BACKREF4:
3807 case NFA_BACKREF5:
3808 case NFA_BACKREF6:
3809 case NFA_BACKREF7:
3810 case NFA_BACKREF8:
3811 case NFA_BACKREF9:
3812 #ifdef FEAT_SYN_HL
3813 case NFA_ZREF1:
3814 case NFA_ZREF2:
3815 case NFA_ZREF3:
3816 case NFA_ZREF4:
3817 case NFA_ZREF5:
3818 case NFA_ZREF6:
3819 case NFA_ZREF7:
3820 case NFA_ZREF8:
3821 case NFA_ZREF9:
3822 #endif
3823 if (nfa_calc_size == TRUE)
3824 {
3825 nstate += 2;
3826 break;
3827 }
3828 s = alloc_state(*p, NULL, NULL);
3829 if (s == NULL)
3830 goto theend;
3831 s1 = alloc_state(NFA_SKIP, NULL, NULL);
3832 if (s1 == NULL)
3833 goto theend;
3834 patch(list1(&s->out), s1);
3835 PUSH(frag(s, list1(&s1->out)));
3836 break;
3837
3838 case NFA_LNUM:
3839 case NFA_LNUM_GT:
3840 case NFA_LNUM_LT:
3841 case NFA_VCOL:
3842 case NFA_VCOL_GT:
3843 case NFA_VCOL_LT:
3844 case NFA_COL:
3845 case NFA_COL_GT:
3846 case NFA_COL_LT:
3847 case NFA_MARK:
3848 case NFA_MARK_GT:
3849 case NFA_MARK_LT:
3850 {
3851 int n = *++p; // lnum, col or mark name
3852
3853 if (nfa_calc_size == TRUE)
3854 {
3855 nstate += 1;
3856 break;
3857 }
3858 s = alloc_state(p[-1], NULL, NULL);
3859 if (s == NULL)
3860 goto theend;
3861 s->val = n;
3862 PUSH(frag(s, list1(&s->out)));
3863 break;
3864 }
3865
3866 case NFA_ZSTART:
3867 case NFA_ZEND:
3868 default:
3869 // Operands
3870 if (nfa_calc_size == TRUE)
3871 {
3872 nstate++;
3873 break;
3874 }
3875 s = alloc_state(*p, NULL, NULL);
3876 if (s == NULL)
3877 goto theend;
3878 PUSH(frag(s, list1(&s->out)));
3879 break;
3880
3881 } // switch(*p)
3882
3883 } // for(p = postfix; *p; ++p)
3884
3885 if (nfa_calc_size == TRUE)
3886 {
3887 nstate++;
3888 goto theend; // Return value when counting size is ignored anyway
3889 }
3890
3891 e = POP();
3892 if (stackp != stack)
3893 {
3894 vim_free(stack);
3895 EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA), too many states left on stack"));
3896 }
3897
3898 if (istate >= nstate)
3899 {
3900 vim_free(stack);
3901 EMSG_RET_NULL(_("E876: (NFA regexp) Not enough space to store the whole NFA "));
3902 }
3903
3904 matchstate = &state_ptr[istate++]; // the match state
3905 matchstate->c = NFA_MATCH;
3906 matchstate->out = matchstate->out1 = NULL;
3907 matchstate->id = 0;
3908
3909 patch(e.out, matchstate);
3910 ret = e.start;
3911
3912 theend:
3913 vim_free(stack);
3914 return ret;
3915
3916 #undef POP1
3917 #undef PUSH1
3918 #undef POP2
3919 #undef PUSH2
3920 #undef POP
3921 #undef PUSH
3922 }
3923
3924 /*
3925 * After building the NFA program, inspect it to add optimization hints.
3926 */
3927 static void
nfa_postprocess(nfa_regprog_T * prog)3928 nfa_postprocess(nfa_regprog_T *prog)
3929 {
3930 int i;
3931 int c;
3932
3933 for (i = 0; i < prog->nstate; ++i)
3934 {
3935 c = prog->state[i].c;
3936 if (c == NFA_START_INVISIBLE
3937 || c == NFA_START_INVISIBLE_NEG
3938 || c == NFA_START_INVISIBLE_BEFORE
3939 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3940 {
3941 int directly;
3942
3943 // Do it directly when what follows is possibly the end of the
3944 // match.
3945 if (match_follows(prog->state[i].out1->out, 0))
3946 directly = TRUE;
3947 else
3948 {
3949 int ch_invisible = failure_chance(prog->state[i].out, 0);
3950 int ch_follows = failure_chance(prog->state[i].out1->out, 0);
3951
3952 // Postpone when the invisible match is expensive or has a
3953 // lower chance of failing.
3954 if (c == NFA_START_INVISIBLE_BEFORE
3955 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3956 {
3957 // "before" matches are very expensive when
3958 // unbounded, always prefer what follows then,
3959 // unless what follows will always match.
3960 // Otherwise strongly prefer what follows.
3961 if (prog->state[i].val <= 0 && ch_follows > 0)
3962 directly = FALSE;
3963 else
3964 directly = ch_follows * 10 < ch_invisible;
3965 }
3966 else
3967 {
3968 // normal invisible, first do the one with the
3969 // highest failure chance
3970 directly = ch_follows < ch_invisible;
3971 }
3972 }
3973 if (directly)
3974 // switch to the _FIRST state
3975 ++prog->state[i].c;
3976 }
3977 }
3978 }
3979
3980 /////////////////////////////////////////////////////////////////
3981 // NFA execution code.
3982 /////////////////////////////////////////////////////////////////
3983
3984 typedef struct
3985 {
3986 int in_use; // number of subexpr with useful info
3987
3988 // When REG_MULTI is TRUE list.multi is used, otherwise list.line.
3989 union
3990 {
3991 struct multipos
3992 {
3993 linenr_T start_lnum;
3994 linenr_T end_lnum;
3995 colnr_T start_col;
3996 colnr_T end_col;
3997 } multi[NSUBEXP];
3998 struct linepos
3999 {
4000 char_u *start;
4001 char_u *end;
4002 } line[NSUBEXP];
4003 } list;
4004 } regsub_T;
4005
4006 typedef struct
4007 {
4008 regsub_T norm; // \( .. \) matches
4009 #ifdef FEAT_SYN_HL
4010 regsub_T synt; // \z( .. \) matches
4011 #endif
4012 } regsubs_T;
4013
4014 // nfa_pim_T stores a Postponed Invisible Match.
4015 typedef struct nfa_pim_S nfa_pim_T;
4016 struct nfa_pim_S
4017 {
4018 int result; // NFA_PIM_*, see below
4019 nfa_state_T *state; // the invisible match start state
4020 regsubs_T subs; // submatch info, only party used
4021 union
4022 {
4023 lpos_T pos;
4024 char_u *ptr;
4025 } end; // where the match must end
4026 };
4027
4028 // Values for done in nfa_pim_T.
4029 #define NFA_PIM_UNUSED 0 // pim not used
4030 #define NFA_PIM_TODO 1 // pim not done yet
4031 #define NFA_PIM_MATCH 2 // pim executed, matches
4032 #define NFA_PIM_NOMATCH 3 // pim executed, no match
4033
4034
4035 // nfa_thread_T contains execution information of a NFA state
4036 typedef struct
4037 {
4038 nfa_state_T *state;
4039 int count;
4040 nfa_pim_T pim; // if pim.result != NFA_PIM_UNUSED: postponed
4041 // invisible match
4042 regsubs_T subs; // submatch info, only party used
4043 } nfa_thread_T;
4044
4045 // nfa_list_T contains the alternative NFA execution states.
4046 typedef struct
4047 {
4048 nfa_thread_T *t; // allocated array of states
4049 int n; // nr of states currently in "t"
4050 int len; // max nr of states in "t"
4051 int id; // ID of the list
4052 int has_pim; // TRUE when any state has a PIM
4053 } nfa_list_T;
4054
4055 #ifdef ENABLE_LOG
4056 static void log_subexpr(regsub_T *sub);
4057
4058 static void
log_subsexpr(regsubs_T * subs)4059 log_subsexpr(regsubs_T *subs)
4060 {
4061 log_subexpr(&subs->norm);
4062 # ifdef FEAT_SYN_HL
4063 if (rex.nfa_has_zsubexpr)
4064 log_subexpr(&subs->synt);
4065 # endif
4066 }
4067
4068 static void
log_subexpr(regsub_T * sub)4069 log_subexpr(regsub_T *sub)
4070 {
4071 int j;
4072
4073 for (j = 0; j < sub->in_use; j++)
4074 if (REG_MULTI)
4075 fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
4076 j,
4077 sub->list.multi[j].start_col,
4078 (int)sub->list.multi[j].start_lnum,
4079 sub->list.multi[j].end_col,
4080 (int)sub->list.multi[j].end_lnum);
4081 else
4082 {
4083 char *s = (char *)sub->list.line[j].start;
4084 char *e = (char *)sub->list.line[j].end;
4085
4086 fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
4087 j,
4088 s == NULL ? "NULL" : s,
4089 e == NULL ? "NULL" : e);
4090 }
4091 }
4092
4093 static char *
pim_info(nfa_pim_T * pim)4094 pim_info(nfa_pim_T *pim)
4095 {
4096 static char buf[30];
4097
4098 if (pim == NULL || pim->result == NFA_PIM_UNUSED)
4099 buf[0] = NUL;
4100 else
4101 {
4102 sprintf(buf, " PIM col %d", REG_MULTI ? (int)pim->end.pos.col
4103 : (int)(pim->end.ptr - rex.input));
4104 }
4105 return buf;
4106 }
4107
4108 #endif
4109
4110 // Used during execution: whether a match has been found.
4111 static int nfa_match;
4112 #ifdef FEAT_RELTIME
4113 static proftime_T *nfa_time_limit;
4114 static int *nfa_timed_out;
4115 static int nfa_time_count;
4116 #endif
4117
4118 static void copy_sub(regsub_T *to, regsub_T *from);
4119 static int pim_equal(nfa_pim_T *one, nfa_pim_T *two);
4120
4121 /*
4122 * Copy postponed invisible match info from "from" to "to".
4123 */
4124 static void
copy_pim(nfa_pim_T * to,nfa_pim_T * from)4125 copy_pim(nfa_pim_T *to, nfa_pim_T *from)
4126 {
4127 to->result = from->result;
4128 to->state = from->state;
4129 copy_sub(&to->subs.norm, &from->subs.norm);
4130 #ifdef FEAT_SYN_HL
4131 if (rex.nfa_has_zsubexpr)
4132 copy_sub(&to->subs.synt, &from->subs.synt);
4133 #endif
4134 to->end = from->end;
4135 }
4136
4137 static void
clear_sub(regsub_T * sub)4138 clear_sub(regsub_T *sub)
4139 {
4140 if (REG_MULTI)
4141 // Use 0xff to set lnum to -1
4142 vim_memset(sub->list.multi, 0xff,
4143 sizeof(struct multipos) * rex.nfa_nsubexpr);
4144 else
4145 vim_memset(sub->list.line, 0,
4146 sizeof(struct linepos) * rex.nfa_nsubexpr);
4147 sub->in_use = 0;
4148 }
4149
4150 /*
4151 * Copy the submatches from "from" to "to".
4152 */
4153 static void
copy_sub(regsub_T * to,regsub_T * from)4154 copy_sub(regsub_T *to, regsub_T *from)
4155 {
4156 to->in_use = from->in_use;
4157 if (from->in_use > 0)
4158 {
4159 // Copy the match start and end positions.
4160 if (REG_MULTI)
4161 mch_memmove(&to->list.multi[0],
4162 &from->list.multi[0],
4163 sizeof(struct multipos) * from->in_use);
4164 else
4165 mch_memmove(&to->list.line[0],
4166 &from->list.line[0],
4167 sizeof(struct linepos) * from->in_use);
4168 }
4169 }
4170
4171 /*
4172 * Like copy_sub() but exclude the main match.
4173 */
4174 static void
copy_sub_off(regsub_T * to,regsub_T * from)4175 copy_sub_off(regsub_T *to, regsub_T *from)
4176 {
4177 if (to->in_use < from->in_use)
4178 to->in_use = from->in_use;
4179 if (from->in_use > 1)
4180 {
4181 // Copy the match start and end positions.
4182 if (REG_MULTI)
4183 mch_memmove(&to->list.multi[1],
4184 &from->list.multi[1],
4185 sizeof(struct multipos) * (from->in_use - 1));
4186 else
4187 mch_memmove(&to->list.line[1],
4188 &from->list.line[1],
4189 sizeof(struct linepos) * (from->in_use - 1));
4190 }
4191 }
4192
4193 /*
4194 * Like copy_sub() but only do the end of the main match if \ze is present.
4195 */
4196 static void
copy_ze_off(regsub_T * to,regsub_T * from)4197 copy_ze_off(regsub_T *to, regsub_T *from)
4198 {
4199 if (rex.nfa_has_zend)
4200 {
4201 if (REG_MULTI)
4202 {
4203 if (from->list.multi[0].end_lnum >= 0)
4204 {
4205 to->list.multi[0].end_lnum = from->list.multi[0].end_lnum;
4206 to->list.multi[0].end_col = from->list.multi[0].end_col;
4207 }
4208 }
4209 else
4210 {
4211 if (from->list.line[0].end != NULL)
4212 to->list.line[0].end = from->list.line[0].end;
4213 }
4214 }
4215 }
4216
4217 /*
4218 * Return TRUE if "sub1" and "sub2" have the same start positions.
4219 * When using back-references also check the end position.
4220 */
4221 static int
sub_equal(regsub_T * sub1,regsub_T * sub2)4222 sub_equal(regsub_T *sub1, regsub_T *sub2)
4223 {
4224 int i;
4225 int todo;
4226 linenr_T s1;
4227 linenr_T s2;
4228 char_u *sp1;
4229 char_u *sp2;
4230
4231 todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
4232 if (REG_MULTI)
4233 {
4234 for (i = 0; i < todo; ++i)
4235 {
4236 if (i < sub1->in_use)
4237 s1 = sub1->list.multi[i].start_lnum;
4238 else
4239 s1 = -1;
4240 if (i < sub2->in_use)
4241 s2 = sub2->list.multi[i].start_lnum;
4242 else
4243 s2 = -1;
4244 if (s1 != s2)
4245 return FALSE;
4246 if (s1 != -1 && sub1->list.multi[i].start_col
4247 != sub2->list.multi[i].start_col)
4248 return FALSE;
4249
4250 if (rex.nfa_has_backref)
4251 {
4252 if (i < sub1->in_use)
4253 s1 = sub1->list.multi[i].end_lnum;
4254 else
4255 s1 = -1;
4256 if (i < sub2->in_use)
4257 s2 = sub2->list.multi[i].end_lnum;
4258 else
4259 s2 = -1;
4260 if (s1 != s2)
4261 return FALSE;
4262 if (s1 != -1 && sub1->list.multi[i].end_col
4263 != sub2->list.multi[i].end_col)
4264 return FALSE;
4265 }
4266 }
4267 }
4268 else
4269 {
4270 for (i = 0; i < todo; ++i)
4271 {
4272 if (i < sub1->in_use)
4273 sp1 = sub1->list.line[i].start;
4274 else
4275 sp1 = NULL;
4276 if (i < sub2->in_use)
4277 sp2 = sub2->list.line[i].start;
4278 else
4279 sp2 = NULL;
4280 if (sp1 != sp2)
4281 return FALSE;
4282 if (rex.nfa_has_backref)
4283 {
4284 if (i < sub1->in_use)
4285 sp1 = sub1->list.line[i].end;
4286 else
4287 sp1 = NULL;
4288 if (i < sub2->in_use)
4289 sp2 = sub2->list.line[i].end;
4290 else
4291 sp2 = NULL;
4292 if (sp1 != sp2)
4293 return FALSE;
4294 }
4295 }
4296 }
4297
4298 return TRUE;
4299 }
4300
4301 #ifdef ENABLE_LOG
4302 static void
report_state(char * action,regsub_T * sub,nfa_state_T * state,int lid,nfa_pim_T * pim)4303 report_state(char *action,
4304 regsub_T *sub,
4305 nfa_state_T *state,
4306 int lid,
4307 nfa_pim_T *pim)
4308 {
4309 int col;
4310
4311 if (sub->in_use <= 0)
4312 col = -1;
4313 else if (REG_MULTI)
4314 col = sub->list.multi[0].start_col;
4315 else
4316 col = (int)(sub->list.line[0].start - rex.line);
4317 nfa_set_code(state->c);
4318 fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n",
4319 action, abs(state->id), lid, state->c, code, col,
4320 pim_info(pim));
4321 }
4322 #endif
4323
4324 /*
4325 * Return TRUE if the same state is already in list "l" with the same
4326 * positions as "subs".
4327 */
4328 static int
has_state_with_pos(nfa_list_T * l,nfa_state_T * state,regsubs_T * subs,nfa_pim_T * pim)4329 has_state_with_pos(
4330 nfa_list_T *l, // runtime state list
4331 nfa_state_T *state, // state to update
4332 regsubs_T *subs, // pointers to subexpressions
4333 nfa_pim_T *pim) // postponed match or NULL
4334 {
4335 nfa_thread_T *thread;
4336 int i;
4337
4338 for (i = 0; i < l->n; ++i)
4339 {
4340 thread = &l->t[i];
4341 if (thread->state->id == state->id
4342 && sub_equal(&thread->subs.norm, &subs->norm)
4343 #ifdef FEAT_SYN_HL
4344 && (!rex.nfa_has_zsubexpr
4345 || sub_equal(&thread->subs.synt, &subs->synt))
4346 #endif
4347 && pim_equal(&thread->pim, pim))
4348 return TRUE;
4349 }
4350 return FALSE;
4351 }
4352
4353 /*
4354 * Return TRUE if "one" and "two" are equal. That includes when both are not
4355 * set.
4356 */
4357 static int
pim_equal(nfa_pim_T * one,nfa_pim_T * two)4358 pim_equal(nfa_pim_T *one, nfa_pim_T *two)
4359 {
4360 int one_unused = (one == NULL || one->result == NFA_PIM_UNUSED);
4361 int two_unused = (two == NULL || two->result == NFA_PIM_UNUSED);
4362
4363 if (one_unused)
4364 // one is unused: equal when two is also unused
4365 return two_unused;
4366 if (two_unused)
4367 // one is used and two is not: not equal
4368 return FALSE;
4369 // compare the state id
4370 if (one->state->id != two->state->id)
4371 return FALSE;
4372 // compare the position
4373 if (REG_MULTI)
4374 return one->end.pos.lnum == two->end.pos.lnum
4375 && one->end.pos.col == two->end.pos.col;
4376 return one->end.ptr == two->end.ptr;
4377 }
4378
4379 /*
4380 * Return TRUE if "state" leads to a NFA_MATCH without advancing the input.
4381 */
4382 static int
match_follows(nfa_state_T * startstate,int depth)4383 match_follows(nfa_state_T *startstate, int depth)
4384 {
4385 nfa_state_T *state = startstate;
4386
4387 // avoid too much recursion
4388 if (depth > 10)
4389 return FALSE;
4390
4391 while (state != NULL)
4392 {
4393 switch (state->c)
4394 {
4395 case NFA_MATCH:
4396 case NFA_MCLOSE:
4397 case NFA_END_INVISIBLE:
4398 case NFA_END_INVISIBLE_NEG:
4399 case NFA_END_PATTERN:
4400 return TRUE;
4401
4402 case NFA_SPLIT:
4403 return match_follows(state->out, depth + 1)
4404 || match_follows(state->out1, depth + 1);
4405
4406 case NFA_START_INVISIBLE:
4407 case NFA_START_INVISIBLE_FIRST:
4408 case NFA_START_INVISIBLE_BEFORE:
4409 case NFA_START_INVISIBLE_BEFORE_FIRST:
4410 case NFA_START_INVISIBLE_NEG:
4411 case NFA_START_INVISIBLE_NEG_FIRST:
4412 case NFA_START_INVISIBLE_BEFORE_NEG:
4413 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
4414 case NFA_COMPOSING:
4415 // skip ahead to next state
4416 state = state->out1->out;
4417 continue;
4418
4419 case NFA_ANY:
4420 case NFA_ANY_COMPOSING:
4421 case NFA_IDENT:
4422 case NFA_SIDENT:
4423 case NFA_KWORD:
4424 case NFA_SKWORD:
4425 case NFA_FNAME:
4426 case NFA_SFNAME:
4427 case NFA_PRINT:
4428 case NFA_SPRINT:
4429 case NFA_WHITE:
4430 case NFA_NWHITE:
4431 case NFA_DIGIT:
4432 case NFA_NDIGIT:
4433 case NFA_HEX:
4434 case NFA_NHEX:
4435 case NFA_OCTAL:
4436 case NFA_NOCTAL:
4437 case NFA_WORD:
4438 case NFA_NWORD:
4439 case NFA_HEAD:
4440 case NFA_NHEAD:
4441 case NFA_ALPHA:
4442 case NFA_NALPHA:
4443 case NFA_LOWER:
4444 case NFA_NLOWER:
4445 case NFA_UPPER:
4446 case NFA_NUPPER:
4447 case NFA_LOWER_IC:
4448 case NFA_NLOWER_IC:
4449 case NFA_UPPER_IC:
4450 case NFA_NUPPER_IC:
4451 case NFA_START_COLL:
4452 case NFA_START_NEG_COLL:
4453 case NFA_NEWL:
4454 // state will advance input
4455 return FALSE;
4456
4457 default:
4458 if (state->c > 0)
4459 // state will advance input
4460 return FALSE;
4461
4462 // Others: zero-width or possibly zero-width, might still find
4463 // a match at the same position, keep looking.
4464 break;
4465 }
4466 state = state->out;
4467 }
4468 return FALSE;
4469 }
4470
4471
4472 /*
4473 * Return TRUE if "state" is already in list "l".
4474 */
4475 static int
state_in_list(nfa_list_T * l,nfa_state_T * state,regsubs_T * subs)4476 state_in_list(
4477 nfa_list_T *l, // runtime state list
4478 nfa_state_T *state, // state to update
4479 regsubs_T *subs) // pointers to subexpressions
4480 {
4481 if (state->lastlist[nfa_ll_index] == l->id)
4482 {
4483 if (!rex.nfa_has_backref || has_state_with_pos(l, state, subs, NULL))
4484 return TRUE;
4485 }
4486 return FALSE;
4487 }
4488
4489 // Offset used for "off" by addstate_here().
4490 #define ADDSTATE_HERE_OFFSET 10
4491
4492 /*
4493 * Add "state" and possibly what follows to state list ".".
4494 * Returns "subs_arg", possibly copied into temp_subs.
4495 * Returns NULL when recursiveness is too deep.
4496 */
4497 static regsubs_T *
addstate(nfa_list_T * l,nfa_state_T * state,regsubs_T * subs_arg,nfa_pim_T * pim,int off_arg)4498 addstate(
4499 nfa_list_T *l, // runtime state list
4500 nfa_state_T *state, // state to update
4501 regsubs_T *subs_arg, // pointers to subexpressions
4502 nfa_pim_T *pim, // postponed look-behind match
4503 int off_arg) // byte offset, when -1 go to next line
4504 {
4505 int subidx;
4506 int off = off_arg;
4507 int add_here = FALSE;
4508 int listindex = 0;
4509 int k;
4510 int found = FALSE;
4511 nfa_thread_T *thread;
4512 struct multipos save_multipos;
4513 int save_in_use;
4514 char_u *save_ptr;
4515 int i;
4516 regsub_T *sub;
4517 regsubs_T *subs = subs_arg;
4518 static regsubs_T temp_subs;
4519 #ifdef ENABLE_LOG
4520 int did_print = FALSE;
4521 #endif
4522 static int depth = 0;
4523
4524 // This function is called recursively. When the depth is too much we run
4525 // out of stack and crash, limit recursiveness here.
4526 if (++depth >= 5000 || subs == NULL)
4527 {
4528 --depth;
4529 return NULL;
4530 }
4531
4532 if (off_arg <= -ADDSTATE_HERE_OFFSET)
4533 {
4534 add_here = TRUE;
4535 off = 0;
4536 listindex = -(off_arg + ADDSTATE_HERE_OFFSET);
4537 }
4538
4539 switch (state->c)
4540 {
4541 case NFA_NCLOSE:
4542 case NFA_MCLOSE:
4543 case NFA_MCLOSE1:
4544 case NFA_MCLOSE2:
4545 case NFA_MCLOSE3:
4546 case NFA_MCLOSE4:
4547 case NFA_MCLOSE5:
4548 case NFA_MCLOSE6:
4549 case NFA_MCLOSE7:
4550 case NFA_MCLOSE8:
4551 case NFA_MCLOSE9:
4552 #ifdef FEAT_SYN_HL
4553 case NFA_ZCLOSE:
4554 case NFA_ZCLOSE1:
4555 case NFA_ZCLOSE2:
4556 case NFA_ZCLOSE3:
4557 case NFA_ZCLOSE4:
4558 case NFA_ZCLOSE5:
4559 case NFA_ZCLOSE6:
4560 case NFA_ZCLOSE7:
4561 case NFA_ZCLOSE8:
4562 case NFA_ZCLOSE9:
4563 #endif
4564 case NFA_MOPEN:
4565 case NFA_ZEND:
4566 case NFA_SPLIT:
4567 case NFA_EMPTY:
4568 // These nodes are not added themselves but their "out" and/or
4569 // "out1" may be added below.
4570 break;
4571
4572 case NFA_BOL:
4573 case NFA_BOF:
4574 // "^" won't match past end-of-line, don't bother trying.
4575 // Except when at the end of the line, or when we are going to the
4576 // next line for a look-behind match.
4577 if (rex.input > rex.line
4578 && *rex.input != NUL
4579 && (nfa_endp == NULL
4580 || !REG_MULTI
4581 || rex.lnum == nfa_endp->se_u.pos.lnum))
4582 goto skip_add;
4583 // FALLTHROUGH
4584
4585 case NFA_MOPEN1:
4586 case NFA_MOPEN2:
4587 case NFA_MOPEN3:
4588 case NFA_MOPEN4:
4589 case NFA_MOPEN5:
4590 case NFA_MOPEN6:
4591 case NFA_MOPEN7:
4592 case NFA_MOPEN8:
4593 case NFA_MOPEN9:
4594 #ifdef FEAT_SYN_HL
4595 case NFA_ZOPEN:
4596 case NFA_ZOPEN1:
4597 case NFA_ZOPEN2:
4598 case NFA_ZOPEN3:
4599 case NFA_ZOPEN4:
4600 case NFA_ZOPEN5:
4601 case NFA_ZOPEN6:
4602 case NFA_ZOPEN7:
4603 case NFA_ZOPEN8:
4604 case NFA_ZOPEN9:
4605 #endif
4606 case NFA_NOPEN:
4607 case NFA_ZSTART:
4608 // These nodes need to be added so that we can bail out when it
4609 // was added to this list before at the same position to avoid an
4610 // endless loop for "\(\)*"
4611
4612 default:
4613 if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP)
4614 {
4615 // This state is already in the list, don't add it again,
4616 // unless it is an MOPEN that is used for a backreference or
4617 // when there is a PIM. For NFA_MATCH check the position,
4618 // lower position is preferred.
4619 if (!rex.nfa_has_backref && pim == NULL && !l->has_pim
4620 && state->c != NFA_MATCH)
4621 {
4622 // When called from addstate_here() do insert before
4623 // existing states.
4624 if (add_here)
4625 {
4626 for (k = 0; k < l->n && k < listindex; ++k)
4627 if (l->t[k].state->id == state->id)
4628 {
4629 found = TRUE;
4630 break;
4631 }
4632 }
4633 if (!add_here || found)
4634 {
4635 skip_add:
4636 #ifdef ENABLE_LOG
4637 nfa_set_code(state->c);
4638 fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n",
4639 abs(state->id), l->id, state->c, code,
4640 pim == NULL ? "NULL" : "yes", l->has_pim, found);
4641 #endif
4642 --depth;
4643 return subs;
4644 }
4645 }
4646
4647 // Do not add the state again when it exists with the same
4648 // positions.
4649 if (has_state_with_pos(l, state, subs, pim))
4650 goto skip_add;
4651 }
4652
4653 // When there are backreferences or PIMs the number of states may
4654 // be (a lot) bigger than anticipated.
4655 if (l->n == l->len)
4656 {
4657 int newlen = l->len * 3 / 2 + 50;
4658 size_t newsize = newlen * sizeof(nfa_thread_T);
4659 nfa_thread_T *newt;
4660
4661 if ((long)(newsize >> 10) >= p_mmp)
4662 {
4663 emsg(_(e_maxmempat));
4664 --depth;
4665 return NULL;
4666 }
4667 if (subs != &temp_subs)
4668 {
4669 // "subs" may point into the current array, need to make a
4670 // copy before it becomes invalid.
4671 copy_sub(&temp_subs.norm, &subs->norm);
4672 #ifdef FEAT_SYN_HL
4673 if (rex.nfa_has_zsubexpr)
4674 copy_sub(&temp_subs.synt, &subs->synt);
4675 #endif
4676 subs = &temp_subs;
4677 }
4678
4679 newt = vim_realloc(l->t, newsize);
4680 if (newt == NULL)
4681 {
4682 // out of memory
4683 --depth;
4684 return NULL;
4685 }
4686 l->t = newt;
4687 l->len = newlen;
4688 }
4689
4690 // add the state to the list
4691 state->lastlist[nfa_ll_index] = l->id;
4692 thread = &l->t[l->n++];
4693 thread->state = state;
4694 if (pim == NULL)
4695 thread->pim.result = NFA_PIM_UNUSED;
4696 else
4697 {
4698 copy_pim(&thread->pim, pim);
4699 l->has_pim = TRUE;
4700 }
4701 copy_sub(&thread->subs.norm, &subs->norm);
4702 #ifdef FEAT_SYN_HL
4703 if (rex.nfa_has_zsubexpr)
4704 copy_sub(&thread->subs.synt, &subs->synt);
4705 #endif
4706 #ifdef ENABLE_LOG
4707 report_state("Adding", &thread->subs.norm, state, l->id, pim);
4708 did_print = TRUE;
4709 #endif
4710 }
4711
4712 #ifdef ENABLE_LOG
4713 if (!did_print)
4714 report_state("Processing", &subs->norm, state, l->id, pim);
4715 #endif
4716 switch (state->c)
4717 {
4718 case NFA_MATCH:
4719 break;
4720
4721 case NFA_SPLIT:
4722 // order matters here
4723 subs = addstate(l, state->out, subs, pim, off_arg);
4724 subs = addstate(l, state->out1, subs, pim, off_arg);
4725 break;
4726
4727 case NFA_EMPTY:
4728 case NFA_NOPEN:
4729 case NFA_NCLOSE:
4730 subs = addstate(l, state->out, subs, pim, off_arg);
4731 break;
4732
4733 case NFA_MOPEN:
4734 case NFA_MOPEN1:
4735 case NFA_MOPEN2:
4736 case NFA_MOPEN3:
4737 case NFA_MOPEN4:
4738 case NFA_MOPEN5:
4739 case NFA_MOPEN6:
4740 case NFA_MOPEN7:
4741 case NFA_MOPEN8:
4742 case NFA_MOPEN9:
4743 #ifdef FEAT_SYN_HL
4744 case NFA_ZOPEN:
4745 case NFA_ZOPEN1:
4746 case NFA_ZOPEN2:
4747 case NFA_ZOPEN3:
4748 case NFA_ZOPEN4:
4749 case NFA_ZOPEN5:
4750 case NFA_ZOPEN6:
4751 case NFA_ZOPEN7:
4752 case NFA_ZOPEN8:
4753 case NFA_ZOPEN9:
4754 #endif
4755 case NFA_ZSTART:
4756 if (state->c == NFA_ZSTART)
4757 {
4758 subidx = 0;
4759 sub = &subs->norm;
4760 }
4761 #ifdef FEAT_SYN_HL
4762 else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
4763 {
4764 subidx = state->c - NFA_ZOPEN;
4765 sub = &subs->synt;
4766 }
4767 #endif
4768 else
4769 {
4770 subidx = state->c - NFA_MOPEN;
4771 sub = &subs->norm;
4772 }
4773
4774 // avoid compiler warnings
4775 save_ptr = NULL;
4776 CLEAR_FIELD(save_multipos);
4777
4778 // Set the position (with "off" added) in the subexpression. Save
4779 // and restore it when it was in use. Otherwise fill any gap.
4780 if (REG_MULTI)
4781 {
4782 if (subidx < sub->in_use)
4783 {
4784 save_multipos = sub->list.multi[subidx];
4785 save_in_use = -1;
4786 }
4787 else
4788 {
4789 save_in_use = sub->in_use;
4790 for (i = sub->in_use; i < subidx; ++i)
4791 {
4792 sub->list.multi[i].start_lnum = -1;
4793 sub->list.multi[i].end_lnum = -1;
4794 }
4795 sub->in_use = subidx + 1;
4796 }
4797 if (off == -1)
4798 {
4799 sub->list.multi[subidx].start_lnum = rex.lnum + 1;
4800 sub->list.multi[subidx].start_col = 0;
4801 }
4802 else
4803 {
4804 sub->list.multi[subidx].start_lnum = rex.lnum;
4805 sub->list.multi[subidx].start_col =
4806 (colnr_T)(rex.input - rex.line + off);
4807 }
4808 sub->list.multi[subidx].end_lnum = -1;
4809 }
4810 else
4811 {
4812 if (subidx < sub->in_use)
4813 {
4814 save_ptr = sub->list.line[subidx].start;
4815 save_in_use = -1;
4816 }
4817 else
4818 {
4819 save_in_use = sub->in_use;
4820 for (i = sub->in_use; i < subidx; ++i)
4821 {
4822 sub->list.line[i].start = NULL;
4823 sub->list.line[i].end = NULL;
4824 }
4825 sub->in_use = subidx + 1;
4826 }
4827 sub->list.line[subidx].start = rex.input + off;
4828 }
4829
4830 subs = addstate(l, state->out, subs, pim, off_arg);
4831 if (subs == NULL)
4832 break;
4833 // "subs" may have changed, need to set "sub" again
4834 #ifdef FEAT_SYN_HL
4835 if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
4836 sub = &subs->synt;
4837 else
4838 #endif
4839 sub = &subs->norm;
4840
4841 if (save_in_use == -1)
4842 {
4843 if (REG_MULTI)
4844 sub->list.multi[subidx] = save_multipos;
4845 else
4846 sub->list.line[subidx].start = save_ptr;
4847 }
4848 else
4849 sub->in_use = save_in_use;
4850 break;
4851
4852 case NFA_MCLOSE:
4853 if (rex.nfa_has_zend && (REG_MULTI
4854 ? subs->norm.list.multi[0].end_lnum >= 0
4855 : subs->norm.list.line[0].end != NULL))
4856 {
4857 // Do not overwrite the position set by \ze.
4858 subs = addstate(l, state->out, subs, pim, off_arg);
4859 break;
4860 }
4861 // FALLTHROUGH
4862 case NFA_MCLOSE1:
4863 case NFA_MCLOSE2:
4864 case NFA_MCLOSE3:
4865 case NFA_MCLOSE4:
4866 case NFA_MCLOSE5:
4867 case NFA_MCLOSE6:
4868 case NFA_MCLOSE7:
4869 case NFA_MCLOSE8:
4870 case NFA_MCLOSE9:
4871 #ifdef FEAT_SYN_HL
4872 case NFA_ZCLOSE:
4873 case NFA_ZCLOSE1:
4874 case NFA_ZCLOSE2:
4875 case NFA_ZCLOSE3:
4876 case NFA_ZCLOSE4:
4877 case NFA_ZCLOSE5:
4878 case NFA_ZCLOSE6:
4879 case NFA_ZCLOSE7:
4880 case NFA_ZCLOSE8:
4881 case NFA_ZCLOSE9:
4882 #endif
4883 case NFA_ZEND:
4884 if (state->c == NFA_ZEND)
4885 {
4886 subidx = 0;
4887 sub = &subs->norm;
4888 }
4889 #ifdef FEAT_SYN_HL
4890 else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
4891 {
4892 subidx = state->c - NFA_ZCLOSE;
4893 sub = &subs->synt;
4894 }
4895 #endif
4896 else
4897 {
4898 subidx = state->c - NFA_MCLOSE;
4899 sub = &subs->norm;
4900 }
4901
4902 // We don't fill in gaps here, there must have been an MOPEN that
4903 // has done that.
4904 save_in_use = sub->in_use;
4905 if (sub->in_use <= subidx)
4906 sub->in_use = subidx + 1;
4907 if (REG_MULTI)
4908 {
4909 save_multipos = sub->list.multi[subidx];
4910 if (off == -1)
4911 {
4912 sub->list.multi[subidx].end_lnum = rex.lnum + 1;
4913 sub->list.multi[subidx].end_col = 0;
4914 }
4915 else
4916 {
4917 sub->list.multi[subidx].end_lnum = rex.lnum;
4918 sub->list.multi[subidx].end_col =
4919 (colnr_T)(rex.input - rex.line + off);
4920 }
4921 // avoid compiler warnings
4922 save_ptr = NULL;
4923 }
4924 else
4925 {
4926 save_ptr = sub->list.line[subidx].end;
4927 sub->list.line[subidx].end = rex.input + off;
4928 // avoid compiler warnings
4929 CLEAR_FIELD(save_multipos);
4930 }
4931
4932 subs = addstate(l, state->out, subs, pim, off_arg);
4933 if (subs == NULL)
4934 break;
4935 // "subs" may have changed, need to set "sub" again
4936 #ifdef FEAT_SYN_HL
4937 if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
4938 sub = &subs->synt;
4939 else
4940 #endif
4941 sub = &subs->norm;
4942
4943 if (REG_MULTI)
4944 sub->list.multi[subidx] = save_multipos;
4945 else
4946 sub->list.line[subidx].end = save_ptr;
4947 sub->in_use = save_in_use;
4948 break;
4949 }
4950 --depth;
4951 return subs;
4952 }
4953
4954 /*
4955 * Like addstate(), but the new state(s) are put at position "*ip".
4956 * Used for zero-width matches, next state to use is the added one.
4957 * This makes sure the order of states to be tried does not change, which
4958 * matters for alternatives.
4959 */
4960 static regsubs_T *
addstate_here(nfa_list_T * l,nfa_state_T * state,regsubs_T * subs,nfa_pim_T * pim,int * ip)4961 addstate_here(
4962 nfa_list_T *l, // runtime state list
4963 nfa_state_T *state, // state to update
4964 regsubs_T *subs, // pointers to subexpressions
4965 nfa_pim_T *pim, // postponed look-behind match
4966 int *ip)
4967 {
4968 int tlen = l->n;
4969 int count;
4970 int listidx = *ip;
4971 regsubs_T *r;
4972
4973 // First add the state(s) at the end, so that we know how many there are.
4974 // Pass the listidx as offset (avoids adding another argument to
4975 // addstate().
4976 r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET);
4977 if (r == NULL)
4978 return NULL;
4979
4980 // when "*ip" was at the end of the list, nothing to do
4981 if (listidx + 1 == tlen)
4982 return r;
4983
4984 // re-order to put the new state at the current position
4985 count = l->n - tlen;
4986 if (count == 0)
4987 return r; // no state got added
4988 if (count == 1)
4989 {
4990 // overwrite the current state
4991 l->t[listidx] = l->t[l->n - 1];
4992 }
4993 else if (count > 1)
4994 {
4995 if (l->n + count - 1 >= l->len)
4996 {
4997 // not enough space to move the new states, reallocate the list
4998 // and move the states to the right position
4999 int newlen = l->len * 3 / 2 + 50;
5000 size_t newsize = newlen * sizeof(nfa_thread_T);
5001 nfa_thread_T *newl;
5002
5003 if ((long)(newsize >> 10) >= p_mmp)
5004 {
5005 emsg(_(e_maxmempat));
5006 return NULL;
5007 }
5008 newl = alloc(newsize);
5009 if (newl == NULL)
5010 return NULL;
5011 l->len = newlen;
5012 mch_memmove(&(newl[0]),
5013 &(l->t[0]),
5014 sizeof(nfa_thread_T) * listidx);
5015 mch_memmove(&(newl[listidx]),
5016 &(l->t[l->n - count]),
5017 sizeof(nfa_thread_T) * count);
5018 mch_memmove(&(newl[listidx + count]),
5019 &(l->t[listidx + 1]),
5020 sizeof(nfa_thread_T) * (l->n - count - listidx - 1));
5021 vim_free(l->t);
5022 l->t = newl;
5023 }
5024 else
5025 {
5026 // make space for new states, then move them from the
5027 // end to the current position
5028 mch_memmove(&(l->t[listidx + count]),
5029 &(l->t[listidx + 1]),
5030 sizeof(nfa_thread_T) * (l->n - listidx - 1));
5031 mch_memmove(&(l->t[listidx]),
5032 &(l->t[l->n - 1]),
5033 sizeof(nfa_thread_T) * count);
5034 }
5035 }
5036 --l->n;
5037 *ip = listidx - 1;
5038
5039 return r;
5040 }
5041
5042 /*
5043 * Check character class "class" against current character c.
5044 */
5045 static int
check_char_class(int class,int c)5046 check_char_class(int class, int c)
5047 {
5048 switch (class)
5049 {
5050 case NFA_CLASS_ALNUM:
5051 if (c >= 1 && c < 128 && isalnum(c))
5052 return OK;
5053 break;
5054 case NFA_CLASS_ALPHA:
5055 if (c >= 1 && c < 128 && isalpha(c))
5056 return OK;
5057 break;
5058 case NFA_CLASS_BLANK:
5059 if (c == ' ' || c == '\t')
5060 return OK;
5061 break;
5062 case NFA_CLASS_CNTRL:
5063 if (c >= 1 && c <= 127 && iscntrl(c))
5064 return OK;
5065 break;
5066 case NFA_CLASS_DIGIT:
5067 if (VIM_ISDIGIT(c))
5068 return OK;
5069 break;
5070 case NFA_CLASS_GRAPH:
5071 if (c >= 1 && c <= 127 && isgraph(c))
5072 return OK;
5073 break;
5074 case NFA_CLASS_LOWER:
5075 if (MB_ISLOWER(c) && c != 170 && c != 186)
5076 return OK;
5077 break;
5078 case NFA_CLASS_PRINT:
5079 if (vim_isprintc(c))
5080 return OK;
5081 break;
5082 case NFA_CLASS_PUNCT:
5083 if (c >= 1 && c < 128 && ispunct(c))
5084 return OK;
5085 break;
5086 case NFA_CLASS_SPACE:
5087 if ((c >= 9 && c <= 13) || (c == ' '))
5088 return OK;
5089 break;
5090 case NFA_CLASS_UPPER:
5091 if (MB_ISUPPER(c))
5092 return OK;
5093 break;
5094 case NFA_CLASS_XDIGIT:
5095 if (vim_isxdigit(c))
5096 return OK;
5097 break;
5098 case NFA_CLASS_TAB:
5099 if (c == '\t')
5100 return OK;
5101 break;
5102 case NFA_CLASS_RETURN:
5103 if (c == '\r')
5104 return OK;
5105 break;
5106 case NFA_CLASS_BACKSPACE:
5107 if (c == '\b')
5108 return OK;
5109 break;
5110 case NFA_CLASS_ESCAPE:
5111 if (c == '\033')
5112 return OK;
5113 break;
5114 case NFA_CLASS_IDENT:
5115 if (vim_isIDc(c))
5116 return OK;
5117 break;
5118 case NFA_CLASS_KEYWORD:
5119 if (reg_iswordc(c))
5120 return OK;
5121 break;
5122 case NFA_CLASS_FNAME:
5123 if (vim_isfilec(c))
5124 return OK;
5125 break;
5126
5127 default:
5128 // should not be here :P
5129 siemsg(_(e_ill_char_class), class);
5130 return FAIL;
5131 }
5132 return FAIL;
5133 }
5134
5135 /*
5136 * Check for a match with subexpression "subidx".
5137 * Return TRUE if it matches.
5138 */
5139 static int
match_backref(regsub_T * sub,int subidx,int * bytelen)5140 match_backref(
5141 regsub_T *sub, // pointers to subexpressions
5142 int subidx,
5143 int *bytelen) // out: length of match in bytes
5144 {
5145 int len;
5146
5147 if (sub->in_use <= subidx)
5148 {
5149 retempty:
5150 // backref was not set, match an empty string
5151 *bytelen = 0;
5152 return TRUE;
5153 }
5154
5155 if (REG_MULTI)
5156 {
5157 if (sub->list.multi[subidx].start_lnum < 0
5158 || sub->list.multi[subidx].end_lnum < 0)
5159 goto retempty;
5160 if (sub->list.multi[subidx].start_lnum == rex.lnum
5161 && sub->list.multi[subidx].end_lnum == rex.lnum)
5162 {
5163 len = sub->list.multi[subidx].end_col
5164 - sub->list.multi[subidx].start_col;
5165 if (cstrncmp(rex.line + sub->list.multi[subidx].start_col,
5166 rex.input, &len) == 0)
5167 {
5168 *bytelen = len;
5169 return TRUE;
5170 }
5171 }
5172 else
5173 {
5174 if (match_with_backref(
5175 sub->list.multi[subidx].start_lnum,
5176 sub->list.multi[subidx].start_col,
5177 sub->list.multi[subidx].end_lnum,
5178 sub->list.multi[subidx].end_col,
5179 bytelen) == RA_MATCH)
5180 return TRUE;
5181 }
5182 }
5183 else
5184 {
5185 if (sub->list.line[subidx].start == NULL
5186 || sub->list.line[subidx].end == NULL)
5187 goto retempty;
5188 len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
5189 if (cstrncmp(sub->list.line[subidx].start, rex.input, &len) == 0)
5190 {
5191 *bytelen = len;
5192 return TRUE;
5193 }
5194 }
5195 return FALSE;
5196 }
5197
5198 #ifdef FEAT_SYN_HL
5199
5200 /*
5201 * Check for a match with \z subexpression "subidx".
5202 * Return TRUE if it matches.
5203 */
5204 static int
match_zref(int subidx,int * bytelen)5205 match_zref(
5206 int subidx,
5207 int *bytelen) // out: length of match in bytes
5208 {
5209 int len;
5210
5211 cleanup_zsubexpr();
5212 if (re_extmatch_in == NULL || re_extmatch_in->matches[subidx] == NULL)
5213 {
5214 // backref was not set, match an empty string
5215 *bytelen = 0;
5216 return TRUE;
5217 }
5218
5219 len = (int)STRLEN(re_extmatch_in->matches[subidx]);
5220 if (cstrncmp(re_extmatch_in->matches[subidx], rex.input, &len) == 0)
5221 {
5222 *bytelen = len;
5223 return TRUE;
5224 }
5225 return FALSE;
5226 }
5227 #endif
5228
5229 /*
5230 * Save list IDs for all NFA states of "prog" into "list".
5231 * Also reset the IDs to zero.
5232 * Only used for the recursive value lastlist[1].
5233 */
5234 static void
nfa_save_listids(nfa_regprog_T * prog,int * list)5235 nfa_save_listids(nfa_regprog_T *prog, int *list)
5236 {
5237 int i;
5238 nfa_state_T *p;
5239
5240 // Order in the list is reverse, it's a bit faster that way.
5241 p = &prog->state[0];
5242 for (i = prog->nstate; --i >= 0; )
5243 {
5244 list[i] = p->lastlist[1];
5245 p->lastlist[1] = 0;
5246 ++p;
5247 }
5248 }
5249
5250 /*
5251 * Restore list IDs from "list" to all NFA states.
5252 */
5253 static void
nfa_restore_listids(nfa_regprog_T * prog,int * list)5254 nfa_restore_listids(nfa_regprog_T *prog, int *list)
5255 {
5256 int i;
5257 nfa_state_T *p;
5258
5259 p = &prog->state[0];
5260 for (i = prog->nstate; --i >= 0; )
5261 {
5262 p->lastlist[1] = list[i];
5263 ++p;
5264 }
5265 }
5266
5267 static int
nfa_re_num_cmp(long_u val,int op,long_u pos)5268 nfa_re_num_cmp(long_u val, int op, long_u pos)
5269 {
5270 if (op == 1) return pos > val;
5271 if (op == 2) return pos < val;
5272 return val == pos;
5273 }
5274
5275 static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *submatch, regsubs_T *m);
5276
5277 /*
5278 * Recursively call nfa_regmatch()
5279 * "pim" is NULL or contains info about a Postponed Invisible Match (start
5280 * position).
5281 */
5282 static int
recursive_regmatch(nfa_state_T * state,nfa_pim_T * pim,nfa_regprog_T * prog,regsubs_T * submatch,regsubs_T * m,int ** listids,int * listids_len)5283 recursive_regmatch(
5284 nfa_state_T *state,
5285 nfa_pim_T *pim,
5286 nfa_regprog_T *prog,
5287 regsubs_T *submatch,
5288 regsubs_T *m,
5289 int **listids,
5290 int *listids_len)
5291 {
5292 int save_reginput_col = (int)(rex.input - rex.line);
5293 int save_reglnum = rex.lnum;
5294 int save_nfa_match = nfa_match;
5295 int save_nfa_listid = rex.nfa_listid;
5296 save_se_T *save_nfa_endp = nfa_endp;
5297 save_se_T endpos;
5298 save_se_T *endposp = NULL;
5299 int result;
5300 int need_restore = FALSE;
5301
5302 if (pim != NULL)
5303 {
5304 // start at the position where the postponed match was
5305 if (REG_MULTI)
5306 rex.input = rex.line + pim->end.pos.col;
5307 else
5308 rex.input = pim->end.ptr;
5309 }
5310
5311 if (state->c == NFA_START_INVISIBLE_BEFORE
5312 || state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5313 || state->c == NFA_START_INVISIBLE_BEFORE_NEG
5314 || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
5315 {
5316 // The recursive match must end at the current position. When "pim" is
5317 // not NULL it specifies the current position.
5318 endposp = &endpos;
5319 if (REG_MULTI)
5320 {
5321 if (pim == NULL)
5322 {
5323 endpos.se_u.pos.col = (int)(rex.input - rex.line);
5324 endpos.se_u.pos.lnum = rex.lnum;
5325 }
5326 else
5327 endpos.se_u.pos = pim->end.pos;
5328 }
5329 else
5330 {
5331 if (pim == NULL)
5332 endpos.se_u.ptr = rex.input;
5333 else
5334 endpos.se_u.ptr = pim->end.ptr;
5335 }
5336
5337 // Go back the specified number of bytes, or as far as the
5338 // start of the previous line, to try matching "\@<=" or
5339 // not matching "\@<!". This is very inefficient, limit the number of
5340 // bytes if possible.
5341 if (state->val <= 0)
5342 {
5343 if (REG_MULTI)
5344 {
5345 rex.line = reg_getline(--rex.lnum);
5346 if (rex.line == NULL)
5347 // can't go before the first line
5348 rex.line = reg_getline(++rex.lnum);
5349 }
5350 rex.input = rex.line;
5351 }
5352 else
5353 {
5354 if (REG_MULTI && (int)(rex.input - rex.line) < state->val)
5355 {
5356 // Not enough bytes in this line, go to end of
5357 // previous line.
5358 rex.line = reg_getline(--rex.lnum);
5359 if (rex.line == NULL)
5360 {
5361 // can't go before the first line
5362 rex.line = reg_getline(++rex.lnum);
5363 rex.input = rex.line;
5364 }
5365 else
5366 rex.input = rex.line + STRLEN(rex.line);
5367 }
5368 if ((int)(rex.input - rex.line) >= state->val)
5369 {
5370 rex.input -= state->val;
5371 if (has_mbyte)
5372 rex.input -= mb_head_off(rex.line, rex.input);
5373 }
5374 else
5375 rex.input = rex.line;
5376 }
5377 }
5378
5379 #ifdef ENABLE_LOG
5380 if (log_fd != stderr)
5381 fclose(log_fd);
5382 log_fd = NULL;
5383 #endif
5384 // Have to clear the lastlist field of the NFA nodes, so that
5385 // nfa_regmatch() and addstate() can run properly after recursion.
5386 if (nfa_ll_index == 1)
5387 {
5388 // Already calling nfa_regmatch() recursively. Save the lastlist[1]
5389 // values and clear them.
5390 if (*listids == NULL || *listids_len < prog->nstate)
5391 {
5392 vim_free(*listids);
5393 *listids = ALLOC_MULT(int, prog->nstate);
5394 if (*listids == NULL)
5395 {
5396 emsg(_("E878: (NFA) Could not allocate memory for branch traversal!"));
5397 return 0;
5398 }
5399 *listids_len = prog->nstate;
5400 }
5401 nfa_save_listids(prog, *listids);
5402 need_restore = TRUE;
5403 // any value of rex.nfa_listid will do
5404 }
5405 else
5406 {
5407 // First recursive nfa_regmatch() call, switch to the second lastlist
5408 // entry. Make sure rex.nfa_listid is different from a previous
5409 // recursive call, because some states may still have this ID.
5410 ++nfa_ll_index;
5411 if (rex.nfa_listid <= rex.nfa_alt_listid)
5412 rex.nfa_listid = rex.nfa_alt_listid;
5413 }
5414
5415 // Call nfa_regmatch() to check if the current concat matches at this
5416 // position. The concat ends with the node NFA_END_INVISIBLE
5417 nfa_endp = endposp;
5418 result = nfa_regmatch(prog, state->out, submatch, m);
5419
5420 if (need_restore)
5421 nfa_restore_listids(prog, *listids);
5422 else
5423 {
5424 --nfa_ll_index;
5425 rex.nfa_alt_listid = rex.nfa_listid;
5426 }
5427
5428 // restore position in input text
5429 rex.lnum = save_reglnum;
5430 if (REG_MULTI)
5431 rex.line = reg_getline(rex.lnum);
5432 rex.input = rex.line + save_reginput_col;
5433 if (result != NFA_TOO_EXPENSIVE)
5434 {
5435 nfa_match = save_nfa_match;
5436 rex.nfa_listid = save_nfa_listid;
5437 }
5438 nfa_endp = save_nfa_endp;
5439
5440 #ifdef ENABLE_LOG
5441 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
5442 if (log_fd != NULL)
5443 {
5444 fprintf(log_fd, "****************************\n");
5445 fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
5446 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
5447 fprintf(log_fd, "****************************\n");
5448 }
5449 else
5450 {
5451 emsg(_(e_log_open_failed));
5452 log_fd = stderr;
5453 }
5454 #endif
5455
5456 return result;
5457 }
5458
5459 /*
5460 * Estimate the chance of a match with "state" failing.
5461 * empty match: 0
5462 * NFA_ANY: 1
5463 * specific character: 99
5464 */
5465 static int
failure_chance(nfa_state_T * state,int depth)5466 failure_chance(nfa_state_T *state, int depth)
5467 {
5468 int c = state->c;
5469 int l, r;
5470
5471 // detect looping
5472 if (depth > 4)
5473 return 1;
5474
5475 switch (c)
5476 {
5477 case NFA_SPLIT:
5478 if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT)
5479 // avoid recursive stuff
5480 return 1;
5481 // two alternatives, use the lowest failure chance
5482 l = failure_chance(state->out, depth + 1);
5483 r = failure_chance(state->out1, depth + 1);
5484 return l < r ? l : r;
5485
5486 case NFA_ANY:
5487 // matches anything, unlikely to fail
5488 return 1;
5489
5490 case NFA_MATCH:
5491 case NFA_MCLOSE:
5492 case NFA_ANY_COMPOSING:
5493 // empty match works always
5494 return 0;
5495
5496 case NFA_START_INVISIBLE:
5497 case NFA_START_INVISIBLE_FIRST:
5498 case NFA_START_INVISIBLE_NEG:
5499 case NFA_START_INVISIBLE_NEG_FIRST:
5500 case NFA_START_INVISIBLE_BEFORE:
5501 case NFA_START_INVISIBLE_BEFORE_FIRST:
5502 case NFA_START_INVISIBLE_BEFORE_NEG:
5503 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
5504 case NFA_START_PATTERN:
5505 // recursive regmatch is expensive, use low failure chance
5506 return 5;
5507
5508 case NFA_BOL:
5509 case NFA_EOL:
5510 case NFA_BOF:
5511 case NFA_EOF:
5512 case NFA_NEWL:
5513 return 99;
5514
5515 case NFA_BOW:
5516 case NFA_EOW:
5517 return 90;
5518
5519 case NFA_MOPEN:
5520 case NFA_MOPEN1:
5521 case NFA_MOPEN2:
5522 case NFA_MOPEN3:
5523 case NFA_MOPEN4:
5524 case NFA_MOPEN5:
5525 case NFA_MOPEN6:
5526 case NFA_MOPEN7:
5527 case NFA_MOPEN8:
5528 case NFA_MOPEN9:
5529 #ifdef FEAT_SYN_HL
5530 case NFA_ZOPEN:
5531 case NFA_ZOPEN1:
5532 case NFA_ZOPEN2:
5533 case NFA_ZOPEN3:
5534 case NFA_ZOPEN4:
5535 case NFA_ZOPEN5:
5536 case NFA_ZOPEN6:
5537 case NFA_ZOPEN7:
5538 case NFA_ZOPEN8:
5539 case NFA_ZOPEN9:
5540 case NFA_ZCLOSE:
5541 case NFA_ZCLOSE1:
5542 case NFA_ZCLOSE2:
5543 case NFA_ZCLOSE3:
5544 case NFA_ZCLOSE4:
5545 case NFA_ZCLOSE5:
5546 case NFA_ZCLOSE6:
5547 case NFA_ZCLOSE7:
5548 case NFA_ZCLOSE8:
5549 case NFA_ZCLOSE9:
5550 #endif
5551 case NFA_NOPEN:
5552 case NFA_MCLOSE1:
5553 case NFA_MCLOSE2:
5554 case NFA_MCLOSE3:
5555 case NFA_MCLOSE4:
5556 case NFA_MCLOSE5:
5557 case NFA_MCLOSE6:
5558 case NFA_MCLOSE7:
5559 case NFA_MCLOSE8:
5560 case NFA_MCLOSE9:
5561 case NFA_NCLOSE:
5562 return failure_chance(state->out, depth + 1);
5563
5564 case NFA_BACKREF1:
5565 case NFA_BACKREF2:
5566 case NFA_BACKREF3:
5567 case NFA_BACKREF4:
5568 case NFA_BACKREF5:
5569 case NFA_BACKREF6:
5570 case NFA_BACKREF7:
5571 case NFA_BACKREF8:
5572 case NFA_BACKREF9:
5573 #ifdef FEAT_SYN_HL
5574 case NFA_ZREF1:
5575 case NFA_ZREF2:
5576 case NFA_ZREF3:
5577 case NFA_ZREF4:
5578 case NFA_ZREF5:
5579 case NFA_ZREF6:
5580 case NFA_ZREF7:
5581 case NFA_ZREF8:
5582 case NFA_ZREF9:
5583 #endif
5584 // backreferences don't match in many places
5585 return 94;
5586
5587 case NFA_LNUM_GT:
5588 case NFA_LNUM_LT:
5589 case NFA_COL_GT:
5590 case NFA_COL_LT:
5591 case NFA_VCOL_GT:
5592 case NFA_VCOL_LT:
5593 case NFA_MARK_GT:
5594 case NFA_MARK_LT:
5595 case NFA_VISUAL:
5596 // before/after positions don't match very often
5597 return 85;
5598
5599 case NFA_LNUM:
5600 return 90;
5601
5602 case NFA_CURSOR:
5603 case NFA_COL:
5604 case NFA_VCOL:
5605 case NFA_MARK:
5606 // specific positions rarely match
5607 return 98;
5608
5609 case NFA_COMPOSING:
5610 return 95;
5611
5612 default:
5613 if (c > 0)
5614 // character match fails often
5615 return 95;
5616 }
5617
5618 // something else, includes character classes
5619 return 50;
5620 }
5621
5622 /*
5623 * Skip until the char "c" we know a match must start with.
5624 */
5625 static int
skip_to_start(int c,colnr_T * colp)5626 skip_to_start(int c, colnr_T *colp)
5627 {
5628 char_u *s;
5629
5630 // Used often, do some work to avoid call overhead.
5631 if (!rex.reg_ic && !has_mbyte)
5632 s = vim_strbyte(rex.line + *colp, c);
5633 else
5634 s = cstrchr(rex.line + *colp, c);
5635 if (s == NULL)
5636 return FAIL;
5637 *colp = (int)(s - rex.line);
5638 return OK;
5639 }
5640
5641 /*
5642 * Check for a match with match_text.
5643 * Called after skip_to_start() has found regstart.
5644 * Returns zero for no match, 1 for a match.
5645 */
5646 static long
find_match_text(colnr_T startcol,int regstart,char_u * match_text)5647 find_match_text(colnr_T startcol, int regstart, char_u *match_text)
5648 {
5649 colnr_T col = startcol;
5650 int c1, c2;
5651 int len1, len2;
5652 int match;
5653
5654 for (;;)
5655 {
5656 match = TRUE;
5657 len2 = MB_CHAR2LEN(regstart); // skip regstart
5658 for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
5659 {
5660 c1 = PTR2CHAR(match_text + len1);
5661 c2 = PTR2CHAR(rex.line + col + len2);
5662 if (c1 != c2 && (!rex.reg_ic || MB_CASEFOLD(c1) != MB_CASEFOLD(c2)))
5663 {
5664 match = FALSE;
5665 break;
5666 }
5667 len2 += enc_utf8 ? utf_ptr2len(rex.line + col + len2)
5668 : MB_CHAR2LEN(c2);
5669 }
5670 if (match
5671 // check that no composing char follows
5672 && !(enc_utf8
5673 && utf_iscomposing(PTR2CHAR(rex.line + col + len2))))
5674 {
5675 cleanup_subexpr();
5676 if (REG_MULTI)
5677 {
5678 rex.reg_startpos[0].lnum = rex.lnum;
5679 rex.reg_startpos[0].col = col;
5680 rex.reg_endpos[0].lnum = rex.lnum;
5681 rex.reg_endpos[0].col = col + len2;
5682 }
5683 else
5684 {
5685 rex.reg_startp[0] = rex.line + col;
5686 rex.reg_endp[0] = rex.line + col + len2;
5687 }
5688 return 1L;
5689 }
5690
5691 // Try finding regstart after the current match.
5692 col += MB_CHAR2LEN(regstart); // skip regstart
5693 if (skip_to_start(regstart, &col) == FAIL)
5694 break;
5695 }
5696 return 0L;
5697 }
5698
5699 #ifdef FEAT_RELTIME
5700 static int
nfa_did_time_out()5701 nfa_did_time_out()
5702 {
5703 if (nfa_time_limit != NULL && profile_passed_limit(nfa_time_limit))
5704 {
5705 if (nfa_timed_out != NULL)
5706 *nfa_timed_out = TRUE;
5707 return TRUE;
5708 }
5709 return FALSE;
5710 }
5711 #endif
5712
5713 /*
5714 * Main matching routine.
5715 *
5716 * Run NFA to determine whether it matches rex.input.
5717 *
5718 * When "nfa_endp" is not NULL it is a required end-of-match position.
5719 *
5720 * Return TRUE if there is a match, FALSE if there is no match,
5721 * NFA_TOO_EXPENSIVE if we end up with too many states.
5722 * When there is a match "submatch" contains the positions.
5723 *
5724 * Note: Caller must ensure that: start != NULL.
5725 */
5726 static int
nfa_regmatch(nfa_regprog_T * prog,nfa_state_T * start,regsubs_T * submatch,regsubs_T * m)5727 nfa_regmatch(
5728 nfa_regprog_T *prog,
5729 nfa_state_T *start,
5730 regsubs_T *submatch,
5731 regsubs_T *m)
5732 {
5733 int result = FALSE;
5734 size_t size = 0;
5735 int flag = 0;
5736 int go_to_nextline = FALSE;
5737 nfa_thread_T *t;
5738 nfa_list_T list[2];
5739 int listidx;
5740 nfa_list_T *thislist;
5741 nfa_list_T *nextlist;
5742 int *listids = NULL;
5743 int listids_len = 0;
5744 nfa_state_T *add_state;
5745 int add_here;
5746 int add_count;
5747 int add_off = 0;
5748 int toplevel = start->c == NFA_MOPEN;
5749 regsubs_T *r;
5750 #ifdef NFA_REGEXP_DEBUG_LOG
5751 FILE *debug;
5752 #endif
5753
5754 // Some patterns may take a long time to match, especially when using
5755 // recursive_regmatch(). Allow interrupting them with CTRL-C.
5756 fast_breakcheck();
5757 if (got_int)
5758 return FALSE;
5759 #ifdef FEAT_RELTIME
5760 if (nfa_did_time_out())
5761 return FALSE;
5762 #endif
5763
5764 #ifdef NFA_REGEXP_DEBUG_LOG
5765 debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
5766 if (debug == NULL)
5767 {
5768 semsg("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG);
5769 return FALSE;
5770 }
5771 #endif
5772 nfa_match = FALSE;
5773
5774 // Allocate memory for the lists of nodes.
5775 size = (prog->nstate + 1) * sizeof(nfa_thread_T);
5776
5777 list[0].t = alloc(size);
5778 list[0].len = prog->nstate + 1;
5779 list[1].t = alloc(size);
5780 list[1].len = prog->nstate + 1;
5781 if (list[0].t == NULL || list[1].t == NULL)
5782 goto theend;
5783
5784 #ifdef ENABLE_LOG
5785 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
5786 if (log_fd != NULL)
5787 {
5788 fprintf(log_fd, "**********************************\n");
5789 nfa_set_code(start->c);
5790 fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
5791 abs(start->id), code);
5792 fprintf(log_fd, "**********************************\n");
5793 }
5794 else
5795 {
5796 emsg(_(e_log_open_failed));
5797 log_fd = stderr;
5798 }
5799 #endif
5800
5801 thislist = &list[0];
5802 thislist->n = 0;
5803 thislist->has_pim = FALSE;
5804 nextlist = &list[1];
5805 nextlist->n = 0;
5806 nextlist->has_pim = FALSE;
5807 #ifdef ENABLE_LOG
5808 fprintf(log_fd, "(---) STARTSTATE first\n");
5809 #endif
5810 thislist->id = rex.nfa_listid + 1;
5811
5812 // Inline optimized code for addstate(thislist, start, m, 0) if we know
5813 // it's the first MOPEN.
5814 if (toplevel)
5815 {
5816 if (REG_MULTI)
5817 {
5818 m->norm.list.multi[0].start_lnum = rex.lnum;
5819 m->norm.list.multi[0].start_col = (colnr_T)(rex.input - rex.line);
5820 }
5821 else
5822 m->norm.list.line[0].start = rex.input;
5823 m->norm.in_use = 1;
5824 r = addstate(thislist, start->out, m, NULL, 0);
5825 }
5826 else
5827 r = addstate(thislist, start, m, NULL, 0);
5828 if (r == NULL)
5829 {
5830 nfa_match = NFA_TOO_EXPENSIVE;
5831 goto theend;
5832 }
5833
5834 #define ADD_STATE_IF_MATCH(state) \
5835 if (result) { \
5836 add_state = state->out; \
5837 add_off = clen; \
5838 }
5839
5840 /*
5841 * Run for each character.
5842 */
5843 for (;;)
5844 {
5845 int curc;
5846 int clen;
5847
5848 if (has_mbyte)
5849 {
5850 curc = (*mb_ptr2char)(rex.input);
5851 clen = (*mb_ptr2len)(rex.input);
5852 }
5853 else
5854 {
5855 curc = *rex.input;
5856 clen = 1;
5857 }
5858 if (curc == NUL)
5859 {
5860 clen = 0;
5861 go_to_nextline = FALSE;
5862 }
5863
5864 // swap lists
5865 thislist = &list[flag];
5866 nextlist = &list[flag ^= 1];
5867 nextlist->n = 0; // clear nextlist
5868 nextlist->has_pim = FALSE;
5869 ++rex.nfa_listid;
5870 if (prog->re_engine == AUTOMATIC_ENGINE
5871 && (rex.nfa_listid >= NFA_MAX_STATES
5872 # ifdef FEAT_EVAL
5873 || nfa_fail_for_testing
5874 # endif
5875 ))
5876 {
5877 // too many states, retry with old engine
5878 nfa_match = NFA_TOO_EXPENSIVE;
5879 goto theend;
5880 }
5881
5882 thislist->id = rex.nfa_listid;
5883 nextlist->id = rex.nfa_listid + 1;
5884
5885 #ifdef ENABLE_LOG
5886 fprintf(log_fd, "------------------------------------------\n");
5887 fprintf(log_fd, ">>> Reginput is \"%s\"\n", rex.input);
5888 fprintf(log_fd, ">>> Advanced one character... Current char is %c (code %d) \n", curc, (int)curc);
5889 fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
5890 {
5891 int i;
5892
5893 for (i = 0; i < thislist->n; i++)
5894 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
5895 }
5896 fprintf(log_fd, "\n");
5897 #endif
5898
5899 #ifdef NFA_REGEXP_DEBUG_LOG
5900 fprintf(debug, "\n-------------------\n");
5901 #endif
5902 /*
5903 * If the state lists are empty we can stop.
5904 */
5905 if (thislist->n == 0)
5906 break;
5907
5908 // compute nextlist
5909 for (listidx = 0; listidx < thislist->n; ++listidx)
5910 {
5911 // If the list gets very long there probably is something wrong.
5912 // At least allow interrupting with CTRL-C.
5913 fast_breakcheck();
5914 if (got_int)
5915 break;
5916 #ifdef FEAT_RELTIME
5917 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
5918 {
5919 nfa_time_count = 0;
5920 if (nfa_did_time_out())
5921 break;
5922 }
5923 #endif
5924 t = &thislist->t[listidx];
5925
5926 #ifdef NFA_REGEXP_DEBUG_LOG
5927 nfa_set_code(t->state->c);
5928 fprintf(debug, "%s, ", code);
5929 #endif
5930 #ifdef ENABLE_LOG
5931 {
5932 int col;
5933
5934 if (t->subs.norm.in_use <= 0)
5935 col = -1;
5936 else if (REG_MULTI)
5937 col = t->subs.norm.list.multi[0].start_col;
5938 else
5939 col = (int)(t->subs.norm.list.line[0].start - rex.line);
5940 nfa_set_code(t->state->c);
5941 fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n",
5942 abs(t->state->id), (int)t->state->c, code, col,
5943 pim_info(&t->pim));
5944 }
5945 #endif
5946
5947 /*
5948 * Handle the possible codes of the current state.
5949 * The most important is NFA_MATCH.
5950 */
5951 add_state = NULL;
5952 add_here = FALSE;
5953 add_count = 0;
5954 switch (t->state->c)
5955 {
5956 case NFA_MATCH:
5957 {
5958 // If the match is not at the start of the line, ends before a
5959 // composing characters and rex.reg_icombine is not set, that
5960 // is not really a match.
5961 if (enc_utf8 && !rex.reg_icombine
5962 && rex.input != rex.line && utf_iscomposing(curc))
5963 break;
5964
5965 nfa_match = TRUE;
5966 copy_sub(&submatch->norm, &t->subs.norm);
5967 #ifdef FEAT_SYN_HL
5968 if (rex.nfa_has_zsubexpr)
5969 copy_sub(&submatch->synt, &t->subs.synt);
5970 #endif
5971 #ifdef ENABLE_LOG
5972 log_subsexpr(&t->subs);
5973 #endif
5974 // Found the left-most longest match, do not look at any other
5975 // states at this position. When the list of states is going
5976 // to be empty quit without advancing, so that "rex.input" is
5977 // correct.
5978 if (nextlist->n == 0)
5979 clen = 0;
5980 goto nextchar;
5981 }
5982
5983 case NFA_END_INVISIBLE:
5984 case NFA_END_INVISIBLE_NEG:
5985 case NFA_END_PATTERN:
5986 /*
5987 * This is only encountered after a NFA_START_INVISIBLE or
5988 * NFA_START_INVISIBLE_BEFORE node.
5989 * They surround a zero-width group, used with "\@=", "\&",
5990 * "\@!", "\@<=" and "\@<!".
5991 * If we got here, it means that the current "invisible" group
5992 * finished successfully, so return control to the parent
5993 * nfa_regmatch(). For a look-behind match only when it ends
5994 * in the position in "nfa_endp".
5995 * Submatches are stored in *m, and used in the parent call.
5996 */
5997 #ifdef ENABLE_LOG
5998 if (nfa_endp != NULL)
5999 {
6000 if (REG_MULTI)
6001 fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
6002 (int)rex.lnum,
6003 (int)nfa_endp->se_u.pos.lnum,
6004 (int)(rex.input - rex.line),
6005 nfa_endp->se_u.pos.col);
6006 else
6007 fprintf(log_fd, "Current col: %d, endp col: %d\n",
6008 (int)(rex.input - rex.line),
6009 (int)(nfa_endp->se_u.ptr - rex.input));
6010 }
6011 #endif
6012 // If "nfa_endp" is set it's only a match if it ends at
6013 // "nfa_endp"
6014 if (nfa_endp != NULL && (REG_MULTI
6015 ? (rex.lnum != nfa_endp->se_u.pos.lnum
6016 || (int)(rex.input - rex.line)
6017 != nfa_endp->se_u.pos.col)
6018 : rex.input != nfa_endp->se_u.ptr))
6019 break;
6020
6021 // do not set submatches for \@!
6022 if (t->state->c != NFA_END_INVISIBLE_NEG)
6023 {
6024 copy_sub(&m->norm, &t->subs.norm);
6025 #ifdef FEAT_SYN_HL
6026 if (rex.nfa_has_zsubexpr)
6027 copy_sub(&m->synt, &t->subs.synt);
6028 #endif
6029 }
6030 #ifdef ENABLE_LOG
6031 fprintf(log_fd, "Match found:\n");
6032 log_subsexpr(m);
6033 #endif
6034 nfa_match = TRUE;
6035 // See comment above at "goto nextchar".
6036 if (nextlist->n == 0)
6037 clen = 0;
6038 goto nextchar;
6039
6040 case NFA_START_INVISIBLE:
6041 case NFA_START_INVISIBLE_FIRST:
6042 case NFA_START_INVISIBLE_NEG:
6043 case NFA_START_INVISIBLE_NEG_FIRST:
6044 case NFA_START_INVISIBLE_BEFORE:
6045 case NFA_START_INVISIBLE_BEFORE_FIRST:
6046 case NFA_START_INVISIBLE_BEFORE_NEG:
6047 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
6048 {
6049 #ifdef ENABLE_LOG
6050 fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n",
6051 failure_chance(t->state->out, 0),
6052 failure_chance(t->state->out1->out, 0));
6053 #endif
6054 // Do it directly if there already is a PIM or when
6055 // nfa_postprocess() detected it will work better.
6056 if (t->pim.result != NFA_PIM_UNUSED
6057 || t->state->c == NFA_START_INVISIBLE_FIRST
6058 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6059 || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST
6060 || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
6061 {
6062 int in_use = m->norm.in_use;
6063
6064 // Copy submatch info for the recursive call, opposite
6065 // of what happens on success below.
6066 copy_sub_off(&m->norm, &t->subs.norm);
6067 #ifdef FEAT_SYN_HL
6068 if (rex.nfa_has_zsubexpr)
6069 copy_sub_off(&m->synt, &t->subs.synt);
6070 #endif
6071
6072 /*
6073 * First try matching the invisible match, then what
6074 * follows.
6075 */
6076 result = recursive_regmatch(t->state, NULL, prog,
6077 submatch, m, &listids, &listids_len);
6078 if (result == NFA_TOO_EXPENSIVE)
6079 {
6080 nfa_match = result;
6081 goto theend;
6082 }
6083
6084 // for \@! and \@<! it is a match when the result is
6085 // FALSE
6086 if (result != (t->state->c == NFA_START_INVISIBLE_NEG
6087 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6088 || t->state->c
6089 == NFA_START_INVISIBLE_BEFORE_NEG
6090 || t->state->c
6091 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
6092 {
6093 // Copy submatch info from the recursive call
6094 copy_sub_off(&t->subs.norm, &m->norm);
6095 #ifdef FEAT_SYN_HL
6096 if (rex.nfa_has_zsubexpr)
6097 copy_sub_off(&t->subs.synt, &m->synt);
6098 #endif
6099 // If the pattern has \ze and it matched in the
6100 // sub pattern, use it.
6101 copy_ze_off(&t->subs.norm, &m->norm);
6102
6103 // t->state->out1 is the corresponding
6104 // END_INVISIBLE node; Add its out to the current
6105 // list (zero-width match).
6106 add_here = TRUE;
6107 add_state = t->state->out1->out;
6108 }
6109 m->norm.in_use = in_use;
6110 }
6111 else
6112 {
6113 nfa_pim_T pim;
6114
6115 /*
6116 * First try matching what follows. Only if a match
6117 * is found verify the invisible match matches. Add a
6118 * nfa_pim_T to the following states, it contains info
6119 * about the invisible match.
6120 */
6121 pim.state = t->state;
6122 pim.result = NFA_PIM_TODO;
6123 pim.subs.norm.in_use = 0;
6124 #ifdef FEAT_SYN_HL
6125 pim.subs.synt.in_use = 0;
6126 #endif
6127 if (REG_MULTI)
6128 {
6129 pim.end.pos.col = (int)(rex.input - rex.line);
6130 pim.end.pos.lnum = rex.lnum;
6131 }
6132 else
6133 pim.end.ptr = rex.input;
6134
6135 // t->state->out1 is the corresponding END_INVISIBLE
6136 // node; Add its out to the current list (zero-width
6137 // match).
6138 if (addstate_here(thislist, t->state->out1->out,
6139 &t->subs, &pim, &listidx) == NULL)
6140 {
6141 nfa_match = NFA_TOO_EXPENSIVE;
6142 goto theend;
6143 }
6144 }
6145 }
6146 break;
6147
6148 case NFA_START_PATTERN:
6149 {
6150 nfa_state_T *skip = NULL;
6151 #ifdef ENABLE_LOG
6152 int skip_lid = 0;
6153 #endif
6154
6155 // There is no point in trying to match the pattern if the
6156 // output state is not going to be added to the list.
6157 if (state_in_list(nextlist, t->state->out1->out, &t->subs))
6158 {
6159 skip = t->state->out1->out;
6160 #ifdef ENABLE_LOG
6161 skip_lid = nextlist->id;
6162 #endif
6163 }
6164 else if (state_in_list(nextlist,
6165 t->state->out1->out->out, &t->subs))
6166 {
6167 skip = t->state->out1->out->out;
6168 #ifdef ENABLE_LOG
6169 skip_lid = nextlist->id;
6170 #endif
6171 }
6172 else if (state_in_list(thislist,
6173 t->state->out1->out->out, &t->subs))
6174 {
6175 skip = t->state->out1->out->out;
6176 #ifdef ENABLE_LOG
6177 skip_lid = thislist->id;
6178 #endif
6179 }
6180 if (skip != NULL)
6181 {
6182 #ifdef ENABLE_LOG
6183 nfa_set_code(skip->c);
6184 fprintf(log_fd, "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n",
6185 abs(skip->id), skip_lid, skip->c, code);
6186 #endif
6187 break;
6188 }
6189 // Copy submatch info to the recursive call, opposite of what
6190 // happens afterwards.
6191 copy_sub_off(&m->norm, &t->subs.norm);
6192 #ifdef FEAT_SYN_HL
6193 if (rex.nfa_has_zsubexpr)
6194 copy_sub_off(&m->synt, &t->subs.synt);
6195 #endif
6196
6197 // First try matching the pattern.
6198 result = recursive_regmatch(t->state, NULL, prog,
6199 submatch, m, &listids, &listids_len);
6200 if (result == NFA_TOO_EXPENSIVE)
6201 {
6202 nfa_match = result;
6203 goto theend;
6204 }
6205 if (result)
6206 {
6207 int bytelen;
6208
6209 #ifdef ENABLE_LOG
6210 fprintf(log_fd, "NFA_START_PATTERN matches:\n");
6211 log_subsexpr(m);
6212 #endif
6213 // Copy submatch info from the recursive call
6214 copy_sub_off(&t->subs.norm, &m->norm);
6215 #ifdef FEAT_SYN_HL
6216 if (rex.nfa_has_zsubexpr)
6217 copy_sub_off(&t->subs.synt, &m->synt);
6218 #endif
6219 // Now we need to skip over the matched text and then
6220 // continue with what follows.
6221 if (REG_MULTI)
6222 // TODO: multi-line match
6223 bytelen = m->norm.list.multi[0].end_col
6224 - (int)(rex.input - rex.line);
6225 else
6226 bytelen = (int)(m->norm.list.line[0].end - rex.input);
6227
6228 #ifdef ENABLE_LOG
6229 fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
6230 #endif
6231 if (bytelen == 0)
6232 {
6233 // empty match, output of corresponding
6234 // NFA_END_PATTERN/NFA_SKIP to be used at current
6235 // position
6236 add_here = TRUE;
6237 add_state = t->state->out1->out->out;
6238 }
6239 else if (bytelen <= clen)
6240 {
6241 // match current character, output of corresponding
6242 // NFA_END_PATTERN to be used at next position.
6243 add_state = t->state->out1->out->out;
6244 add_off = clen;
6245 }
6246 else
6247 {
6248 // skip over the matched characters, set character
6249 // count in NFA_SKIP
6250 add_state = t->state->out1->out;
6251 add_off = bytelen;
6252 add_count = bytelen - clen;
6253 }
6254 }
6255 break;
6256 }
6257
6258 case NFA_BOL:
6259 if (rex.input == rex.line)
6260 {
6261 add_here = TRUE;
6262 add_state = t->state->out;
6263 }
6264 break;
6265
6266 case NFA_EOL:
6267 if (curc == NUL)
6268 {
6269 add_here = TRUE;
6270 add_state = t->state->out;
6271 }
6272 break;
6273
6274 case NFA_BOW:
6275 result = TRUE;
6276
6277 if (curc == NUL)
6278 result = FALSE;
6279 else if (has_mbyte)
6280 {
6281 int this_class;
6282
6283 // Get class of current and previous char (if it exists).
6284 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
6285 if (this_class <= 1)
6286 result = FALSE;
6287 else if (reg_prev_class() == this_class)
6288 result = FALSE;
6289 }
6290 else if (!vim_iswordc_buf(curc, rex.reg_buf)
6291 || (rex.input > rex.line
6292 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
6293 result = FALSE;
6294 if (result)
6295 {
6296 add_here = TRUE;
6297 add_state = t->state->out;
6298 }
6299 break;
6300
6301 case NFA_EOW:
6302 result = TRUE;
6303 if (rex.input == rex.line)
6304 result = FALSE;
6305 else if (has_mbyte)
6306 {
6307 int this_class, prev_class;
6308
6309 // Get class of current and previous char (if it exists).
6310 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
6311 prev_class = reg_prev_class();
6312 if (this_class == prev_class
6313 || prev_class == 0 || prev_class == 1)
6314 result = FALSE;
6315 }
6316 else if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
6317 || (rex.input[0] != NUL
6318 && vim_iswordc_buf(curc, rex.reg_buf)))
6319 result = FALSE;
6320 if (result)
6321 {
6322 add_here = TRUE;
6323 add_state = t->state->out;
6324 }
6325 break;
6326
6327 case NFA_BOF:
6328 if (rex.lnum == 0 && rex.input == rex.line
6329 && (!REG_MULTI || rex.reg_firstlnum == 1))
6330 {
6331 add_here = TRUE;
6332 add_state = t->state->out;
6333 }
6334 break;
6335
6336 case NFA_EOF:
6337 if (rex.lnum == rex.reg_maxline && curc == NUL)
6338 {
6339 add_here = TRUE;
6340 add_state = t->state->out;
6341 }
6342 break;
6343
6344 case NFA_COMPOSING:
6345 {
6346 int mc = curc;
6347 int len = 0;
6348 nfa_state_T *end;
6349 nfa_state_T *sta;
6350 int cchars[MAX_MCO];
6351 int ccount = 0;
6352 int j;
6353
6354 sta = t->state->out;
6355 len = 0;
6356 if (utf_iscomposing(sta->c))
6357 {
6358 // Only match composing character(s), ignore base
6359 // character. Used for ".{composing}" and "{composing}"
6360 // (no preceding character).
6361 len += mb_char2len(mc);
6362 }
6363 if (rex.reg_icombine && len == 0)
6364 {
6365 // If \Z was present, then ignore composing characters.
6366 // When ignoring the base character this always matches.
6367 if (sta->c != curc)
6368 result = FAIL;
6369 else
6370 result = OK;
6371 while (sta->c != NFA_END_COMPOSING)
6372 sta = sta->out;
6373 }
6374
6375 // Check base character matches first, unless ignored.
6376 else if (len > 0 || mc == sta->c)
6377 {
6378 if (len == 0)
6379 {
6380 len += mb_char2len(mc);
6381 sta = sta->out;
6382 }
6383
6384 // We don't care about the order of composing characters.
6385 // Get them into cchars[] first.
6386 while (len < clen)
6387 {
6388 mc = mb_ptr2char(rex.input + len);
6389 cchars[ccount++] = mc;
6390 len += mb_char2len(mc);
6391 if (ccount == MAX_MCO)
6392 break;
6393 }
6394
6395 // Check that each composing char in the pattern matches a
6396 // composing char in the text. We do not check if all
6397 // composing chars are matched.
6398 result = OK;
6399 while (sta->c != NFA_END_COMPOSING)
6400 {
6401 for (j = 0; j < ccount; ++j)
6402 if (cchars[j] == sta->c)
6403 break;
6404 if (j == ccount)
6405 {
6406 result = FAIL;
6407 break;
6408 }
6409 sta = sta->out;
6410 }
6411 }
6412 else
6413 result = FAIL;
6414
6415 end = t->state->out1; // NFA_END_COMPOSING
6416 ADD_STATE_IF_MATCH(end);
6417 break;
6418 }
6419
6420 case NFA_NEWL:
6421 if (curc == NUL && !rex.reg_line_lbr && REG_MULTI
6422 && rex.lnum <= rex.reg_maxline)
6423 {
6424 go_to_nextline = TRUE;
6425 // Pass -1 for the offset, which means taking the position
6426 // at the start of the next line.
6427 add_state = t->state->out;
6428 add_off = -1;
6429 }
6430 else if (curc == '\n' && rex.reg_line_lbr)
6431 {
6432 // match \n as if it is an ordinary character
6433 add_state = t->state->out;
6434 add_off = 1;
6435 }
6436 break;
6437
6438 case NFA_START_COLL:
6439 case NFA_START_NEG_COLL:
6440 {
6441 // What follows is a list of characters, until NFA_END_COLL.
6442 // One of them must match or none of them must match.
6443 nfa_state_T *state;
6444 int result_if_matched;
6445 int c1, c2;
6446
6447 // Never match EOL. If it's part of the collection it is added
6448 // as a separate state with an OR.
6449 if (curc == NUL)
6450 break;
6451
6452 state = t->state->out;
6453 result_if_matched = (t->state->c == NFA_START_COLL);
6454 for (;;)
6455 {
6456 if (state->c == NFA_END_COLL)
6457 {
6458 result = !result_if_matched;
6459 break;
6460 }
6461 if (state->c == NFA_RANGE_MIN)
6462 {
6463 c1 = state->val;
6464 state = state->out; // advance to NFA_RANGE_MAX
6465 c2 = state->val;
6466 #ifdef ENABLE_LOG
6467 fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
6468 curc, c1, c2);
6469 #endif
6470 if (curc >= c1 && curc <= c2)
6471 {
6472 result = result_if_matched;
6473 break;
6474 }
6475 if (rex.reg_ic)
6476 {
6477 int curc_low = MB_CASEFOLD(curc);
6478 int done = FALSE;
6479
6480 for ( ; c1 <= c2; ++c1)
6481 if (MB_CASEFOLD(c1) == curc_low)
6482 {
6483 result = result_if_matched;
6484 done = TRUE;
6485 break;
6486 }
6487 if (done)
6488 break;
6489 }
6490 }
6491 else if (state->c < 0 ? check_char_class(state->c, curc)
6492 : (curc == state->c
6493 || (rex.reg_ic && MB_CASEFOLD(curc)
6494 == MB_CASEFOLD(state->c))))
6495 {
6496 result = result_if_matched;
6497 break;
6498 }
6499 state = state->out;
6500 }
6501 if (result)
6502 {
6503 // next state is in out of the NFA_END_COLL, out1 of
6504 // START points to the END state
6505 add_state = t->state->out1->out;
6506 add_off = clen;
6507 }
6508 break;
6509 }
6510
6511 case NFA_ANY:
6512 // Any char except '\0', (end of input) does not match.
6513 if (curc > 0)
6514 {
6515 add_state = t->state->out;
6516 add_off = clen;
6517 }
6518 break;
6519
6520 case NFA_ANY_COMPOSING:
6521 // On a composing character skip over it. Otherwise do
6522 // nothing. Always matches.
6523 if (enc_utf8 && utf_iscomposing(curc))
6524 {
6525 add_off = clen;
6526 }
6527 else
6528 {
6529 add_here = TRUE;
6530 add_off = 0;
6531 }
6532 add_state = t->state->out;
6533 break;
6534
6535 /*
6536 * Character classes like \a for alpha, \d for digit etc.
6537 */
6538 case NFA_IDENT: // \i
6539 result = vim_isIDc(curc);
6540 ADD_STATE_IF_MATCH(t->state);
6541 break;
6542
6543 case NFA_SIDENT: // \I
6544 result = !VIM_ISDIGIT(curc) && vim_isIDc(curc);
6545 ADD_STATE_IF_MATCH(t->state);
6546 break;
6547
6548 case NFA_KWORD: // \k
6549 result = vim_iswordp_buf(rex.input, rex.reg_buf);
6550 ADD_STATE_IF_MATCH(t->state);
6551 break;
6552
6553 case NFA_SKWORD: // \K
6554 result = !VIM_ISDIGIT(curc)
6555 && vim_iswordp_buf(rex.input, rex.reg_buf);
6556 ADD_STATE_IF_MATCH(t->state);
6557 break;
6558
6559 case NFA_FNAME: // \f
6560 result = vim_isfilec(curc);
6561 ADD_STATE_IF_MATCH(t->state);
6562 break;
6563
6564 case NFA_SFNAME: // \F
6565 result = !VIM_ISDIGIT(curc) && vim_isfilec(curc);
6566 ADD_STATE_IF_MATCH(t->state);
6567 break;
6568
6569 case NFA_PRINT: // \p
6570 result = vim_isprintc(PTR2CHAR(rex.input));
6571 ADD_STATE_IF_MATCH(t->state);
6572 break;
6573
6574 case NFA_SPRINT: // \P
6575 result = !VIM_ISDIGIT(curc) && vim_isprintc(PTR2CHAR(rex.input));
6576 ADD_STATE_IF_MATCH(t->state);
6577 break;
6578
6579 case NFA_WHITE: // \s
6580 result = VIM_ISWHITE(curc);
6581 ADD_STATE_IF_MATCH(t->state);
6582 break;
6583
6584 case NFA_NWHITE: // \S
6585 result = curc != NUL && !VIM_ISWHITE(curc);
6586 ADD_STATE_IF_MATCH(t->state);
6587 break;
6588
6589 case NFA_DIGIT: // \d
6590 result = ri_digit(curc);
6591 ADD_STATE_IF_MATCH(t->state);
6592 break;
6593
6594 case NFA_NDIGIT: // \D
6595 result = curc != NUL && !ri_digit(curc);
6596 ADD_STATE_IF_MATCH(t->state);
6597 break;
6598
6599 case NFA_HEX: // \x
6600 result = ri_hex(curc);
6601 ADD_STATE_IF_MATCH(t->state);
6602 break;
6603
6604 case NFA_NHEX: // \X
6605 result = curc != NUL && !ri_hex(curc);
6606 ADD_STATE_IF_MATCH(t->state);
6607 break;
6608
6609 case NFA_OCTAL: // \o
6610 result = ri_octal(curc);
6611 ADD_STATE_IF_MATCH(t->state);
6612 break;
6613
6614 case NFA_NOCTAL: // \O
6615 result = curc != NUL && !ri_octal(curc);
6616 ADD_STATE_IF_MATCH(t->state);
6617 break;
6618
6619 case NFA_WORD: // \w
6620 result = ri_word(curc);
6621 ADD_STATE_IF_MATCH(t->state);
6622 break;
6623
6624 case NFA_NWORD: // \W
6625 result = curc != NUL && !ri_word(curc);
6626 ADD_STATE_IF_MATCH(t->state);
6627 break;
6628
6629 case NFA_HEAD: // \h
6630 result = ri_head(curc);
6631 ADD_STATE_IF_MATCH(t->state);
6632 break;
6633
6634 case NFA_NHEAD: // \H
6635 result = curc != NUL && !ri_head(curc);
6636 ADD_STATE_IF_MATCH(t->state);
6637 break;
6638
6639 case NFA_ALPHA: // \a
6640 result = ri_alpha(curc);
6641 ADD_STATE_IF_MATCH(t->state);
6642 break;
6643
6644 case NFA_NALPHA: // \A
6645 result = curc != NUL && !ri_alpha(curc);
6646 ADD_STATE_IF_MATCH(t->state);
6647 break;
6648
6649 case NFA_LOWER: // \l
6650 result = ri_lower(curc);
6651 ADD_STATE_IF_MATCH(t->state);
6652 break;
6653
6654 case NFA_NLOWER: // \L
6655 result = curc != NUL && !ri_lower(curc);
6656 ADD_STATE_IF_MATCH(t->state);
6657 break;
6658
6659 case NFA_UPPER: // \u
6660 result = ri_upper(curc);
6661 ADD_STATE_IF_MATCH(t->state);
6662 break;
6663
6664 case NFA_NUPPER: // \U
6665 result = curc != NUL && !ri_upper(curc);
6666 ADD_STATE_IF_MATCH(t->state);
6667 break;
6668
6669 case NFA_LOWER_IC: // [a-z]
6670 result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc));
6671 ADD_STATE_IF_MATCH(t->state);
6672 break;
6673
6674 case NFA_NLOWER_IC: // [^a-z]
6675 result = curc != NUL
6676 && !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc)));
6677 ADD_STATE_IF_MATCH(t->state);
6678 break;
6679
6680 case NFA_UPPER_IC: // [A-Z]
6681 result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc));
6682 ADD_STATE_IF_MATCH(t->state);
6683 break;
6684
6685 case NFA_NUPPER_IC: // ^[A-Z]
6686 result = curc != NUL
6687 && !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc)));
6688 ADD_STATE_IF_MATCH(t->state);
6689 break;
6690
6691 case NFA_BACKREF1:
6692 case NFA_BACKREF2:
6693 case NFA_BACKREF3:
6694 case NFA_BACKREF4:
6695 case NFA_BACKREF5:
6696 case NFA_BACKREF6:
6697 case NFA_BACKREF7:
6698 case NFA_BACKREF8:
6699 case NFA_BACKREF9:
6700 #ifdef FEAT_SYN_HL
6701 case NFA_ZREF1:
6702 case NFA_ZREF2:
6703 case NFA_ZREF3:
6704 case NFA_ZREF4:
6705 case NFA_ZREF5:
6706 case NFA_ZREF6:
6707 case NFA_ZREF7:
6708 case NFA_ZREF8:
6709 case NFA_ZREF9:
6710 #endif
6711 // \1 .. \9 \z1 .. \z9
6712 {
6713 int subidx;
6714 int bytelen;
6715
6716 if (t->state->c <= NFA_BACKREF9)
6717 {
6718 subidx = t->state->c - NFA_BACKREF1 + 1;
6719 result = match_backref(&t->subs.norm, subidx, &bytelen);
6720 }
6721 #ifdef FEAT_SYN_HL
6722 else
6723 {
6724 subidx = t->state->c - NFA_ZREF1 + 1;
6725 result = match_zref(subidx, &bytelen);
6726 }
6727 #endif
6728
6729 if (result)
6730 {
6731 if (bytelen == 0)
6732 {
6733 // empty match always works, output of NFA_SKIP to be
6734 // used next
6735 add_here = TRUE;
6736 add_state = t->state->out->out;
6737 }
6738 else if (bytelen <= clen)
6739 {
6740 // match current character, jump ahead to out of
6741 // NFA_SKIP
6742 add_state = t->state->out->out;
6743 add_off = clen;
6744 }
6745 else
6746 {
6747 // skip over the matched characters, set character
6748 // count in NFA_SKIP
6749 add_state = t->state->out;
6750 add_off = bytelen;
6751 add_count = bytelen - clen;
6752 }
6753 }
6754 break;
6755 }
6756 case NFA_SKIP:
6757 // character of previous matching \1 .. \9 or \@>
6758 if (t->count - clen <= 0)
6759 {
6760 // end of match, go to what follows
6761 add_state = t->state->out;
6762 add_off = clen;
6763 }
6764 else
6765 {
6766 // add state again with decremented count
6767 add_state = t->state;
6768 add_off = 0;
6769 add_count = t->count - clen;
6770 }
6771 break;
6772
6773 case NFA_LNUM:
6774 case NFA_LNUM_GT:
6775 case NFA_LNUM_LT:
6776 result = (REG_MULTI &&
6777 nfa_re_num_cmp(t->state->val, t->state->c - NFA_LNUM,
6778 (long_u)(rex.lnum + rex.reg_firstlnum)));
6779 if (result)
6780 {
6781 add_here = TRUE;
6782 add_state = t->state->out;
6783 }
6784 break;
6785
6786 case NFA_COL:
6787 case NFA_COL_GT:
6788 case NFA_COL_LT:
6789 result = nfa_re_num_cmp(t->state->val, t->state->c - NFA_COL,
6790 (long_u)(rex.input - rex.line) + 1);
6791 if (result)
6792 {
6793 add_here = TRUE;
6794 add_state = t->state->out;
6795 }
6796 break;
6797
6798 case NFA_VCOL:
6799 case NFA_VCOL_GT:
6800 case NFA_VCOL_LT:
6801 {
6802 int op = t->state->c - NFA_VCOL;
6803 colnr_T col = (colnr_T)(rex.input - rex.line);
6804 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
6805
6806 // Bail out quickly when there can't be a match, avoid the
6807 // overhead of win_linetabsize() on long lines.
6808 if (op != 1 && col > t->state->val
6809 * (has_mbyte ? MB_MAXBYTES : 1))
6810 break;
6811 result = FALSE;
6812 if (op == 1 && col - 1 > t->state->val && col > 100)
6813 {
6814 int ts = wp->w_buffer->b_p_ts;
6815
6816 // Guess that a character won't use more columns than
6817 // 'tabstop', with a minimum of 4.
6818 if (ts < 4)
6819 ts = 4;
6820 result = col > t->state->val * ts;
6821 }
6822 if (!result)
6823 result = nfa_re_num_cmp(t->state->val, op,
6824 (long_u)win_linetabsize(wp, rex.line, col) + 1);
6825 if (result)
6826 {
6827 add_here = TRUE;
6828 add_state = t->state->out;
6829 }
6830 }
6831 break;
6832
6833 case NFA_MARK:
6834 case NFA_MARK_GT:
6835 case NFA_MARK_LT:
6836 {
6837 size_t col = rex.input - rex.line;
6838 pos_T *pos = getmark_buf(rex.reg_buf, t->state->val, FALSE);
6839
6840 // Line may have been freed, get it again.
6841 if (REG_MULTI)
6842 {
6843 rex.line = reg_getline(rex.lnum);
6844 rex.input = rex.line + col;
6845 }
6846
6847 // Compare the mark position to the match position, if the mark
6848 // exists and mark is set in reg_buf.
6849 if (pos != NULL && pos->lnum > 0)
6850 {
6851 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
6852 && pos->col == MAXCOL
6853 ? (colnr_T)STRLEN(reg_getline(
6854 pos->lnum - rex.reg_firstlnum))
6855 : pos->col;
6856
6857 result = (pos->lnum == rex.lnum + rex.reg_firstlnum
6858 ? (pos_col == (colnr_T)(rex.input - rex.line)
6859 ? t->state->c == NFA_MARK
6860 : (pos_col < (colnr_T)(rex.input - rex.line)
6861 ? t->state->c == NFA_MARK_GT
6862 : t->state->c == NFA_MARK_LT))
6863 : (pos->lnum < rex.lnum + rex.reg_firstlnum
6864 ? t->state->c == NFA_MARK_GT
6865 : t->state->c == NFA_MARK_LT));
6866 if (result)
6867 {
6868 add_here = TRUE;
6869 add_state = t->state->out;
6870 }
6871 }
6872 break;
6873 }
6874
6875 case NFA_CURSOR:
6876 result = (rex.reg_win != NULL
6877 && (rex.lnum + rex.reg_firstlnum
6878 == rex.reg_win->w_cursor.lnum)
6879 && ((colnr_T)(rex.input - rex.line)
6880 == rex.reg_win->w_cursor.col));
6881 if (result)
6882 {
6883 add_here = TRUE;
6884 add_state = t->state->out;
6885 }
6886 break;
6887
6888 case NFA_VISUAL:
6889 result = reg_match_visual();
6890 if (result)
6891 {
6892 add_here = TRUE;
6893 add_state = t->state->out;
6894 }
6895 break;
6896
6897 case NFA_MOPEN1:
6898 case NFA_MOPEN2:
6899 case NFA_MOPEN3:
6900 case NFA_MOPEN4:
6901 case NFA_MOPEN5:
6902 case NFA_MOPEN6:
6903 case NFA_MOPEN7:
6904 case NFA_MOPEN8:
6905 case NFA_MOPEN9:
6906 #ifdef FEAT_SYN_HL
6907 case NFA_ZOPEN:
6908 case NFA_ZOPEN1:
6909 case NFA_ZOPEN2:
6910 case NFA_ZOPEN3:
6911 case NFA_ZOPEN4:
6912 case NFA_ZOPEN5:
6913 case NFA_ZOPEN6:
6914 case NFA_ZOPEN7:
6915 case NFA_ZOPEN8:
6916 case NFA_ZOPEN9:
6917 #endif
6918 case NFA_NOPEN:
6919 case NFA_ZSTART:
6920 // These states are only added to be able to bail out when
6921 // they are added again, nothing is to be done.
6922 break;
6923
6924 default: // regular character
6925 {
6926 int c = t->state->c;
6927
6928 #ifdef DEBUG
6929 if (c < 0)
6930 siemsg("INTERNAL: Negative state char: %ld", c);
6931 #endif
6932 result = (c == curc);
6933
6934 if (!result && rex.reg_ic)
6935 result = MB_CASEFOLD(c) == MB_CASEFOLD(curc);
6936 // If rex.reg_icombine is not set only skip over the character
6937 // itself. When it is set skip over composing characters.
6938 if (result && enc_utf8 && !rex.reg_icombine)
6939 clen = utf_ptr2len(rex.input);
6940 ADD_STATE_IF_MATCH(t->state);
6941 break;
6942 }
6943
6944 } // switch (t->state->c)
6945
6946 if (add_state != NULL)
6947 {
6948 nfa_pim_T *pim;
6949 nfa_pim_T pim_copy;
6950
6951 if (t->pim.result == NFA_PIM_UNUSED)
6952 pim = NULL;
6953 else
6954 pim = &t->pim;
6955
6956 // Handle the postponed invisible match if the match might end
6957 // without advancing and before the end of the line.
6958 if (pim != NULL && (clen == 0 || match_follows(add_state, 0)))
6959 {
6960 if (pim->result == NFA_PIM_TODO)
6961 {
6962 #ifdef ENABLE_LOG
6963 fprintf(log_fd, "\n");
6964 fprintf(log_fd, "==================================\n");
6965 fprintf(log_fd, "Postponed recursive nfa_regmatch()\n");
6966 fprintf(log_fd, "\n");
6967 #endif
6968 result = recursive_regmatch(pim->state, pim,
6969 prog, submatch, m, &listids, &listids_len);
6970 pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH;
6971 // for \@! and \@<! it is a match when the result is
6972 // FALSE
6973 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
6974 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6975 || pim->state->c
6976 == NFA_START_INVISIBLE_BEFORE_NEG
6977 || pim->state->c
6978 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
6979 {
6980 // Copy submatch info from the recursive call
6981 copy_sub_off(&pim->subs.norm, &m->norm);
6982 #ifdef FEAT_SYN_HL
6983 if (rex.nfa_has_zsubexpr)
6984 copy_sub_off(&pim->subs.synt, &m->synt);
6985 #endif
6986 }
6987 }
6988 else
6989 {
6990 result = (pim->result == NFA_PIM_MATCH);
6991 #ifdef ENABLE_LOG
6992 fprintf(log_fd, "\n");
6993 fprintf(log_fd, "Using previous recursive nfa_regmatch() result, result == %d\n", pim->result);
6994 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
6995 fprintf(log_fd, "\n");
6996 #endif
6997 }
6998
6999 // for \@! and \@<! it is a match when result is FALSE
7000 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
7001 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
7002 || pim->state->c
7003 == NFA_START_INVISIBLE_BEFORE_NEG
7004 || pim->state->c
7005 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
7006 {
7007 // Copy submatch info from the recursive call
7008 copy_sub_off(&t->subs.norm, &pim->subs.norm);
7009 #ifdef FEAT_SYN_HL
7010 if (rex.nfa_has_zsubexpr)
7011 copy_sub_off(&t->subs.synt, &pim->subs.synt);
7012 #endif
7013 }
7014 else
7015 // look-behind match failed, don't add the state
7016 continue;
7017
7018 // Postponed invisible match was handled, don't add it to
7019 // following states.
7020 pim = NULL;
7021 }
7022
7023 // If "pim" points into l->t it will become invalid when
7024 // adding the state causes the list to be reallocated. Make a
7025 // local copy to avoid that.
7026 if (pim == &t->pim)
7027 {
7028 copy_pim(&pim_copy, pim);
7029 pim = &pim_copy;
7030 }
7031
7032 if (add_here)
7033 r = addstate_here(thislist, add_state, &t->subs,
7034 pim, &listidx);
7035 else
7036 {
7037 r = addstate(nextlist, add_state, &t->subs, pim, add_off);
7038 if (add_count > 0)
7039 nextlist->t[nextlist->n - 1].count = add_count;
7040 }
7041 if (r == NULL)
7042 {
7043 nfa_match = NFA_TOO_EXPENSIVE;
7044 goto theend;
7045 }
7046 }
7047
7048 } // for (thislist = thislist; thislist->state; thislist++)
7049
7050 // Look for the start of a match in the current position by adding the
7051 // start state to the list of states.
7052 // The first found match is the leftmost one, thus the order of states
7053 // matters!
7054 // Do not add the start state in recursive calls of nfa_regmatch(),
7055 // because recursive calls should only start in the first position.
7056 // Unless "nfa_endp" is not NULL, then we match the end position.
7057 // Also don't start a match past the first line.
7058 if (nfa_match == FALSE
7059 && ((toplevel
7060 && rex.lnum == 0
7061 && clen != 0
7062 && (rex.reg_maxcol == 0
7063 || (colnr_T)(rex.input - rex.line) < rex.reg_maxcol))
7064 || (nfa_endp != NULL
7065 && (REG_MULTI
7066 ? (rex.lnum < nfa_endp->se_u.pos.lnum
7067 || (rex.lnum == nfa_endp->se_u.pos.lnum
7068 && (int)(rex.input - rex.line)
7069 < nfa_endp->se_u.pos.col))
7070 : rex.input < nfa_endp->se_u.ptr))))
7071 {
7072 #ifdef ENABLE_LOG
7073 fprintf(log_fd, "(---) STARTSTATE\n");
7074 #endif
7075 // Inline optimized code for addstate() if we know the state is
7076 // the first MOPEN.
7077 if (toplevel)
7078 {
7079 int add = TRUE;
7080 int c;
7081
7082 if (prog->regstart != NUL && clen != 0)
7083 {
7084 if (nextlist->n == 0)
7085 {
7086 colnr_T col = (colnr_T)(rex.input - rex.line) + clen;
7087
7088 // Nextlist is empty, we can skip ahead to the
7089 // character that must appear at the start.
7090 if (skip_to_start(prog->regstart, &col) == FAIL)
7091 break;
7092 #ifdef ENABLE_LOG
7093 fprintf(log_fd, " Skipping ahead %d bytes to regstart\n",
7094 col - ((colnr_T)(rex.input - rex.line) + clen));
7095 #endif
7096 rex.input = rex.line + col - clen;
7097 }
7098 else
7099 {
7100 // Checking if the required start character matches is
7101 // cheaper than adding a state that won't match.
7102 c = PTR2CHAR(rex.input + clen);
7103 if (c != prog->regstart && (!rex.reg_ic
7104 || MB_CASEFOLD(c) != MB_CASEFOLD(prog->regstart)))
7105 {
7106 #ifdef ENABLE_LOG
7107 fprintf(log_fd, " Skipping start state, regstart does not match\n");
7108 #endif
7109 add = FALSE;
7110 }
7111 }
7112 }
7113
7114 if (add)
7115 {
7116 if (REG_MULTI)
7117 m->norm.list.multi[0].start_col =
7118 (colnr_T)(rex.input - rex.line) + clen;
7119 else
7120 m->norm.list.line[0].start = rex.input + clen;
7121 if (addstate(nextlist, start->out, m, NULL, clen) == NULL)
7122 {
7123 nfa_match = NFA_TOO_EXPENSIVE;
7124 goto theend;
7125 }
7126 }
7127 }
7128 else
7129 {
7130 if (addstate(nextlist, start, m, NULL, clen) == NULL)
7131 {
7132 nfa_match = NFA_TOO_EXPENSIVE;
7133 goto theend;
7134 }
7135 }
7136 }
7137
7138 #ifdef ENABLE_LOG
7139 fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
7140 {
7141 int i;
7142
7143 for (i = 0; i < thislist->n; i++)
7144 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
7145 }
7146 fprintf(log_fd, "\n");
7147 #endif
7148
7149 nextchar:
7150 // Advance to the next character, or advance to the next line, or
7151 // finish.
7152 if (clen != 0)
7153 rex.input += clen;
7154 else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI
7155 && rex.lnum < nfa_endp->se_u.pos.lnum))
7156 reg_nextline();
7157 else
7158 break;
7159
7160 // Allow interrupting with CTRL-C.
7161 line_breakcheck();
7162 if (got_int)
7163 break;
7164 #ifdef FEAT_RELTIME
7165 // Check for timeout once in a twenty times to avoid overhead.
7166 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
7167 {
7168 nfa_time_count = 0;
7169 if (nfa_did_time_out())
7170 break;
7171 }
7172 #endif
7173 }
7174
7175 #ifdef ENABLE_LOG
7176 if (log_fd != stderr)
7177 fclose(log_fd);
7178 log_fd = NULL;
7179 #endif
7180
7181 theend:
7182 // Free memory
7183 vim_free(list[0].t);
7184 vim_free(list[1].t);
7185 vim_free(listids);
7186 #undef ADD_STATE_IF_MATCH
7187 #ifdef NFA_REGEXP_DEBUG_LOG
7188 fclose(debug);
7189 #endif
7190
7191 return nfa_match;
7192 }
7193
7194 /*
7195 * Try match of "prog" with at rex.line["col"].
7196 * Returns <= 0 for failure, number of lines contained in the match otherwise.
7197 */
7198 static long
nfa_regtry(nfa_regprog_T * prog,colnr_T col,proftime_T * tm UNUSED,int * timed_out UNUSED)7199 nfa_regtry(
7200 nfa_regprog_T *prog,
7201 colnr_T col,
7202 proftime_T *tm UNUSED, // timeout limit or NULL
7203 int *timed_out UNUSED) // flag set on timeout or NULL
7204 {
7205 int i;
7206 regsubs_T subs, m;
7207 nfa_state_T *start = prog->start;
7208 int result;
7209 #ifdef ENABLE_LOG
7210 FILE *f;
7211 #endif
7212
7213 rex.input = rex.line + col;
7214 #ifdef FEAT_RELTIME
7215 nfa_time_limit = tm;
7216 nfa_timed_out = timed_out;
7217 nfa_time_count = 0;
7218 #endif
7219
7220 #ifdef ENABLE_LOG
7221 f = fopen(NFA_REGEXP_RUN_LOG, "a");
7222 if (f != NULL)
7223 {
7224 fprintf(f, "\n\n\t=======================================================\n");
7225 #ifdef DEBUG
7226 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
7227 #endif
7228 fprintf(f, "\tInput text is \"%s\" \n", rex.input);
7229 fprintf(f, "\t=======================================================\n\n");
7230 nfa_print_state(f, start);
7231 fprintf(f, "\n\n");
7232 fclose(f);
7233 }
7234 else
7235 emsg("Could not open temporary log file for writing");
7236 #endif
7237
7238 clear_sub(&subs.norm);
7239 clear_sub(&m.norm);
7240 #ifdef FEAT_SYN_HL
7241 clear_sub(&subs.synt);
7242 clear_sub(&m.synt);
7243 #endif
7244
7245 result = nfa_regmatch(prog, start, &subs, &m);
7246 if (result == FALSE)
7247 return 0;
7248 else if (result == NFA_TOO_EXPENSIVE)
7249 return result;
7250
7251 cleanup_subexpr();
7252 if (REG_MULTI)
7253 {
7254 for (i = 0; i < subs.norm.in_use; i++)
7255 {
7256 rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
7257 rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;
7258
7259 rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
7260 rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
7261 }
7262
7263 if (rex.reg_startpos[0].lnum < 0)
7264 {
7265 rex.reg_startpos[0].lnum = 0;
7266 rex.reg_startpos[0].col = col;
7267 }
7268 if (rex.reg_endpos[0].lnum < 0)
7269 {
7270 // pattern has a \ze but it didn't match, use current end
7271 rex.reg_endpos[0].lnum = rex.lnum;
7272 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
7273 }
7274 else
7275 // Use line number of "\ze".
7276 rex.lnum = rex.reg_endpos[0].lnum;
7277 }
7278 else
7279 {
7280 for (i = 0; i < subs.norm.in_use; i++)
7281 {
7282 rex.reg_startp[i] = subs.norm.list.line[i].start;
7283 rex.reg_endp[i] = subs.norm.list.line[i].end;
7284 }
7285
7286 if (rex.reg_startp[0] == NULL)
7287 rex.reg_startp[0] = rex.line + col;
7288 if (rex.reg_endp[0] == NULL)
7289 rex.reg_endp[0] = rex.input;
7290 }
7291
7292 #ifdef FEAT_SYN_HL
7293 // Package any found \z(...\) matches for export. Default is none.
7294 unref_extmatch(re_extmatch_out);
7295 re_extmatch_out = NULL;
7296
7297 if (prog->reghasz == REX_SET)
7298 {
7299 cleanup_zsubexpr();
7300 re_extmatch_out = make_extmatch();
7301 if (re_extmatch_out == NULL)
7302 return 0;
7303 // Loop over \z1, \z2, etc. There is no \z0.
7304 for (i = 1; i < subs.synt.in_use; i++)
7305 {
7306 if (REG_MULTI)
7307 {
7308 struct multipos *mpos = &subs.synt.list.multi[i];
7309
7310 // Only accept single line matches that are valid.
7311 if (mpos->start_lnum >= 0
7312 && mpos->start_lnum == mpos->end_lnum
7313 && mpos->end_col >= mpos->start_col)
7314 re_extmatch_out->matches[i] =
7315 vim_strnsave(reg_getline(mpos->start_lnum)
7316 + mpos->start_col,
7317 mpos->end_col - mpos->start_col);
7318 }
7319 else
7320 {
7321 struct linepos *lpos = &subs.synt.list.line[i];
7322
7323 if (lpos->start != NULL && lpos->end != NULL)
7324 re_extmatch_out->matches[i] =
7325 vim_strnsave(lpos->start, lpos->end - lpos->start);
7326 }
7327 }
7328 }
7329 #endif
7330
7331 return 1 + rex.lnum;
7332 }
7333
7334 /*
7335 * Match a regexp against a string ("line" points to the string) or multiple
7336 * lines (if "line" is NULL, use reg_getline()).
7337 *
7338 * Returns <= 0 for failure, number of lines contained in the match otherwise.
7339 */
7340 static long
nfa_regexec_both(char_u * line,colnr_T startcol,proftime_T * tm,int * timed_out)7341 nfa_regexec_both(
7342 char_u *line,
7343 colnr_T startcol, // column to start looking for match
7344 proftime_T *tm, // timeout limit or NULL
7345 int *timed_out) // flag set on timeout or NULL
7346 {
7347 nfa_regprog_T *prog;
7348 long retval = 0L;
7349 int i;
7350 colnr_T col = startcol;
7351
7352 if (REG_MULTI)
7353 {
7354 prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
7355 line = reg_getline((linenr_T)0); // relative to the cursor
7356 rex.reg_startpos = rex.reg_mmatch->startpos;
7357 rex.reg_endpos = rex.reg_mmatch->endpos;
7358 }
7359 else
7360 {
7361 prog = (nfa_regprog_T *)rex.reg_match->regprog;
7362 rex.reg_startp = rex.reg_match->startp;
7363 rex.reg_endp = rex.reg_match->endp;
7364 }
7365
7366 // Be paranoid...
7367 if (prog == NULL || line == NULL)
7368 {
7369 iemsg(_(e_null_argument));
7370 goto theend;
7371 }
7372
7373 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
7374 if (prog->regflags & RF_ICASE)
7375 rex.reg_ic = TRUE;
7376 else if (prog->regflags & RF_NOICASE)
7377 rex.reg_ic = FALSE;
7378
7379 // If pattern contains "\Z" overrule value of rex.reg_icombine
7380 if (prog->regflags & RF_ICOMBINE)
7381 rex.reg_icombine = TRUE;
7382
7383 rex.line = line;
7384 rex.lnum = 0; // relative to line
7385
7386 rex.nfa_has_zend = prog->has_zend;
7387 rex.nfa_has_backref = prog->has_backref;
7388 rex.nfa_nsubexpr = prog->nsubexp;
7389 rex.nfa_listid = 1;
7390 rex.nfa_alt_listid = 2;
7391 #ifdef DEBUG
7392 nfa_regengine.expr = prog->pattern;
7393 #endif
7394
7395 if (prog->reganch && col > 0)
7396 return 0L;
7397
7398 rex.need_clear_subexpr = TRUE;
7399 #ifdef FEAT_SYN_HL
7400 // Clear the external match subpointers if necessary.
7401 if (prog->reghasz == REX_SET)
7402 {
7403 rex.nfa_has_zsubexpr = TRUE;
7404 rex.need_clear_zsubexpr = TRUE;
7405 }
7406 else
7407 {
7408 rex.nfa_has_zsubexpr = FALSE;
7409 rex.need_clear_zsubexpr = FALSE;
7410 }
7411 #endif
7412
7413 if (prog->regstart != NUL)
7414 {
7415 // Skip ahead until a character we know the match must start with.
7416 // When there is none there is no match.
7417 if (skip_to_start(prog->regstart, &col) == FAIL)
7418 return 0L;
7419
7420 // If match_text is set it contains the full text that must match.
7421 // Nothing else to try. Doesn't handle combining chars well.
7422 if (prog->match_text != NULL && !rex.reg_icombine)
7423 return find_match_text(col, prog->regstart, prog->match_text);
7424 }
7425
7426 // If the start column is past the maximum column: no need to try.
7427 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
7428 goto theend;
7429
7430 // Set the "nstate" used by nfa_regcomp() to zero to trigger an error when
7431 // it's accidentally used during execution.
7432 nstate = 0;
7433 for (i = 0; i < prog->nstate; ++i)
7434 {
7435 prog->state[i].id = i;
7436 prog->state[i].lastlist[0] = 0;
7437 prog->state[i].lastlist[1] = 0;
7438 }
7439
7440 retval = nfa_regtry(prog, col, tm, timed_out);
7441
7442 #ifdef DEBUG
7443 nfa_regengine.expr = NULL;
7444 #endif
7445
7446 theend:
7447 if (retval > 0)
7448 {
7449 // Make sure the end is never before the start. Can happen when \zs and
7450 // \ze are used.
7451 if (REG_MULTI)
7452 {
7453 lpos_T *start = &rex.reg_mmatch->startpos[0];
7454 lpos_T *end = &rex.reg_mmatch->endpos[0];
7455
7456 if (end->lnum < start->lnum
7457 || (end->lnum == start->lnum && end->col < start->col))
7458 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
7459 }
7460 else
7461 {
7462 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
7463 rex.reg_match->endp[0] = rex.reg_match->startp[0];
7464 }
7465 }
7466
7467 return retval;
7468 }
7469
7470 /*
7471 * Compile a regular expression into internal code for the NFA matcher.
7472 * Returns the program in allocated space. Returns NULL for an error.
7473 */
7474 static regprog_T *
nfa_regcomp(char_u * expr,int re_flags)7475 nfa_regcomp(char_u *expr, int re_flags)
7476 {
7477 nfa_regprog_T *prog = NULL;
7478 size_t prog_size;
7479 int *postfix;
7480
7481 if (expr == NULL)
7482 return NULL;
7483
7484 #ifdef DEBUG
7485 nfa_regengine.expr = expr;
7486 #endif
7487 nfa_re_flags = re_flags;
7488
7489 init_class_tab();
7490
7491 if (nfa_regcomp_start(expr, re_flags) == FAIL)
7492 return NULL;
7493
7494 // Build postfix form of the regexp. Needed to build the NFA
7495 // (and count its size).
7496 postfix = re2post();
7497 if (postfix == NULL)
7498 goto fail; // Cascaded (syntax?) error
7499
7500 /*
7501 * In order to build the NFA, we parse the input regexp twice:
7502 * 1. first pass to count size (so we can allocate space)
7503 * 2. second to emit code
7504 */
7505 #ifdef ENABLE_LOG
7506 {
7507 FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
7508
7509 if (f != NULL)
7510 {
7511 fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\"... hold on !\n", expr);
7512 fclose(f);
7513 }
7514 }
7515 #endif
7516
7517 /*
7518 * PASS 1
7519 * Count number of NFA states in "nstate". Do not build the NFA.
7520 */
7521 post2nfa(postfix, post_ptr, TRUE);
7522
7523 // allocate the regprog with space for the compiled regexp
7524 prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * (nstate - 1);
7525 prog = alloc(prog_size);
7526 if (prog == NULL)
7527 goto fail;
7528 state_ptr = prog->state;
7529 prog->re_in_use = FALSE;
7530
7531 /*
7532 * PASS 2
7533 * Build the NFA
7534 */
7535 prog->start = post2nfa(postfix, post_ptr, FALSE);
7536 if (prog->start == NULL)
7537 goto fail;
7538
7539 prog->regflags = regflags;
7540 prog->engine = &nfa_regengine;
7541 prog->nstate = nstate;
7542 prog->has_zend = rex.nfa_has_zend;
7543 prog->has_backref = rex.nfa_has_backref;
7544 prog->nsubexp = regnpar;
7545
7546 nfa_postprocess(prog);
7547
7548 prog->reganch = nfa_get_reganch(prog->start, 0);
7549 prog->regstart = nfa_get_regstart(prog->start, 0);
7550 prog->match_text = nfa_get_match_text(prog->start);
7551
7552 #ifdef ENABLE_LOG
7553 nfa_postfix_dump(expr, OK);
7554 nfa_dump(prog);
7555 #endif
7556 #ifdef FEAT_SYN_HL
7557 // Remember whether this pattern has any \z specials in it.
7558 prog->reghasz = re_has_z;
7559 #endif
7560 prog->pattern = vim_strsave(expr);
7561 #ifdef DEBUG
7562 nfa_regengine.expr = NULL;
7563 #endif
7564
7565 out:
7566 VIM_CLEAR(post_start);
7567 post_ptr = post_end = NULL;
7568 state_ptr = NULL;
7569 return (regprog_T *)prog;
7570
7571 fail:
7572 VIM_CLEAR(prog);
7573 #ifdef ENABLE_LOG
7574 nfa_postfix_dump(expr, FAIL);
7575 #endif
7576 #ifdef DEBUG
7577 nfa_regengine.expr = NULL;
7578 #endif
7579 goto out;
7580 }
7581
7582 /*
7583 * Free a compiled regexp program, returned by nfa_regcomp().
7584 */
7585 static void
nfa_regfree(regprog_T * prog)7586 nfa_regfree(regprog_T *prog)
7587 {
7588 if (prog != NULL)
7589 {
7590 vim_free(((nfa_regprog_T *)prog)->match_text);
7591 vim_free(((nfa_regprog_T *)prog)->pattern);
7592 vim_free(prog);
7593 }
7594 }
7595
7596 /*
7597 * Match a regexp against a string.
7598 * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
7599 * Uses curbuf for line count and 'iskeyword'.
7600 * If "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
7601 *
7602 * Returns <= 0 for failure, number of lines contained in the match otherwise.
7603 */
7604 static int
nfa_regexec_nl(regmatch_T * rmp,char_u * line,colnr_T col,int line_lbr)7605 nfa_regexec_nl(
7606 regmatch_T *rmp,
7607 char_u *line, // string to match against
7608 colnr_T col, // column to start looking for match
7609 int line_lbr)
7610 {
7611 rex.reg_match = rmp;
7612 rex.reg_mmatch = NULL;
7613 rex.reg_maxline = 0;
7614 rex.reg_line_lbr = line_lbr;
7615 rex.reg_buf = curbuf;
7616 rex.reg_win = NULL;
7617 rex.reg_ic = rmp->rm_ic;
7618 rex.reg_icombine = FALSE;
7619 rex.reg_maxcol = 0;
7620 return nfa_regexec_both(line, col, NULL, NULL);
7621 }
7622
7623
7624 /*
7625 * Match a regexp against multiple lines.
7626 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7627 * Uses curbuf for line count and 'iskeyword'.
7628 *
7629 * Return <= 0 if there is no match. Return number of lines contained in the
7630 * match otherwise.
7631 *
7632 * Note: the body is the same as bt_regexec() except for nfa_regexec_both()
7633 *
7634 * ! Also NOTE : match may actually be in another line. e.g.:
7635 * when r.e. is \nc, cursor is at 'a' and the text buffer looks like
7636 *
7637 * +-------------------------+
7638 * |a |
7639 * |b |
7640 * |c |
7641 * | |
7642 * +-------------------------+
7643 *
7644 * then nfa_regexec_multi() returns 3. while the original
7645 * vim_regexec_multi() returns 0 and a second call at line 2 will return 2.
7646 *
7647 * FIXME if this behavior is not compatible.
7648 */
7649 static long
nfa_regexec_multi(regmmatch_T * rmp,win_T * win,buf_T * buf,linenr_T lnum,colnr_T col,proftime_T * tm,int * timed_out)7650 nfa_regexec_multi(
7651 regmmatch_T *rmp,
7652 win_T *win, // window in which to search or NULL
7653 buf_T *buf, // buffer in which to search
7654 linenr_T lnum, // nr of line to start looking for match
7655 colnr_T col, // column to start looking for match
7656 proftime_T *tm, // timeout limit or NULL
7657 int *timed_out) // flag set on timeout or NULL
7658 {
7659 init_regexec_multi(rmp, win, buf, lnum);
7660 return nfa_regexec_both(NULL, col, tm, timed_out);
7661 }
7662
7663 #ifdef DEBUG
7664 # undef ENABLE_LOG
7665 #endif
7666