1 /* vi:set ts=8 sts=4 sw=4: 2 * 3 * NFA regular expression implementation. 4 * 5 * This file is included in "regexp.c". 6 */ 7 8 /* 9 * Logging of NFA engine. 10 * 11 * The NFA engine can write four log files: 12 * - Error log: Contains NFA engine's fatal errors. 13 * - Dump log: Contains compiled NFA state machine's information. 14 * - Run log: Contains information of matching procedure. 15 * - Debug log: Contains detailed information of matching procedure. Can be 16 * disabled by undefining NFA_REGEXP_DEBUG_LOG. 17 * The first one can also be used without debug mode. 18 * The last three are enabled when compiled as debug mode and individually 19 * disabled by commenting them out. 20 * The log files can get quite big! 21 * Do disable all of this when compiling Vim for debugging, undefine DEBUG in 22 * regexp.c 23 */ 24 #ifdef DEBUG 25 # define NFA_REGEXP_ERROR_LOG "nfa_regexp_error.log" 26 # define ENABLE_LOG 27 # define NFA_REGEXP_DUMP_LOG "nfa_regexp_dump.log" 28 # define NFA_REGEXP_RUN_LOG "nfa_regexp_run.log" 29 # define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log" 30 #endif 31 32 /* Added to NFA_ANY - NFA_NUPPER_IC to include a NL. */ 33 #define NFA_ADD_NL 31 34 35 enum 36 { 37 NFA_SPLIT = -1024, 38 NFA_MATCH, 39 NFA_EMPTY, /* matches 0-length */ 40 41 NFA_START_COLL, /* [abc] start */ 42 NFA_END_COLL, /* [abc] end */ 43 NFA_START_NEG_COLL, /* [^abc] start */ 44 NFA_END_NEG_COLL, /* [^abc] end (postfix only) */ 45 NFA_RANGE, /* range of the two previous items 46 * (postfix only) */ 47 NFA_RANGE_MIN, /* low end of a range */ 48 NFA_RANGE_MAX, /* high end of a range */ 49 50 NFA_CONCAT, /* concatenate two previous items (postfix 51 * only) */ 52 NFA_OR, /* \| (postfix only) */ 53 NFA_STAR, /* greedy * (posfix only) */ 54 NFA_STAR_NONGREEDY, /* non-greedy * (postfix only) */ 55 NFA_QUEST, /* greedy \? (postfix only) */ 56 NFA_QUEST_NONGREEDY, /* non-greedy \? (postfix only) */ 57 58 NFA_BOL, /* ^ Begin line */ 59 NFA_EOL, /* $ End line */ 60 NFA_BOW, /* \< Begin word */ 61 NFA_EOW, /* \> End word */ 62 NFA_BOF, /* \%^ Begin file */ 63 NFA_EOF, /* \%$ End file */ 64 NFA_NEWL, 65 NFA_ZSTART, /* Used for \zs */ 66 NFA_ZEND, /* Used for \ze */ 67 NFA_NOPEN, /* Start of subexpression marked with \%( */ 68 NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */ 69 NFA_START_INVISIBLE, 70 NFA_START_INVISIBLE_FIRST, 71 NFA_START_INVISIBLE_NEG, 72 NFA_START_INVISIBLE_NEG_FIRST, 73 NFA_START_INVISIBLE_BEFORE, 74 NFA_START_INVISIBLE_BEFORE_FIRST, 75 NFA_START_INVISIBLE_BEFORE_NEG, 76 NFA_START_INVISIBLE_BEFORE_NEG_FIRST, 77 NFA_START_PATTERN, 78 NFA_END_INVISIBLE, 79 NFA_END_INVISIBLE_NEG, 80 NFA_END_PATTERN, 81 NFA_COMPOSING, /* Next nodes in NFA are part of the 82 composing multibyte char */ 83 NFA_END_COMPOSING, /* End of a composing char in the NFA */ 84 NFA_ANY_COMPOSING, /* \%C: Any composing characters. */ 85 NFA_OPT_CHARS, /* \%[abc] */ 86 87 /* The following are used only in the postfix form, not in the NFA */ 88 NFA_PREV_ATOM_NO_WIDTH, /* Used for \@= */ 89 NFA_PREV_ATOM_NO_WIDTH_NEG, /* Used for \@! */ 90 NFA_PREV_ATOM_JUST_BEFORE, /* Used for \@<= */ 91 NFA_PREV_ATOM_JUST_BEFORE_NEG, /* Used for \@<! */ 92 NFA_PREV_ATOM_LIKE_PATTERN, /* Used for \@> */ 93 94 NFA_BACKREF1, /* \1 */ 95 NFA_BACKREF2, /* \2 */ 96 NFA_BACKREF3, /* \3 */ 97 NFA_BACKREF4, /* \4 */ 98 NFA_BACKREF5, /* \5 */ 99 NFA_BACKREF6, /* \6 */ 100 NFA_BACKREF7, /* \7 */ 101 NFA_BACKREF8, /* \8 */ 102 NFA_BACKREF9, /* \9 */ 103 #ifdef FEAT_SYN_HL 104 NFA_ZREF1, /* \z1 */ 105 NFA_ZREF2, /* \z2 */ 106 NFA_ZREF3, /* \z3 */ 107 NFA_ZREF4, /* \z4 */ 108 NFA_ZREF5, /* \z5 */ 109 NFA_ZREF6, /* \z6 */ 110 NFA_ZREF7, /* \z7 */ 111 NFA_ZREF8, /* \z8 */ 112 NFA_ZREF9, /* \z9 */ 113 #endif 114 NFA_SKIP, /* Skip characters */ 115 116 NFA_MOPEN, 117 NFA_MOPEN1, 118 NFA_MOPEN2, 119 NFA_MOPEN3, 120 NFA_MOPEN4, 121 NFA_MOPEN5, 122 NFA_MOPEN6, 123 NFA_MOPEN7, 124 NFA_MOPEN8, 125 NFA_MOPEN9, 126 127 NFA_MCLOSE, 128 NFA_MCLOSE1, 129 NFA_MCLOSE2, 130 NFA_MCLOSE3, 131 NFA_MCLOSE4, 132 NFA_MCLOSE5, 133 NFA_MCLOSE6, 134 NFA_MCLOSE7, 135 NFA_MCLOSE8, 136 NFA_MCLOSE9, 137 138 #ifdef FEAT_SYN_HL 139 NFA_ZOPEN, 140 NFA_ZOPEN1, 141 NFA_ZOPEN2, 142 NFA_ZOPEN3, 143 NFA_ZOPEN4, 144 NFA_ZOPEN5, 145 NFA_ZOPEN6, 146 NFA_ZOPEN7, 147 NFA_ZOPEN8, 148 NFA_ZOPEN9, 149 150 NFA_ZCLOSE, 151 NFA_ZCLOSE1, 152 NFA_ZCLOSE2, 153 NFA_ZCLOSE3, 154 NFA_ZCLOSE4, 155 NFA_ZCLOSE5, 156 NFA_ZCLOSE6, 157 NFA_ZCLOSE7, 158 NFA_ZCLOSE8, 159 NFA_ZCLOSE9, 160 #endif 161 162 /* NFA_FIRST_NL */ 163 NFA_ANY, /* Match any one character. */ 164 NFA_IDENT, /* Match identifier char */ 165 NFA_SIDENT, /* Match identifier char but no digit */ 166 NFA_KWORD, /* Match keyword char */ 167 NFA_SKWORD, /* Match word char but no digit */ 168 NFA_FNAME, /* Match file name char */ 169 NFA_SFNAME, /* Match file name char but no digit */ 170 NFA_PRINT, /* Match printable char */ 171 NFA_SPRINT, /* Match printable char but no digit */ 172 NFA_WHITE, /* Match whitespace char */ 173 NFA_NWHITE, /* Match non-whitespace char */ 174 NFA_DIGIT, /* Match digit char */ 175 NFA_NDIGIT, /* Match non-digit char */ 176 NFA_HEX, /* Match hex char */ 177 NFA_NHEX, /* Match non-hex char */ 178 NFA_OCTAL, /* Match octal char */ 179 NFA_NOCTAL, /* Match non-octal char */ 180 NFA_WORD, /* Match word char */ 181 NFA_NWORD, /* Match non-word char */ 182 NFA_HEAD, /* Match head char */ 183 NFA_NHEAD, /* Match non-head char */ 184 NFA_ALPHA, /* Match alpha char */ 185 NFA_NALPHA, /* Match non-alpha char */ 186 NFA_LOWER, /* Match lowercase char */ 187 NFA_NLOWER, /* Match non-lowercase char */ 188 NFA_UPPER, /* Match uppercase char */ 189 NFA_NUPPER, /* Match non-uppercase char */ 190 NFA_LOWER_IC, /* Match [a-z] */ 191 NFA_NLOWER_IC, /* Match [^a-z] */ 192 NFA_UPPER_IC, /* Match [A-Z] */ 193 NFA_NUPPER_IC, /* Match [^A-Z] */ 194 195 NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL, 196 NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL, 197 198 NFA_CURSOR, /* Match cursor pos */ 199 NFA_LNUM, /* Match line number */ 200 NFA_LNUM_GT, /* Match > line number */ 201 NFA_LNUM_LT, /* Match < line number */ 202 NFA_COL, /* Match cursor column */ 203 NFA_COL_GT, /* Match > cursor column */ 204 NFA_COL_LT, /* Match < cursor column */ 205 NFA_VCOL, /* Match cursor virtual column */ 206 NFA_VCOL_GT, /* Match > cursor virtual column */ 207 NFA_VCOL_LT, /* Match < cursor virtual column */ 208 NFA_MARK, /* Match mark */ 209 NFA_MARK_GT, /* Match > mark */ 210 NFA_MARK_LT, /* Match < mark */ 211 NFA_VISUAL, /* Match Visual area */ 212 213 /* Character classes [:alnum:] etc */ 214 NFA_CLASS_ALNUM, 215 NFA_CLASS_ALPHA, 216 NFA_CLASS_BLANK, 217 NFA_CLASS_CNTRL, 218 NFA_CLASS_DIGIT, 219 NFA_CLASS_GRAPH, 220 NFA_CLASS_LOWER, 221 NFA_CLASS_PRINT, 222 NFA_CLASS_PUNCT, 223 NFA_CLASS_SPACE, 224 NFA_CLASS_UPPER, 225 NFA_CLASS_XDIGIT, 226 NFA_CLASS_TAB, 227 NFA_CLASS_RETURN, 228 NFA_CLASS_BACKSPACE, 229 NFA_CLASS_ESCAPE 230 }; 231 232 /* Keep in sync with classchars. */ 233 static int nfa_classcodes[] = { 234 NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD, 235 NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT, 236 NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT, 237 NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL, 238 NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD, 239 NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER, 240 NFA_UPPER, NFA_NUPPER 241 }; 242 243 static char_u e_nul_found[] = N_("E865: (NFA) Regexp end encountered prematurely"); 244 static char_u e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c"); 245 static char_u e_ill_char_class[] = N_("E877: (NFA regexp) Invalid character class: %ld"); 246 247 /* re_flags passed to nfa_regcomp() */ 248 static int nfa_re_flags; 249 250 /* NFA regexp \ze operator encountered. */ 251 static int nfa_has_zend; 252 253 /* NFA regexp \1 .. \9 encountered. */ 254 static int nfa_has_backref; 255 256 #ifdef FEAT_SYN_HL 257 /* NFA regexp has \z( ), set zsubexpr. */ 258 static int nfa_has_zsubexpr; 259 #endif 260 261 /* Number of sub expressions actually being used during execution. 1 if only 262 * the whole match (subexpr 0) is used. */ 263 static int nfa_nsubexpr; 264 265 static int *post_start; /* holds the postfix form of r.e. */ 266 static int *post_end; 267 static int *post_ptr; 268 269 static int nstate; /* Number of states in the NFA. Also used when 270 * executing. */ 271 static int istate; /* Index in the state vector, used in alloc_state() */ 272 273 /* If not NULL match must end at this position */ 274 static save_se_T *nfa_endp = NULL; 275 276 /* listid is global, so that it increases on recursive calls to 277 * nfa_regmatch(), which means we don't have to clear the lastlist field of 278 * all the states. */ 279 static int nfa_listid; 280 static int nfa_alt_listid; 281 282 /* 0 for first call to nfa_regmatch(), 1 for recursive call. */ 283 static int nfa_ll_index = 0; 284 285 static int nfa_regcomp_start __ARGS((char_u *expr, int re_flags)); 286 static int nfa_get_reganch __ARGS((nfa_state_T *start, int depth)); 287 static int nfa_get_regstart __ARGS((nfa_state_T *start, int depth)); 288 static char_u *nfa_get_match_text __ARGS((nfa_state_T *start)); 289 static int realloc_post_list __ARGS((void)); 290 static int nfa_recognize_char_class __ARGS((char_u *start, char_u *end, int extra_newl)); 291 static int nfa_emit_equi_class __ARGS((int c)); 292 static int nfa_regatom __ARGS((void)); 293 static int nfa_regpiece __ARGS((void)); 294 static int nfa_regconcat __ARGS((void)); 295 static int nfa_regbranch __ARGS((void)); 296 static int nfa_reg __ARGS((int paren)); 297 #ifdef DEBUG 298 static void nfa_set_code __ARGS((int c)); 299 static void nfa_postfix_dump __ARGS((char_u *expr, int retval)); 300 static void nfa_print_state __ARGS((FILE *debugf, nfa_state_T *state)); 301 static void nfa_print_state2 __ARGS((FILE *debugf, nfa_state_T *state, garray_T *indent)); 302 static void nfa_dump __ARGS((nfa_regprog_T *prog)); 303 #endif 304 static int *re2post __ARGS((void)); 305 static nfa_state_T *alloc_state __ARGS((int c, nfa_state_T *out, nfa_state_T *out1)); 306 static void st_error __ARGS((int *postfix, int *end, int *p)); 307 static int nfa_max_width __ARGS((nfa_state_T *startstate, int depth)); 308 static nfa_state_T *post2nfa __ARGS((int *postfix, int *end, int nfa_calc_size)); 309 static void nfa_postprocess __ARGS((nfa_regprog_T *prog)); 310 static int check_char_class __ARGS((int class, int c)); 311 static void nfa_save_listids __ARGS((nfa_regprog_T *prog, int *list)); 312 static void nfa_restore_listids __ARGS((nfa_regprog_T *prog, int *list)); 313 static int nfa_re_num_cmp __ARGS((long_u val, int op, long_u pos)); 314 static long nfa_regtry __ARGS((nfa_regprog_T *prog, colnr_T col, proftime_T *tm)); 315 static long nfa_regexec_both __ARGS((char_u *line, colnr_T col, proftime_T *tm)); 316 static regprog_T *nfa_regcomp __ARGS((char_u *expr, int re_flags)); 317 static void nfa_regfree __ARGS((regprog_T *prog)); 318 static int nfa_regexec_nl __ARGS((regmatch_T *rmp, char_u *line, colnr_T col, int line_lbr)); 319 static long nfa_regexec_multi __ARGS((regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum, colnr_T col, proftime_T *tm)); 320 static int match_follows __ARGS((nfa_state_T *startstate, int depth)); 321 static int failure_chance __ARGS((nfa_state_T *state, int depth)); 322 323 /* helper functions used when doing re2post() ... regatom() parsing */ 324 #define EMIT(c) do { \ 325 if (post_ptr >= post_end && realloc_post_list() == FAIL) \ 326 return FAIL; \ 327 *post_ptr++ = c; \ 328 } while (0) 329 330 /* 331 * Initialize internal variables before NFA compilation. 332 * Return OK on success, FAIL otherwise. 333 */ 334 static int 335 nfa_regcomp_start(expr, re_flags) 336 char_u *expr; 337 int re_flags; /* see vim_regcomp() */ 338 { 339 size_t postfix_size; 340 int nstate_max; 341 342 nstate = 0; 343 istate = 0; 344 /* A reasonable estimation for maximum size */ 345 nstate_max = (int)(STRLEN(expr) + 1) * 25; 346 347 /* Some items blow up in size, such as [A-z]. Add more space for that. 348 * When it is still not enough realloc_post_list() will be used. */ 349 nstate_max += 1000; 350 351 /* Size for postfix representation of expr. */ 352 postfix_size = sizeof(int) * nstate_max; 353 354 post_start = (int *)lalloc(postfix_size, TRUE); 355 if (post_start == NULL) 356 return FAIL; 357 post_ptr = post_start; 358 post_end = post_start + nstate_max; 359 nfa_has_zend = FALSE; 360 nfa_has_backref = FALSE; 361 362 /* shared with BT engine */ 363 regcomp_start(expr, re_flags); 364 365 return OK; 366 } 367 368 /* 369 * Figure out if the NFA state list starts with an anchor, must match at start 370 * of the line. 371 */ 372 static int 373 nfa_get_reganch(start, depth) 374 nfa_state_T *start; 375 int depth; 376 { 377 nfa_state_T *p = start; 378 379 if (depth > 4) 380 return 0; 381 382 while (p != NULL) 383 { 384 switch (p->c) 385 { 386 case NFA_BOL: 387 case NFA_BOF: 388 return 1; /* yes! */ 389 390 case NFA_ZSTART: 391 case NFA_ZEND: 392 case NFA_CURSOR: 393 case NFA_VISUAL: 394 395 case NFA_MOPEN: 396 case NFA_MOPEN1: 397 case NFA_MOPEN2: 398 case NFA_MOPEN3: 399 case NFA_MOPEN4: 400 case NFA_MOPEN5: 401 case NFA_MOPEN6: 402 case NFA_MOPEN7: 403 case NFA_MOPEN8: 404 case NFA_MOPEN9: 405 case NFA_NOPEN: 406 #ifdef FEAT_SYN_HL 407 case NFA_ZOPEN: 408 case NFA_ZOPEN1: 409 case NFA_ZOPEN2: 410 case NFA_ZOPEN3: 411 case NFA_ZOPEN4: 412 case NFA_ZOPEN5: 413 case NFA_ZOPEN6: 414 case NFA_ZOPEN7: 415 case NFA_ZOPEN8: 416 case NFA_ZOPEN9: 417 #endif 418 p = p->out; 419 break; 420 421 case NFA_SPLIT: 422 return nfa_get_reganch(p->out, depth + 1) 423 && nfa_get_reganch(p->out1, depth + 1); 424 425 default: 426 return 0; /* noooo */ 427 } 428 } 429 return 0; 430 } 431 432 /* 433 * Figure out if the NFA state list starts with a character which must match 434 * at start of the match. 435 */ 436 static int 437 nfa_get_regstart(start, depth) 438 nfa_state_T *start; 439 int depth; 440 { 441 nfa_state_T *p = start; 442 443 if (depth > 4) 444 return 0; 445 446 while (p != NULL) 447 { 448 switch (p->c) 449 { 450 /* all kinds of zero-width matches */ 451 case NFA_BOL: 452 case NFA_BOF: 453 case NFA_BOW: 454 case NFA_EOW: 455 case NFA_ZSTART: 456 case NFA_ZEND: 457 case NFA_CURSOR: 458 case NFA_VISUAL: 459 case NFA_LNUM: 460 case NFA_LNUM_GT: 461 case NFA_LNUM_LT: 462 case NFA_COL: 463 case NFA_COL_GT: 464 case NFA_COL_LT: 465 case NFA_VCOL: 466 case NFA_VCOL_GT: 467 case NFA_VCOL_LT: 468 case NFA_MARK: 469 case NFA_MARK_GT: 470 case NFA_MARK_LT: 471 472 case NFA_MOPEN: 473 case NFA_MOPEN1: 474 case NFA_MOPEN2: 475 case NFA_MOPEN3: 476 case NFA_MOPEN4: 477 case NFA_MOPEN5: 478 case NFA_MOPEN6: 479 case NFA_MOPEN7: 480 case NFA_MOPEN8: 481 case NFA_MOPEN9: 482 case NFA_NOPEN: 483 #ifdef FEAT_SYN_HL 484 case NFA_ZOPEN: 485 case NFA_ZOPEN1: 486 case NFA_ZOPEN2: 487 case NFA_ZOPEN3: 488 case NFA_ZOPEN4: 489 case NFA_ZOPEN5: 490 case NFA_ZOPEN6: 491 case NFA_ZOPEN7: 492 case NFA_ZOPEN8: 493 case NFA_ZOPEN9: 494 #endif 495 p = p->out; 496 break; 497 498 case NFA_SPLIT: 499 { 500 int c1 = nfa_get_regstart(p->out, depth + 1); 501 int c2 = nfa_get_regstart(p->out1, depth + 1); 502 503 if (c1 == c2) 504 return c1; /* yes! */ 505 return 0; 506 } 507 508 default: 509 if (p->c > 0) 510 return p->c; /* yes! */ 511 return 0; 512 } 513 } 514 return 0; 515 } 516 517 /* 518 * Figure out if the NFA state list contains just literal text and nothing 519 * else. If so return a string in allocated memory with what must match after 520 * regstart. Otherwise return NULL. 521 */ 522 static char_u * 523 nfa_get_match_text(start) 524 nfa_state_T *start; 525 { 526 nfa_state_T *p = start; 527 int len = 0; 528 char_u *ret; 529 char_u *s; 530 531 if (p->c != NFA_MOPEN) 532 return NULL; /* just in case */ 533 p = p->out; 534 while (p->c > 0) 535 { 536 len += MB_CHAR2LEN(p->c); 537 p = p->out; 538 } 539 if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH) 540 return NULL; 541 542 ret = alloc(len); 543 if (ret != NULL) 544 { 545 p = start->out->out; /* skip first char, it goes into regstart */ 546 s = ret; 547 while (p->c > 0) 548 { 549 #ifdef FEAT_MBYTE 550 if (has_mbyte) 551 s += (*mb_char2bytes)(p->c, s); 552 else 553 #endif 554 *s++ = p->c; 555 p = p->out; 556 } 557 *s = NUL; 558 } 559 return ret; 560 } 561 562 /* 563 * Allocate more space for post_start. Called when 564 * running above the estimated number of states. 565 */ 566 static int 567 realloc_post_list() 568 { 569 int nstate_max = (int)(post_end - post_start); 570 int new_max = nstate_max + 1000; 571 int *new_start; 572 int *old_start; 573 574 new_start = (int *)lalloc(new_max * sizeof(int), TRUE); 575 if (new_start == NULL) 576 return FAIL; 577 mch_memmove(new_start, post_start, nstate_max * sizeof(int)); 578 old_start = post_start; 579 post_start = new_start; 580 post_ptr = new_start + (post_ptr - old_start); 581 post_end = post_start + new_max; 582 vim_free(old_start); 583 return OK; 584 } 585 586 /* 587 * Search between "start" and "end" and try to recognize a 588 * character class in expanded form. For example [0-9]. 589 * On success, return the id the character class to be emitted. 590 * On failure, return 0 (=FAIL) 591 * Start points to the first char of the range, while end should point 592 * to the closing brace. 593 * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may 594 * need to be interpreted as [a-zA-Z]. 595 */ 596 static int 597 nfa_recognize_char_class(start, end, extra_newl) 598 char_u *start; 599 char_u *end; 600 int extra_newl; 601 { 602 # define CLASS_not 0x80 603 # define CLASS_af 0x40 604 # define CLASS_AF 0x20 605 # define CLASS_az 0x10 606 # define CLASS_AZ 0x08 607 # define CLASS_o7 0x04 608 # define CLASS_o9 0x02 609 # define CLASS_underscore 0x01 610 611 int newl = FALSE; 612 char_u *p; 613 int config = 0; 614 615 if (extra_newl == TRUE) 616 newl = TRUE; 617 618 if (*end != ']') 619 return FAIL; 620 p = start; 621 if (*p == '^') 622 { 623 config |= CLASS_not; 624 p++; 625 } 626 627 while (p < end) 628 { 629 if (p + 2 < end && *(p + 1) == '-') 630 { 631 switch (*p) 632 { 633 case '0': 634 if (*(p + 2) == '9') 635 { 636 config |= CLASS_o9; 637 break; 638 } 639 else 640 if (*(p + 2) == '7') 641 { 642 config |= CLASS_o7; 643 break; 644 } 645 case 'a': 646 if (*(p + 2) == 'z') 647 { 648 config |= CLASS_az; 649 break; 650 } 651 else 652 if (*(p + 2) == 'f') 653 { 654 config |= CLASS_af; 655 break; 656 } 657 case 'A': 658 if (*(p + 2) == 'Z') 659 { 660 config |= CLASS_AZ; 661 break; 662 } 663 else 664 if (*(p + 2) == 'F') 665 { 666 config |= CLASS_AF; 667 break; 668 } 669 /* FALLTHROUGH */ 670 default: 671 return FAIL; 672 } 673 p += 3; 674 } 675 else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n') 676 { 677 newl = TRUE; 678 p += 2; 679 } 680 else if (*p == '_') 681 { 682 config |= CLASS_underscore; 683 p ++; 684 } 685 else if (*p == '\n') 686 { 687 newl = TRUE; 688 p ++; 689 } 690 else 691 return FAIL; 692 } /* while (p < end) */ 693 694 if (p != end) 695 return FAIL; 696 697 if (newl == TRUE) 698 extra_newl = NFA_ADD_NL; 699 700 switch (config) 701 { 702 case CLASS_o9: 703 return extra_newl + NFA_DIGIT; 704 case CLASS_not | CLASS_o9: 705 return extra_newl + NFA_NDIGIT; 706 case CLASS_af | CLASS_AF | CLASS_o9: 707 return extra_newl + NFA_HEX; 708 case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9: 709 return extra_newl + NFA_NHEX; 710 case CLASS_o7: 711 return extra_newl + NFA_OCTAL; 712 case CLASS_not | CLASS_o7: 713 return extra_newl + NFA_NOCTAL; 714 case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore: 715 return extra_newl + NFA_WORD; 716 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore: 717 return extra_newl + NFA_NWORD; 718 case CLASS_az | CLASS_AZ | CLASS_underscore: 719 return extra_newl + NFA_HEAD; 720 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore: 721 return extra_newl + NFA_NHEAD; 722 case CLASS_az | CLASS_AZ: 723 return extra_newl + NFA_ALPHA; 724 case CLASS_not | CLASS_az | CLASS_AZ: 725 return extra_newl + NFA_NALPHA; 726 case CLASS_az: 727 return extra_newl + NFA_LOWER_IC; 728 case CLASS_not | CLASS_az: 729 return extra_newl + NFA_NLOWER_IC; 730 case CLASS_AZ: 731 return extra_newl + NFA_UPPER_IC; 732 case CLASS_not | CLASS_AZ: 733 return extra_newl + NFA_NUPPER_IC; 734 } 735 return FAIL; 736 } 737 738 /* 739 * Produce the bytes for equivalence class "c". 740 * Currently only handles latin1, latin9 and utf-8. 741 * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is 742 * equivalent to 'a OR b OR c' 743 * 744 * NOTE! When changing this function, also update reg_equi_class() 745 */ 746 static int 747 nfa_emit_equi_class(c) 748 int c; 749 { 750 #define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT); 751 #ifdef FEAT_MBYTE 752 # define EMITMBC(c) EMIT(c); EMIT(NFA_CONCAT); 753 #else 754 # define EMITMBC(c) 755 #endif 756 757 #ifdef FEAT_MBYTE 758 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0 759 || STRCMP(p_enc, "iso-8859-15") == 0) 760 #endif 761 { 762 switch (c) 763 { 764 case 'A': case 0300: case 0301: case 0302: 765 case 0303: case 0304: case 0305: 766 CASEMBC(0x100) CASEMBC(0x102) CASEMBC(0x104) CASEMBC(0x1cd) 767 CASEMBC(0x1de) CASEMBC(0x1e0) CASEMBC(0x1ea2) 768 EMIT2('A'); EMIT2(0300); EMIT2(0301); EMIT2(0302); 769 EMIT2(0303); EMIT2(0304); EMIT2(0305); 770 EMITMBC(0x100) EMITMBC(0x102) EMITMBC(0x104) 771 EMITMBC(0x1cd) EMITMBC(0x1de) EMITMBC(0x1e0) 772 EMITMBC(0x1ea2) 773 return OK; 774 775 case 'B': CASEMBC(0x1e02) CASEMBC(0x1e06) 776 EMIT2('B'); EMITMBC(0x1e02) EMITMBC(0x1e06) 777 return OK; 778 779 case 'C': case 0307: 780 CASEMBC(0x106) CASEMBC(0x108) CASEMBC(0x10a) CASEMBC(0x10c) 781 EMIT2('C'); EMIT2(0307); EMITMBC(0x106) EMITMBC(0x108) 782 EMITMBC(0x10a) EMITMBC(0x10c) 783 return OK; 784 785 case 'D': CASEMBC(0x10e) CASEMBC(0x110) CASEMBC(0x1e0a) 786 CASEMBC(0x1e0e) CASEMBC(0x1e10) 787 EMIT2('D'); EMITMBC(0x10e) EMITMBC(0x110) EMITMBC(0x1e0a) 788 EMITMBC(0x1e0e) EMITMBC(0x1e10) 789 return OK; 790 791 case 'E': case 0310: case 0311: case 0312: case 0313: 792 CASEMBC(0x112) CASEMBC(0x114) CASEMBC(0x116) CASEMBC(0x118) 793 CASEMBC(0x11a) CASEMBC(0x1eba) CASEMBC(0x1ebc) 794 EMIT2('E'); EMIT2(0310); EMIT2(0311); EMIT2(0312); 795 EMIT2(0313); 796 EMITMBC(0x112) EMITMBC(0x114) EMITMBC(0x116) 797 EMITMBC(0x118) EMITMBC(0x11a) EMITMBC(0x1eba) 798 EMITMBC(0x1ebc) 799 return OK; 800 801 case 'F': CASEMBC(0x1e1e) 802 EMIT2('F'); EMITMBC(0x1e1e) 803 return OK; 804 805 case 'G': CASEMBC(0x11c) CASEMBC(0x11e) CASEMBC(0x120) 806 CASEMBC(0x122) CASEMBC(0x1e4) CASEMBC(0x1e6) CASEMBC(0x1f4) 807 CASEMBC(0x1e20) 808 EMIT2('G'); EMITMBC(0x11c) EMITMBC(0x11e) EMITMBC(0x120) 809 EMITMBC(0x122) EMITMBC(0x1e4) EMITMBC(0x1e6) 810 EMITMBC(0x1f4) EMITMBC(0x1e20) 811 return OK; 812 813 case 'H': CASEMBC(0x124) CASEMBC(0x126) CASEMBC(0x1e22) 814 CASEMBC(0x1e26) CASEMBC(0x1e28) 815 EMIT2('H'); EMITMBC(0x124) EMITMBC(0x126) EMITMBC(0x1e22) 816 EMITMBC(0x1e26) EMITMBC(0x1e28) 817 return OK; 818 819 case 'I': case 0314: case 0315: case 0316: case 0317: 820 CASEMBC(0x128) CASEMBC(0x12a) CASEMBC(0x12c) CASEMBC(0x12e) 821 CASEMBC(0x130) CASEMBC(0x1cf) CASEMBC(0x1ec8) 822 EMIT2('I'); EMIT2(0314); EMIT2(0315); EMIT2(0316); 823 EMIT2(0317); EMITMBC(0x128) EMITMBC(0x12a) 824 EMITMBC(0x12c) EMITMBC(0x12e) EMITMBC(0x130) 825 EMITMBC(0x1cf) EMITMBC(0x1ec8) 826 return OK; 827 828 case 'J': CASEMBC(0x134) 829 EMIT2('J'); EMITMBC(0x134) 830 return OK; 831 832 case 'K': CASEMBC(0x136) CASEMBC(0x1e8) CASEMBC(0x1e30) 833 CASEMBC(0x1e34) 834 EMIT2('K'); EMITMBC(0x136) EMITMBC(0x1e8) EMITMBC(0x1e30) 835 EMITMBC(0x1e34) 836 return OK; 837 838 case 'L': CASEMBC(0x139) CASEMBC(0x13b) CASEMBC(0x13d) 839 CASEMBC(0x13f) CASEMBC(0x141) CASEMBC(0x1e3a) 840 EMIT2('L'); EMITMBC(0x139) EMITMBC(0x13b) EMITMBC(0x13d) 841 EMITMBC(0x13f) EMITMBC(0x141) EMITMBC(0x1e3a) 842 return OK; 843 844 case 'M': CASEMBC(0x1e3e) CASEMBC(0x1e40) 845 EMIT2('M'); EMITMBC(0x1e3e) EMITMBC(0x1e40) 846 return OK; 847 848 case 'N': case 0321: 849 CASEMBC(0x143) CASEMBC(0x145) CASEMBC(0x147) CASEMBC(0x1e44) 850 CASEMBC(0x1e48) 851 EMIT2('N'); EMIT2(0321); EMITMBC(0x143) EMITMBC(0x145) 852 EMITMBC(0x147) EMITMBC(0x1e44) EMITMBC(0x1e48) 853 return OK; 854 855 case 'O': case 0322: case 0323: case 0324: case 0325: 856 case 0326: case 0330: 857 CASEMBC(0x14c) CASEMBC(0x14e) CASEMBC(0x150) CASEMBC(0x1a0) 858 CASEMBC(0x1d1) CASEMBC(0x1ea) CASEMBC(0x1ec) CASEMBC(0x1ece) 859 EMIT2('O'); EMIT2(0322); EMIT2(0323); EMIT2(0324); 860 EMIT2(0325); EMIT2(0326); EMIT2(0330); 861 EMITMBC(0x14c) EMITMBC(0x14e) EMITMBC(0x150) 862 EMITMBC(0x1a0) EMITMBC(0x1d1) EMITMBC(0x1ea) 863 EMITMBC(0x1ec) EMITMBC(0x1ece) 864 return OK; 865 866 case 'P': case 0x1e54: case 0x1e56: 867 EMIT2('P'); EMITMBC(0x1e54) EMITMBC(0x1e56) 868 return OK; 869 870 case 'R': CASEMBC(0x154) CASEMBC(0x156) CASEMBC(0x158) 871 CASEMBC(0x1e58) CASEMBC(0x1e5e) 872 EMIT2('R'); EMITMBC(0x154) EMITMBC(0x156) EMITMBC(0x158) 873 EMITMBC(0x1e58) EMITMBC(0x1e5e) 874 return OK; 875 876 case 'S': CASEMBC(0x15a) CASEMBC(0x15c) CASEMBC(0x15e) 877 CASEMBC(0x160) CASEMBC(0x1e60) 878 EMIT2('S'); EMITMBC(0x15a) EMITMBC(0x15c) EMITMBC(0x15e) 879 EMITMBC(0x160) EMITMBC(0x1e60) 880 return OK; 881 882 case 'T': CASEMBC(0x162) CASEMBC(0x164) CASEMBC(0x166) 883 CASEMBC(0x1e6a) CASEMBC(0x1e6e) 884 EMIT2('T'); EMITMBC(0x162) EMITMBC(0x164) EMITMBC(0x166) 885 EMITMBC(0x1e6a) EMITMBC(0x1e6e) 886 return OK; 887 888 case 'U': case 0331: case 0332: case 0333: case 0334: 889 CASEMBC(0x168) CASEMBC(0x16a) CASEMBC(0x16c) CASEMBC(0x16e) 890 CASEMBC(0x170) CASEMBC(0x172) CASEMBC(0x1af) CASEMBC(0x1d3) 891 CASEMBC(0x1ee6) 892 EMIT2('U'); EMIT2(0331); EMIT2(0332); EMIT2(0333); 893 EMIT2(0334); EMITMBC(0x168) EMITMBC(0x16a) 894 EMITMBC(0x16c) EMITMBC(0x16e) EMITMBC(0x170) 895 EMITMBC(0x172) EMITMBC(0x1af) EMITMBC(0x1d3) 896 EMITMBC(0x1ee6) 897 return OK; 898 899 case 'V': CASEMBC(0x1e7c) 900 EMIT2('V'); EMITMBC(0x1e7c) 901 return OK; 902 903 case 'W': CASEMBC(0x174) CASEMBC(0x1e80) CASEMBC(0x1e82) 904 CASEMBC(0x1e84) CASEMBC(0x1e86) 905 EMIT2('W'); EMITMBC(0x174) EMITMBC(0x1e80) EMITMBC(0x1e82) 906 EMITMBC(0x1e84) EMITMBC(0x1e86) 907 return OK; 908 909 case 'X': CASEMBC(0x1e8a) CASEMBC(0x1e8c) 910 EMIT2('X'); EMITMBC(0x1e8a) EMITMBC(0x1e8c) 911 return OK; 912 913 case 'Y': case 0335: 914 CASEMBC(0x176) CASEMBC(0x178) CASEMBC(0x1e8e) CASEMBC(0x1ef2) 915 CASEMBC(0x1ef6) CASEMBC(0x1ef8) 916 EMIT2('Y'); EMIT2(0335); EMITMBC(0x176) EMITMBC(0x178) 917 EMITMBC(0x1e8e) EMITMBC(0x1ef2) EMITMBC(0x1ef6) 918 EMITMBC(0x1ef8) 919 return OK; 920 921 case 'Z': CASEMBC(0x179) CASEMBC(0x17b) CASEMBC(0x17d) 922 CASEMBC(0x1b5) CASEMBC(0x1e90) CASEMBC(0x1e94) 923 EMIT2('Z'); EMITMBC(0x179) EMITMBC(0x17b) EMITMBC(0x17d) 924 EMITMBC(0x1b5) EMITMBC(0x1e90) EMITMBC(0x1e94) 925 return OK; 926 927 case 'a': case 0340: case 0341: case 0342: 928 case 0343: case 0344: case 0345: 929 CASEMBC(0x101) CASEMBC(0x103) CASEMBC(0x105) CASEMBC(0x1ce) 930 CASEMBC(0x1df) CASEMBC(0x1e1) CASEMBC(0x1ea3) 931 EMIT2('a'); EMIT2(0340); EMIT2(0341); EMIT2(0342); 932 EMIT2(0343); EMIT2(0344); EMIT2(0345); 933 EMITMBC(0x101) EMITMBC(0x103) EMITMBC(0x105) 934 EMITMBC(0x1ce) EMITMBC(0x1df) EMITMBC(0x1e1) 935 EMITMBC(0x1ea3) 936 return OK; 937 938 case 'b': CASEMBC(0x1e03) CASEMBC(0x1e07) 939 EMIT2('b'); EMITMBC(0x1e03) EMITMBC(0x1e07) 940 return OK; 941 942 case 'c': case 0347: 943 CASEMBC(0x107) CASEMBC(0x109) CASEMBC(0x10b) CASEMBC(0x10d) 944 EMIT2('c'); EMIT2(0347); EMITMBC(0x107) EMITMBC(0x109) 945 EMITMBC(0x10b) EMITMBC(0x10d) 946 return OK; 947 948 case 'd': CASEMBC(0x10f) CASEMBC(0x111) CASEMBC(0x1e0b) 949 CASEMBC(0x1e0f) CASEMBC(0x1e11) 950 EMIT2('d'); EMITMBC(0x10f) EMITMBC(0x111) 951 EMITMBC(0x1e0b) EMITMBC(0x1e0f) EMITMBC(0x1e11) 952 return OK; 953 954 case 'e': case 0350: case 0351: case 0352: case 0353: 955 CASEMBC(0x113) CASEMBC(0x115) CASEMBC(0x117) CASEMBC(0x119) 956 CASEMBC(0x11b) CASEMBC(0x1ebb) CASEMBC(0x1ebd) 957 EMIT2('e'); EMIT2(0350); EMIT2(0351); EMIT2(0352); 958 EMIT2(0353); EMITMBC(0x113) EMITMBC(0x115) 959 EMITMBC(0x117) EMITMBC(0x119) EMITMBC(0x11b) 960 EMITMBC(0x1ebb) EMITMBC(0x1ebd) 961 return OK; 962 963 case 'f': CASEMBC(0x1e1f) 964 EMIT2('f'); EMITMBC(0x1e1f) 965 return OK; 966 967 case 'g': CASEMBC(0x11d) CASEMBC(0x11f) CASEMBC(0x121) 968 CASEMBC(0x123) CASEMBC(0x1e5) CASEMBC(0x1e7) CASEMBC(0x1f5) 969 CASEMBC(0x1e21) 970 EMIT2('g'); EMITMBC(0x11d) EMITMBC(0x11f) EMITMBC(0x121) 971 EMITMBC(0x123) EMITMBC(0x1e5) EMITMBC(0x1e7) 972 EMITMBC(0x1f5) EMITMBC(0x1e21) 973 return OK; 974 975 case 'h': CASEMBC(0x125) CASEMBC(0x127) CASEMBC(0x1e23) 976 CASEMBC(0x1e27) CASEMBC(0x1e29) CASEMBC(0x1e96) 977 EMIT2('h'); EMITMBC(0x125) EMITMBC(0x127) EMITMBC(0x1e23) 978 EMITMBC(0x1e27) EMITMBC(0x1e29) EMITMBC(0x1e96) 979 return OK; 980 981 case 'i': case 0354: case 0355: case 0356: case 0357: 982 CASEMBC(0x129) CASEMBC(0x12b) CASEMBC(0x12d) CASEMBC(0x12f) 983 CASEMBC(0x1d0) CASEMBC(0x1ec9) 984 EMIT2('i'); EMIT2(0354); EMIT2(0355); EMIT2(0356); 985 EMIT2(0357); EMITMBC(0x129) EMITMBC(0x12b) 986 EMITMBC(0x12d) EMITMBC(0x12f) EMITMBC(0x1d0) 987 EMITMBC(0x1ec9) 988 return OK; 989 990 case 'j': CASEMBC(0x135) CASEMBC(0x1f0) 991 EMIT2('j'); EMITMBC(0x135) EMITMBC(0x1f0) 992 return OK; 993 994 case 'k': CASEMBC(0x137) CASEMBC(0x1e9) CASEMBC(0x1e31) 995 CASEMBC(0x1e35) 996 EMIT2('k'); EMITMBC(0x137) EMITMBC(0x1e9) EMITMBC(0x1e31) 997 EMITMBC(0x1e35) 998 return OK; 999 1000 case 'l': CASEMBC(0x13a) CASEMBC(0x13c) CASEMBC(0x13e) 1001 CASEMBC(0x140) CASEMBC(0x142) CASEMBC(0x1e3b) 1002 EMIT2('l'); EMITMBC(0x13a) EMITMBC(0x13c) EMITMBC(0x13e) 1003 EMITMBC(0x140) EMITMBC(0x142) EMITMBC(0x1e3b) 1004 return OK; 1005 1006 case 'm': CASEMBC(0x1e3f) CASEMBC(0x1e41) 1007 EMIT2('m'); EMITMBC(0x1e3f) EMITMBC(0x1e41) 1008 return OK; 1009 1010 case 'n': case 0361: 1011 CASEMBC(0x144) CASEMBC(0x146) CASEMBC(0x148) CASEMBC(0x149) 1012 CASEMBC(0x1e45) CASEMBC(0x1e49) 1013 EMIT2('n'); EMIT2(0361); EMITMBC(0x144) EMITMBC(0x146) 1014 EMITMBC(0x148) EMITMBC(0x149) EMITMBC(0x1e45) 1015 EMITMBC(0x1e49) 1016 return OK; 1017 1018 case 'o': case 0362: case 0363: case 0364: case 0365: 1019 case 0366: case 0370: 1020 CASEMBC(0x14d) CASEMBC(0x14f) CASEMBC(0x151) CASEMBC(0x1a1) 1021 CASEMBC(0x1d2) CASEMBC(0x1eb) CASEMBC(0x1ed) CASEMBC(0x1ecf) 1022 EMIT2('o'); EMIT2(0362); EMIT2(0363); EMIT2(0364); 1023 EMIT2(0365); EMIT2(0366); EMIT2(0370); 1024 EMITMBC(0x14d) EMITMBC(0x14f) EMITMBC(0x151) 1025 EMITMBC(0x1a1) EMITMBC(0x1d2) EMITMBC(0x1eb) 1026 EMITMBC(0x1ed) EMITMBC(0x1ecf) 1027 return OK; 1028 1029 case 'p': CASEMBC(0x1e55) CASEMBC(0x1e57) 1030 EMIT2('p'); EMITMBC(0x1e55) EMITMBC(0x1e57) 1031 return OK; 1032 1033 case 'r': CASEMBC(0x155) CASEMBC(0x157) CASEMBC(0x159) 1034 CASEMBC(0x1e59) CASEMBC(0x1e5f) 1035 EMIT2('r'); EMITMBC(0x155) EMITMBC(0x157) EMITMBC(0x159) 1036 EMITMBC(0x1e59) EMITMBC(0x1e5f) 1037 return OK; 1038 1039 case 's': CASEMBC(0x15b) CASEMBC(0x15d) CASEMBC(0x15f) 1040 CASEMBC(0x161) CASEMBC(0x1e61) 1041 EMIT2('s'); EMITMBC(0x15b) EMITMBC(0x15d) EMITMBC(0x15f) 1042 EMITMBC(0x161) EMITMBC(0x1e61) 1043 return OK; 1044 1045 case 't': CASEMBC(0x163) CASEMBC(0x165) CASEMBC(0x167) 1046 CASEMBC(0x1e6b) CASEMBC(0x1e6f) CASEMBC(0x1e97) 1047 EMIT2('t'); EMITMBC(0x163) EMITMBC(0x165) EMITMBC(0x167) 1048 EMITMBC(0x1e6b) EMITMBC(0x1e6f) EMITMBC(0x1e97) 1049 return OK; 1050 1051 case 'u': case 0371: case 0372: case 0373: case 0374: 1052 CASEMBC(0x169) CASEMBC(0x16b) CASEMBC(0x16d) CASEMBC(0x16f) 1053 CASEMBC(0x171) CASEMBC(0x173) CASEMBC(0x1b0) CASEMBC(0x1d4) 1054 CASEMBC(0x1ee7) 1055 EMIT2('u'); EMIT2(0371); EMIT2(0372); EMIT2(0373); 1056 EMIT2(0374); EMITMBC(0x169) EMITMBC(0x16b) 1057 EMITMBC(0x16d) EMITMBC(0x16f) EMITMBC(0x171) 1058 EMITMBC(0x173) EMITMBC(0x1b0) EMITMBC(0x1d4) 1059 EMITMBC(0x1ee7) 1060 return OK; 1061 1062 case 'v': CASEMBC(0x1e7d) 1063 EMIT2('v'); EMITMBC(0x1e7d) 1064 return OK; 1065 1066 case 'w': CASEMBC(0x175) CASEMBC(0x1e81) CASEMBC(0x1e83) 1067 CASEMBC(0x1e85) CASEMBC(0x1e87) CASEMBC(0x1e98) 1068 EMIT2('w'); EMITMBC(0x175) EMITMBC(0x1e81) EMITMBC(0x1e83) 1069 EMITMBC(0x1e85) EMITMBC(0x1e87) EMITMBC(0x1e98) 1070 return OK; 1071 1072 case 'x': CASEMBC(0x1e8b) CASEMBC(0x1e8d) 1073 EMIT2('x'); EMITMBC(0x1e8b) EMITMBC(0x1e8d) 1074 return OK; 1075 1076 case 'y': case 0375: case 0377: 1077 CASEMBC(0x177) CASEMBC(0x1e8f) CASEMBC(0x1e99) 1078 CASEMBC(0x1ef3) CASEMBC(0x1ef7) CASEMBC(0x1ef9) 1079 EMIT2('y'); EMIT2(0375); EMIT2(0377); EMITMBC(0x177) 1080 EMITMBC(0x1e8f) EMITMBC(0x1e99) EMITMBC(0x1ef3) 1081 EMITMBC(0x1ef7) EMITMBC(0x1ef9) 1082 return OK; 1083 1084 case 'z': CASEMBC(0x17a) CASEMBC(0x17c) CASEMBC(0x17e) 1085 CASEMBC(0x1b6) CASEMBC(0x1e91) CASEMBC(0x1e95) 1086 EMIT2('z'); EMITMBC(0x17a) EMITMBC(0x17c) EMITMBC(0x17e) 1087 EMITMBC(0x1b6) EMITMBC(0x1e91) EMITMBC(0x1e95) 1088 return OK; 1089 1090 /* default: character itself */ 1091 } 1092 } 1093 1094 EMIT2(c); 1095 return OK; 1096 #undef EMIT2 1097 #undef EMITMBC 1098 } 1099 1100 /* 1101 * Code to parse regular expression. 1102 * 1103 * We try to reuse parsing functions in regexp.c to 1104 * minimize surprise and keep the syntax consistent. 1105 */ 1106 1107 /* 1108 * Parse the lowest level. 1109 * 1110 * An atom can be one of a long list of items. Many atoms match one character 1111 * in the text. It is often an ordinary character or a character class. 1112 * Braces can be used to make a pattern into an atom. The "\z(\)" construct 1113 * is only for syntax highlighting. 1114 * 1115 * atom ::= ordinary-atom 1116 * or \( pattern \) 1117 * or \%( pattern \) 1118 * or \z( pattern \) 1119 */ 1120 static int 1121 nfa_regatom() 1122 { 1123 int c; 1124 int charclass; 1125 int equiclass; 1126 int collclass; 1127 int got_coll_char; 1128 char_u *p; 1129 char_u *endp; 1130 #ifdef FEAT_MBYTE 1131 char_u *old_regparse = regparse; 1132 #endif 1133 int extra = 0; 1134 int emit_range; 1135 int negated; 1136 int result; 1137 int startc = -1; 1138 int endc = -1; 1139 int oldstartc = -1; 1140 1141 c = getchr(); 1142 switch (c) 1143 { 1144 case NUL: 1145 EMSG_RET_FAIL(_(e_nul_found)); 1146 1147 case Magic('^'): 1148 EMIT(NFA_BOL); 1149 break; 1150 1151 case Magic('$'): 1152 EMIT(NFA_EOL); 1153 #if defined(FEAT_SYN_HL) || defined(PROTO) 1154 had_eol = TRUE; 1155 #endif 1156 break; 1157 1158 case Magic('<'): 1159 EMIT(NFA_BOW); 1160 break; 1161 1162 case Magic('>'): 1163 EMIT(NFA_EOW); 1164 break; 1165 1166 case Magic('_'): 1167 c = no_Magic(getchr()); 1168 if (c == NUL) 1169 EMSG_RET_FAIL(_(e_nul_found)); 1170 1171 if (c == '^') /* "\_^" is start-of-line */ 1172 { 1173 EMIT(NFA_BOL); 1174 break; 1175 } 1176 if (c == '$') /* "\_$" is end-of-line */ 1177 { 1178 EMIT(NFA_EOL); 1179 #if defined(FEAT_SYN_HL) || defined(PROTO) 1180 had_eol = TRUE; 1181 #endif 1182 break; 1183 } 1184 1185 extra = NFA_ADD_NL; 1186 1187 /* "\_[" is collection plus newline */ 1188 if (c == '[') 1189 goto collection; 1190 1191 /* "\_x" is character class plus newline */ 1192 /*FALLTHROUGH*/ 1193 1194 /* 1195 * Character classes. 1196 */ 1197 case Magic('.'): 1198 case Magic('i'): 1199 case Magic('I'): 1200 case Magic('k'): 1201 case Magic('K'): 1202 case Magic('f'): 1203 case Magic('F'): 1204 case Magic('p'): 1205 case Magic('P'): 1206 case Magic('s'): 1207 case Magic('S'): 1208 case Magic('d'): 1209 case Magic('D'): 1210 case Magic('x'): 1211 case Magic('X'): 1212 case Magic('o'): 1213 case Magic('O'): 1214 case Magic('w'): 1215 case Magic('W'): 1216 case Magic('h'): 1217 case Magic('H'): 1218 case Magic('a'): 1219 case Magic('A'): 1220 case Magic('l'): 1221 case Magic('L'): 1222 case Magic('u'): 1223 case Magic('U'): 1224 p = vim_strchr(classchars, no_Magic(c)); 1225 if (p == NULL) 1226 { 1227 if (extra == NFA_ADD_NL) 1228 { 1229 EMSGN(_(e_ill_char_class), c); 1230 rc_did_emsg = TRUE; 1231 return FAIL; 1232 } 1233 EMSGN("INTERNAL: Unknown character class char: %ld", c); 1234 return FAIL; 1235 } 1236 #ifdef FEAT_MBYTE 1237 /* When '.' is followed by a composing char ignore the dot, so that 1238 * the composing char is matched here. */ 1239 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr())) 1240 { 1241 old_regparse = regparse; 1242 c = getchr(); 1243 goto nfa_do_multibyte; 1244 } 1245 #endif 1246 EMIT(nfa_classcodes[p - classchars]); 1247 if (extra == NFA_ADD_NL) 1248 { 1249 EMIT(NFA_NEWL); 1250 EMIT(NFA_OR); 1251 regflags |= RF_HASNL; 1252 } 1253 break; 1254 1255 case Magic('n'): 1256 if (reg_string) 1257 /* In a string "\n" matches a newline character. */ 1258 EMIT(NL); 1259 else 1260 { 1261 /* In buffer text "\n" matches the end of a line. */ 1262 EMIT(NFA_NEWL); 1263 regflags |= RF_HASNL; 1264 } 1265 break; 1266 1267 case Magic('('): 1268 if (nfa_reg(REG_PAREN) == FAIL) 1269 return FAIL; /* cascaded error */ 1270 break; 1271 1272 case Magic('|'): 1273 case Magic('&'): 1274 case Magic(')'): 1275 EMSGN(_(e_misplaced), no_Magic(c)); 1276 return FAIL; 1277 1278 case Magic('='): 1279 case Magic('?'): 1280 case Magic('+'): 1281 case Magic('@'): 1282 case Magic('*'): 1283 case Magic('{'): 1284 /* these should follow an atom, not form an atom */ 1285 EMSGN(_(e_misplaced), no_Magic(c)); 1286 return FAIL; 1287 1288 case Magic('~'): 1289 { 1290 char_u *lp; 1291 1292 /* Previous substitute pattern. 1293 * Generated as "\%(pattern\)". */ 1294 if (reg_prev_sub == NULL) 1295 { 1296 EMSG(_(e_nopresub)); 1297 return FAIL; 1298 } 1299 for (lp = reg_prev_sub; *lp != NUL; mb_cptr_adv(lp)) 1300 { 1301 EMIT(PTR2CHAR(lp)); 1302 if (lp != reg_prev_sub) 1303 EMIT(NFA_CONCAT); 1304 } 1305 EMIT(NFA_NOPEN); 1306 break; 1307 } 1308 1309 case Magic('1'): 1310 case Magic('2'): 1311 case Magic('3'): 1312 case Magic('4'): 1313 case Magic('5'): 1314 case Magic('6'): 1315 case Magic('7'): 1316 case Magic('8'): 1317 case Magic('9'): 1318 EMIT(NFA_BACKREF1 + (no_Magic(c) - '1')); 1319 nfa_has_backref = TRUE; 1320 break; 1321 1322 case Magic('z'): 1323 c = no_Magic(getchr()); 1324 switch (c) 1325 { 1326 case 's': 1327 EMIT(NFA_ZSTART); 1328 if (re_mult_next("\\zs") == FAIL) 1329 return FAIL; 1330 break; 1331 case 'e': 1332 EMIT(NFA_ZEND); 1333 nfa_has_zend = TRUE; 1334 if (re_mult_next("\\ze") == FAIL) 1335 return FAIL; 1336 break; 1337 #ifdef FEAT_SYN_HL 1338 case '1': 1339 case '2': 1340 case '3': 1341 case '4': 1342 case '5': 1343 case '6': 1344 case '7': 1345 case '8': 1346 case '9': 1347 /* \z1...\z9 */ 1348 if (reg_do_extmatch != REX_USE) 1349 EMSG_RET_FAIL(_(e_z1_not_allowed)); 1350 EMIT(NFA_ZREF1 + (no_Magic(c) - '1')); 1351 /* No need to set nfa_has_backref, the sub-matches don't 1352 * change when \z1 .. \z9 matches or not. */ 1353 re_has_z = REX_USE; 1354 break; 1355 case '(': 1356 /* \z( */ 1357 if (reg_do_extmatch != REX_SET) 1358 EMSG_RET_FAIL(_(e_z_not_allowed)); 1359 if (nfa_reg(REG_ZPAREN) == FAIL) 1360 return FAIL; /* cascaded error */ 1361 re_has_z = REX_SET; 1362 break; 1363 #endif 1364 default: 1365 EMSGN(_("E867: (NFA) Unknown operator '\\z%c'"), 1366 no_Magic(c)); 1367 return FAIL; 1368 } 1369 break; 1370 1371 case Magic('%'): 1372 c = no_Magic(getchr()); 1373 switch (c) 1374 { 1375 /* () without a back reference */ 1376 case '(': 1377 if (nfa_reg(REG_NPAREN) == FAIL) 1378 return FAIL; 1379 EMIT(NFA_NOPEN); 1380 break; 1381 1382 case 'd': /* %d123 decimal */ 1383 case 'o': /* %o123 octal */ 1384 case 'x': /* %xab hex 2 */ 1385 case 'u': /* %uabcd hex 4 */ 1386 case 'U': /* %U1234abcd hex 8 */ 1387 { 1388 int nr; 1389 1390 switch (c) 1391 { 1392 case 'd': nr = getdecchrs(); break; 1393 case 'o': nr = getoctchrs(); break; 1394 case 'x': nr = gethexchrs(2); break; 1395 case 'u': nr = gethexchrs(4); break; 1396 case 'U': nr = gethexchrs(8); break; 1397 default: nr = -1; break; 1398 } 1399 1400 if (nr < 0) 1401 EMSG2_RET_FAIL( 1402 _("E678: Invalid character after %s%%[dxouU]"), 1403 reg_magic == MAGIC_ALL); 1404 /* A NUL is stored in the text as NL */ 1405 /* TODO: what if a composing character follows? */ 1406 EMIT(nr == 0 ? 0x0a : nr); 1407 } 1408 break; 1409 1410 /* Catch \%^ and \%$ regardless of where they appear in the 1411 * pattern -- regardless of whether or not it makes sense. */ 1412 case '^': 1413 EMIT(NFA_BOF); 1414 break; 1415 1416 case '$': 1417 EMIT(NFA_EOF); 1418 break; 1419 1420 case '#': 1421 EMIT(NFA_CURSOR); 1422 break; 1423 1424 case 'V': 1425 EMIT(NFA_VISUAL); 1426 break; 1427 1428 case 'C': 1429 EMIT(NFA_ANY_COMPOSING); 1430 break; 1431 1432 case '[': 1433 { 1434 int n; 1435 1436 /* \%[abc] */ 1437 for (n = 0; (c = peekchr()) != ']'; ++n) 1438 { 1439 if (c == NUL) 1440 EMSG2_RET_FAIL(_(e_missing_sb), 1441 reg_magic == MAGIC_ALL); 1442 /* recursive call! */ 1443 if (nfa_regatom() == FAIL) 1444 return FAIL; 1445 } 1446 getchr(); /* get the ] */ 1447 if (n == 0) 1448 EMSG2_RET_FAIL(_(e_empty_sb), 1449 reg_magic == MAGIC_ALL); 1450 EMIT(NFA_OPT_CHARS); 1451 EMIT(n); 1452 1453 /* Emit as "\%(\%[abc]\)" to be able to handle 1454 * "\%[abc]*" which would cause the empty string to be 1455 * matched an unlimited number of times. NFA_NOPEN is 1456 * added only once at a position, while NFA_SPLIT is 1457 * added multiple times. This is more efficient than 1458 * not allowing NFA_SPLIT multiple times, it is used 1459 * a lot. */ 1460 EMIT(NFA_NOPEN); 1461 break; 1462 } 1463 1464 default: 1465 { 1466 int n = 0; 1467 int cmp = c; 1468 1469 if (c == '<' || c == '>') 1470 c = getchr(); 1471 while (VIM_ISDIGIT(c)) 1472 { 1473 n = n * 10 + (c - '0'); 1474 c = getchr(); 1475 } 1476 if (c == 'l' || c == 'c' || c == 'v') 1477 { 1478 if (c == 'l') 1479 /* \%{n}l \%{n}<l \%{n}>l */ 1480 EMIT(cmp == '<' ? NFA_LNUM_LT : 1481 cmp == '>' ? NFA_LNUM_GT : NFA_LNUM); 1482 else if (c == 'c') 1483 /* \%{n}c \%{n}<c \%{n}>c */ 1484 EMIT(cmp == '<' ? NFA_COL_LT : 1485 cmp == '>' ? NFA_COL_GT : NFA_COL); 1486 else 1487 /* \%{n}v \%{n}<v \%{n}>v */ 1488 EMIT(cmp == '<' ? NFA_VCOL_LT : 1489 cmp == '>' ? NFA_VCOL_GT : NFA_VCOL); 1490 EMIT(n); 1491 break; 1492 } 1493 else if (c == '\'' && n == 0) 1494 { 1495 /* \%'m \%<'m \%>'m */ 1496 EMIT(cmp == '<' ? NFA_MARK_LT : 1497 cmp == '>' ? NFA_MARK_GT : NFA_MARK); 1498 EMIT(getchr()); 1499 break; 1500 } 1501 } 1502 EMSGN(_("E867: (NFA) Unknown operator '\\%%%c'"), 1503 no_Magic(c)); 1504 return FAIL; 1505 } 1506 break; 1507 1508 case Magic('['): 1509 collection: 1510 /* 1511 * [abc] uses NFA_START_COLL - NFA_END_COLL 1512 * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL 1513 * Each character is produced as a regular state, using 1514 * NFA_CONCAT to bind them together. 1515 * Besides normal characters there can be: 1516 * - character classes NFA_CLASS_* 1517 * - ranges, two characters followed by NFA_RANGE. 1518 */ 1519 1520 p = regparse; 1521 endp = skip_anyof(p); 1522 if (*endp == ']') 1523 { 1524 /* 1525 * Try to reverse engineer character classes. For example, 1526 * recognize that [0-9] stands for \d and [A-Za-z_] for \h, 1527 * and perform the necessary substitutions in the NFA. 1528 */ 1529 result = nfa_recognize_char_class(regparse, endp, 1530 extra == NFA_ADD_NL); 1531 if (result != FAIL) 1532 { 1533 if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL) 1534 { 1535 EMIT(result - NFA_ADD_NL); 1536 EMIT(NFA_NEWL); 1537 EMIT(NFA_OR); 1538 } 1539 else 1540 EMIT(result); 1541 regparse = endp; 1542 mb_ptr_adv(regparse); 1543 return OK; 1544 } 1545 /* 1546 * Failed to recognize a character class. Use the simple 1547 * version that turns [abc] into 'a' OR 'b' OR 'c' 1548 */ 1549 startc = endc = oldstartc = -1; 1550 negated = FALSE; 1551 if (*regparse == '^') /* negated range */ 1552 { 1553 negated = TRUE; 1554 mb_ptr_adv(regparse); 1555 EMIT(NFA_START_NEG_COLL); 1556 } 1557 else 1558 EMIT(NFA_START_COLL); 1559 if (*regparse == '-') 1560 { 1561 startc = '-'; 1562 EMIT(startc); 1563 EMIT(NFA_CONCAT); 1564 mb_ptr_adv(regparse); 1565 } 1566 /* Emit the OR branches for each character in the [] */ 1567 emit_range = FALSE; 1568 while (regparse < endp) 1569 { 1570 oldstartc = startc; 1571 startc = -1; 1572 got_coll_char = FALSE; 1573 if (*regparse == '[') 1574 { 1575 /* Check for [: :], [= =], [. .] */ 1576 equiclass = collclass = 0; 1577 charclass = get_char_class(®parse); 1578 if (charclass == CLASS_NONE) 1579 { 1580 equiclass = get_equi_class(®parse); 1581 if (equiclass == 0) 1582 collclass = get_coll_element(®parse); 1583 } 1584 1585 /* Character class like [:alpha:] */ 1586 if (charclass != CLASS_NONE) 1587 { 1588 switch (charclass) 1589 { 1590 case CLASS_ALNUM: 1591 EMIT(NFA_CLASS_ALNUM); 1592 break; 1593 case CLASS_ALPHA: 1594 EMIT(NFA_CLASS_ALPHA); 1595 break; 1596 case CLASS_BLANK: 1597 EMIT(NFA_CLASS_BLANK); 1598 break; 1599 case CLASS_CNTRL: 1600 EMIT(NFA_CLASS_CNTRL); 1601 break; 1602 case CLASS_DIGIT: 1603 EMIT(NFA_CLASS_DIGIT); 1604 break; 1605 case CLASS_GRAPH: 1606 EMIT(NFA_CLASS_GRAPH); 1607 break; 1608 case CLASS_LOWER: 1609 EMIT(NFA_CLASS_LOWER); 1610 break; 1611 case CLASS_PRINT: 1612 EMIT(NFA_CLASS_PRINT); 1613 break; 1614 case CLASS_PUNCT: 1615 EMIT(NFA_CLASS_PUNCT); 1616 break; 1617 case CLASS_SPACE: 1618 EMIT(NFA_CLASS_SPACE); 1619 break; 1620 case CLASS_UPPER: 1621 EMIT(NFA_CLASS_UPPER); 1622 break; 1623 case CLASS_XDIGIT: 1624 EMIT(NFA_CLASS_XDIGIT); 1625 break; 1626 case CLASS_TAB: 1627 EMIT(NFA_CLASS_TAB); 1628 break; 1629 case CLASS_RETURN: 1630 EMIT(NFA_CLASS_RETURN); 1631 break; 1632 case CLASS_BACKSPACE: 1633 EMIT(NFA_CLASS_BACKSPACE); 1634 break; 1635 case CLASS_ESCAPE: 1636 EMIT(NFA_CLASS_ESCAPE); 1637 break; 1638 } 1639 EMIT(NFA_CONCAT); 1640 continue; 1641 } 1642 /* Try equivalence class [=a=] and the like */ 1643 if (equiclass != 0) 1644 { 1645 result = nfa_emit_equi_class(equiclass); 1646 if (result == FAIL) 1647 { 1648 /* should never happen */ 1649 EMSG_RET_FAIL(_("E868: Error building NFA with equivalence class!")); 1650 } 1651 continue; 1652 } 1653 /* Try collating class like [. .] */ 1654 if (collclass != 0) 1655 { 1656 startc = collclass; /* allow [.a.]-x as a range */ 1657 /* Will emit the proper atom at the end of the 1658 * while loop. */ 1659 } 1660 } 1661 /* Try a range like 'a-x' or '\t-z'. Also allows '-' as a 1662 * start character. */ 1663 if (*regparse == '-' && oldstartc != -1) 1664 { 1665 emit_range = TRUE; 1666 startc = oldstartc; 1667 mb_ptr_adv(regparse); 1668 continue; /* reading the end of the range */ 1669 } 1670 1671 /* Now handle simple and escaped characters. 1672 * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim 1673 * accepts "\t", "\e", etc., but only when the 'l' flag in 1674 * 'cpoptions' is not included. 1675 * Posix doesn't recognize backslash at all. 1676 */ 1677 if (*regparse == '\\' 1678 && !reg_cpo_bsl 1679 && regparse + 1 <= endp 1680 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL 1681 || (!reg_cpo_lit 1682 && vim_strchr(REGEXP_ABBR, regparse[1]) 1683 != NULL) 1684 ) 1685 ) 1686 { 1687 mb_ptr_adv(regparse); 1688 1689 if (*regparse == 'n') 1690 startc = reg_string ? NL : NFA_NEWL; 1691 else 1692 if (*regparse == 'd' 1693 || *regparse == 'o' 1694 || *regparse == 'x' 1695 || *regparse == 'u' 1696 || *regparse == 'U' 1697 ) 1698 { 1699 /* TODO(RE) This needs more testing */ 1700 startc = coll_get_char(); 1701 got_coll_char = TRUE; 1702 mb_ptr_back(old_regparse, regparse); 1703 } 1704 else 1705 { 1706 /* \r,\t,\e,\b */ 1707 startc = backslash_trans(*regparse); 1708 } 1709 } 1710 1711 /* Normal printable char */ 1712 if (startc == -1) 1713 startc = PTR2CHAR(regparse); 1714 1715 /* Previous char was '-', so this char is end of range. */ 1716 if (emit_range) 1717 { 1718 endc = startc; 1719 startc = oldstartc; 1720 if (startc > endc) 1721 EMSG_RET_FAIL(_(e_invrange)); 1722 1723 if (endc > startc + 2) 1724 { 1725 /* Emit a range instead of the sequence of 1726 * individual characters. */ 1727 if (startc == 0) 1728 /* \x00 is translated to \x0a, start at \x01. */ 1729 EMIT(1); 1730 else 1731 --post_ptr; /* remove NFA_CONCAT */ 1732 EMIT(endc); 1733 EMIT(NFA_RANGE); 1734 EMIT(NFA_CONCAT); 1735 } 1736 else 1737 #ifdef FEAT_MBYTE 1738 if (has_mbyte && ((*mb_char2len)(startc) > 1 1739 || (*mb_char2len)(endc) > 1)) 1740 { 1741 /* Emit the characters in the range. 1742 * "startc" was already emitted, so skip it. 1743 * */ 1744 for (c = startc + 1; c <= endc; c++) 1745 { 1746 EMIT(c); 1747 EMIT(NFA_CONCAT); 1748 } 1749 } 1750 else 1751 #endif 1752 { 1753 #ifdef EBCDIC 1754 int alpha_only = FALSE; 1755 1756 /* for alphabetical range skip the gaps 1757 * 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'. */ 1758 if (isalpha(startc) && isalpha(endc)) 1759 alpha_only = TRUE; 1760 #endif 1761 /* Emit the range. "startc" was already emitted, so 1762 * skip it. */ 1763 for (c = startc + 1; c <= endc; c++) 1764 #ifdef EBCDIC 1765 if (!alpha_only || isalpha(startc)) 1766 #endif 1767 { 1768 EMIT(c); 1769 EMIT(NFA_CONCAT); 1770 } 1771 } 1772 emit_range = FALSE; 1773 startc = -1; 1774 } 1775 else 1776 { 1777 /* This char (startc) is not part of a range. Just 1778 * emit it. 1779 * Normally, simply emit startc. But if we get char 1780 * code=0 from a collating char, then replace it with 1781 * 0x0a. 1782 * This is needed to completely mimic the behaviour of 1783 * the backtracking engine. */ 1784 if (startc == NFA_NEWL) 1785 { 1786 /* Line break can't be matched as part of the 1787 * collection, add an OR below. But not for negated 1788 * range. */ 1789 if (!negated) 1790 extra = NFA_ADD_NL; 1791 } 1792 else 1793 { 1794 if (got_coll_char == TRUE && startc == 0) 1795 EMIT(0x0a); 1796 else 1797 EMIT(startc); 1798 EMIT(NFA_CONCAT); 1799 } 1800 } 1801 1802 mb_ptr_adv(regparse); 1803 } /* while (p < endp) */ 1804 1805 mb_ptr_back(old_regparse, regparse); 1806 if (*regparse == '-') /* if last, '-' is just a char */ 1807 { 1808 EMIT('-'); 1809 EMIT(NFA_CONCAT); 1810 } 1811 1812 /* skip the trailing ] */ 1813 regparse = endp; 1814 mb_ptr_adv(regparse); 1815 1816 /* Mark end of the collection. */ 1817 if (negated == TRUE) 1818 EMIT(NFA_END_NEG_COLL); 1819 else 1820 EMIT(NFA_END_COLL); 1821 1822 /* \_[] also matches \n but it's not negated */ 1823 if (extra == NFA_ADD_NL) 1824 { 1825 EMIT(reg_string ? NL : NFA_NEWL); 1826 EMIT(NFA_OR); 1827 } 1828 1829 return OK; 1830 } /* if exists closing ] */ 1831 1832 if (reg_strict) 1833 EMSG_RET_FAIL(_(e_missingbracket)); 1834 /* FALLTHROUGH */ 1835 1836 default: 1837 { 1838 #ifdef FEAT_MBYTE 1839 int plen; 1840 1841 nfa_do_multibyte: 1842 /* plen is length of current char with composing chars */ 1843 if (enc_utf8 && ((*mb_char2len)(c) 1844 != (plen = (*mb_ptr2len)(old_regparse)) 1845 || utf_iscomposing(c))) 1846 { 1847 int i = 0; 1848 1849 /* A base character plus composing characters, or just one 1850 * or more composing characters. 1851 * This requires creating a separate atom as if enclosing 1852 * the characters in (), where NFA_COMPOSING is the ( and 1853 * NFA_END_COMPOSING is the ). Note that right now we are 1854 * building the postfix form, not the NFA itself; 1855 * a composing char could be: a, b, c, NFA_COMPOSING 1856 * where 'b' and 'c' are chars with codes > 256. */ 1857 for (;;) 1858 { 1859 EMIT(c); 1860 if (i > 0) 1861 EMIT(NFA_CONCAT); 1862 if ((i += utf_char2len(c)) >= plen) 1863 break; 1864 c = utf_ptr2char(old_regparse + i); 1865 } 1866 EMIT(NFA_COMPOSING); 1867 regparse = old_regparse + plen; 1868 } 1869 else 1870 #endif 1871 { 1872 c = no_Magic(c); 1873 EMIT(c); 1874 } 1875 return OK; 1876 } 1877 } 1878 1879 return OK; 1880 } 1881 1882 /* 1883 * Parse something followed by possible [*+=]. 1884 * 1885 * A piece is an atom, possibly followed by a multi, an indication of how many 1886 * times the atom can be matched. Example: "a*" matches any sequence of "a" 1887 * characters: "", "a", "aa", etc. 1888 * 1889 * piece ::= atom 1890 * or atom multi 1891 */ 1892 static int 1893 nfa_regpiece() 1894 { 1895 int i; 1896 int op; 1897 int ret; 1898 long minval, maxval; 1899 int greedy = TRUE; /* Braces are prefixed with '-' ? */ 1900 parse_state_T old_state; 1901 parse_state_T new_state; 1902 int c2; 1903 int old_post_pos; 1904 int my_post_start; 1905 int quest; 1906 1907 /* Save the current parse state, so that we can use it if <atom>{m,n} is 1908 * next. */ 1909 save_parse_state(&old_state); 1910 1911 /* store current pos in the postfix form, for \{m,n} involving 0s */ 1912 my_post_start = (int)(post_ptr - post_start); 1913 1914 ret = nfa_regatom(); 1915 if (ret == FAIL) 1916 return FAIL; /* cascaded error */ 1917 1918 op = peekchr(); 1919 if (re_multi_type(op) == NOT_MULTI) 1920 return OK; 1921 1922 skipchr(); 1923 switch (op) 1924 { 1925 case Magic('*'): 1926 EMIT(NFA_STAR); 1927 break; 1928 1929 case Magic('+'): 1930 /* 1931 * Trick: Normally, (a*)\+ would match the whole input "aaa". The 1932 * first and only submatch would be "aaa". But the backtracking 1933 * engine interprets the plus as "try matching one more time", and 1934 * a* matches a second time at the end of the input, the empty 1935 * string. 1936 * The submatch will be the empty string. 1937 * 1938 * In order to be consistent with the old engine, we replace 1939 * <atom>+ with <atom><atom>* 1940 */ 1941 restore_parse_state(&old_state); 1942 curchr = -1; 1943 if (nfa_regatom() == FAIL) 1944 return FAIL; 1945 EMIT(NFA_STAR); 1946 EMIT(NFA_CONCAT); 1947 skipchr(); /* skip the \+ */ 1948 break; 1949 1950 case Magic('@'): 1951 c2 = getdecchrs(); 1952 op = no_Magic(getchr()); 1953 i = 0; 1954 switch(op) 1955 { 1956 case '=': 1957 /* \@= */ 1958 i = NFA_PREV_ATOM_NO_WIDTH; 1959 break; 1960 case '!': 1961 /* \@! */ 1962 i = NFA_PREV_ATOM_NO_WIDTH_NEG; 1963 break; 1964 case '<': 1965 op = no_Magic(getchr()); 1966 if (op == '=') 1967 /* \@<= */ 1968 i = NFA_PREV_ATOM_JUST_BEFORE; 1969 else if (op == '!') 1970 /* \@<! */ 1971 i = NFA_PREV_ATOM_JUST_BEFORE_NEG; 1972 break; 1973 case '>': 1974 /* \@> */ 1975 i = NFA_PREV_ATOM_LIKE_PATTERN; 1976 break; 1977 } 1978 if (i == 0) 1979 { 1980 EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op); 1981 return FAIL; 1982 } 1983 EMIT(i); 1984 if (i == NFA_PREV_ATOM_JUST_BEFORE 1985 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG) 1986 EMIT(c2); 1987 break; 1988 1989 case Magic('?'): 1990 case Magic('='): 1991 EMIT(NFA_QUEST); 1992 break; 1993 1994 case Magic('{'): 1995 /* a{2,5} will expand to 'aaa?a?a?' 1996 * a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy 1997 * version of '?' 1998 * \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the 1999 * parenthesis have the same id 2000 */ 2001 2002 greedy = TRUE; 2003 c2 = peekchr(); 2004 if (c2 == '-' || c2 == Magic('-')) 2005 { 2006 skipchr(); 2007 greedy = FALSE; 2008 } 2009 if (!read_limits(&minval, &maxval)) 2010 EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits")); 2011 2012 /* <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to 2013 * <atom>* */ 2014 if (minval == 0 && maxval == MAX_LIMIT) 2015 { 2016 if (greedy) /* { { (match the braces) */ 2017 /* \{}, \{0,} */ 2018 EMIT(NFA_STAR); 2019 else /* { { (match the braces) */ 2020 /* \{-}, \{-0,} */ 2021 EMIT(NFA_STAR_NONGREEDY); 2022 break; 2023 } 2024 2025 /* Special case: x{0} or x{-0} */ 2026 if (maxval == 0) 2027 { 2028 /* Ignore result of previous call to nfa_regatom() */ 2029 post_ptr = post_start + my_post_start; 2030 /* NFA_EMPTY is 0-length and works everywhere */ 2031 EMIT(NFA_EMPTY); 2032 return OK; 2033 } 2034 2035 /* The engine is very inefficient (uses too many states) when the 2036 * maximum is much larger than the minimum and when the maximum is 2037 * large. Bail out if we can use the other engine. */ 2038 if ((nfa_re_flags & RE_AUTO) 2039 && (maxval > minval + 200 || maxval > 500)) 2040 return FAIL; 2041 2042 /* Ignore previous call to nfa_regatom() */ 2043 post_ptr = post_start + my_post_start; 2044 /* Save parse state after the repeated atom and the \{} */ 2045 save_parse_state(&new_state); 2046 2047 quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY); 2048 for (i = 0; i < maxval; i++) 2049 { 2050 /* Goto beginning of the repeated atom */ 2051 restore_parse_state(&old_state); 2052 old_post_pos = (int)(post_ptr - post_start); 2053 if (nfa_regatom() == FAIL) 2054 return FAIL; 2055 /* after "minval" times, atoms are optional */ 2056 if (i + 1 > minval) 2057 { 2058 if (maxval == MAX_LIMIT) 2059 { 2060 if (greedy) 2061 EMIT(NFA_STAR); 2062 else 2063 EMIT(NFA_STAR_NONGREEDY); 2064 } 2065 else 2066 EMIT(quest); 2067 } 2068 if (old_post_pos != my_post_start) 2069 EMIT(NFA_CONCAT); 2070 if (i + 1 > minval && maxval == MAX_LIMIT) 2071 break; 2072 } 2073 2074 /* Go to just after the repeated atom and the \{} */ 2075 restore_parse_state(&new_state); 2076 curchr = -1; 2077 2078 break; 2079 2080 2081 default: 2082 break; 2083 } /* end switch */ 2084 2085 if (re_multi_type(peekchr()) != NOT_MULTI) 2086 /* Can't have a multi follow a multi. */ 2087 EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi !")); 2088 2089 return OK; 2090 } 2091 2092 /* 2093 * Parse one or more pieces, concatenated. It matches a match for the 2094 * first piece, followed by a match for the second piece, etc. Example: 2095 * "f[0-9]b", first matches "f", then a digit and then "b". 2096 * 2097 * concat ::= piece 2098 * or piece piece 2099 * or piece piece piece 2100 * etc. 2101 */ 2102 static int 2103 nfa_regconcat() 2104 { 2105 int cont = TRUE; 2106 int first = TRUE; 2107 2108 while (cont) 2109 { 2110 switch (peekchr()) 2111 { 2112 case NUL: 2113 case Magic('|'): 2114 case Magic('&'): 2115 case Magic(')'): 2116 cont = FALSE; 2117 break; 2118 2119 case Magic('Z'): 2120 #ifdef FEAT_MBYTE 2121 regflags |= RF_ICOMBINE; 2122 #endif 2123 skipchr_keepstart(); 2124 break; 2125 case Magic('c'): 2126 regflags |= RF_ICASE; 2127 skipchr_keepstart(); 2128 break; 2129 case Magic('C'): 2130 regflags |= RF_NOICASE; 2131 skipchr_keepstart(); 2132 break; 2133 case Magic('v'): 2134 reg_magic = MAGIC_ALL; 2135 skipchr_keepstart(); 2136 curchr = -1; 2137 break; 2138 case Magic('m'): 2139 reg_magic = MAGIC_ON; 2140 skipchr_keepstart(); 2141 curchr = -1; 2142 break; 2143 case Magic('M'): 2144 reg_magic = MAGIC_OFF; 2145 skipchr_keepstart(); 2146 curchr = -1; 2147 break; 2148 case Magic('V'): 2149 reg_magic = MAGIC_NONE; 2150 skipchr_keepstart(); 2151 curchr = -1; 2152 break; 2153 2154 default: 2155 if (nfa_regpiece() == FAIL) 2156 return FAIL; 2157 if (first == FALSE) 2158 EMIT(NFA_CONCAT); 2159 else 2160 first = FALSE; 2161 break; 2162 } 2163 } 2164 2165 return OK; 2166 } 2167 2168 /* 2169 * Parse a branch, one or more concats, separated by "\&". It matches the 2170 * last concat, but only if all the preceding concats also match at the same 2171 * position. Examples: 2172 * "foobeep\&..." matches "foo" in "foobeep". 2173 * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob" 2174 * 2175 * branch ::= concat 2176 * or concat \& concat 2177 * or concat \& concat \& concat 2178 * etc. 2179 */ 2180 static int 2181 nfa_regbranch() 2182 { 2183 int ch; 2184 int old_post_pos; 2185 2186 old_post_pos = (int)(post_ptr - post_start); 2187 2188 /* First branch, possibly the only one */ 2189 if (nfa_regconcat() == FAIL) 2190 return FAIL; 2191 2192 ch = peekchr(); 2193 /* Try next concats */ 2194 while (ch == Magic('&')) 2195 { 2196 skipchr(); 2197 EMIT(NFA_NOPEN); 2198 EMIT(NFA_PREV_ATOM_NO_WIDTH); 2199 old_post_pos = (int)(post_ptr - post_start); 2200 if (nfa_regconcat() == FAIL) 2201 return FAIL; 2202 /* if concat is empty do emit a node */ 2203 if (old_post_pos == (int)(post_ptr - post_start)) 2204 EMIT(NFA_EMPTY); 2205 EMIT(NFA_CONCAT); 2206 ch = peekchr(); 2207 } 2208 2209 /* if a branch is empty, emit one node for it */ 2210 if (old_post_pos == (int)(post_ptr - post_start)) 2211 EMIT(NFA_EMPTY); 2212 2213 return OK; 2214 } 2215 2216 /* 2217 * Parse a pattern, one or more branches, separated by "\|". It matches 2218 * anything that matches one of the branches. Example: "foo\|beep" matches 2219 * "foo" and matches "beep". If more than one branch matches, the first one 2220 * is used. 2221 * 2222 * pattern ::= branch 2223 * or branch \| branch 2224 * or branch \| branch \| branch 2225 * etc. 2226 */ 2227 static int 2228 nfa_reg(paren) 2229 int paren; /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */ 2230 { 2231 int parno = 0; 2232 2233 if (paren == REG_PAREN) 2234 { 2235 if (regnpar >= NSUBEXP) /* Too many `(' */ 2236 EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('")); 2237 parno = regnpar++; 2238 } 2239 #ifdef FEAT_SYN_HL 2240 else if (paren == REG_ZPAREN) 2241 { 2242 /* Make a ZOPEN node. */ 2243 if (regnzpar >= NSUBEXP) 2244 EMSG_RET_FAIL(_("E879: (NFA regexp) Too many \\z(")); 2245 parno = regnzpar++; 2246 } 2247 #endif 2248 2249 if (nfa_regbranch() == FAIL) 2250 return FAIL; /* cascaded error */ 2251 2252 while (peekchr() == Magic('|')) 2253 { 2254 skipchr(); 2255 if (nfa_regbranch() == FAIL) 2256 return FAIL; /* cascaded error */ 2257 EMIT(NFA_OR); 2258 } 2259 2260 /* Check for proper termination. */ 2261 if (paren != REG_NOPAREN && getchr() != Magic(')')) 2262 { 2263 if (paren == REG_NPAREN) 2264 EMSG2_RET_FAIL(_(e_unmatchedpp), reg_magic == MAGIC_ALL); 2265 else 2266 EMSG2_RET_FAIL(_(e_unmatchedp), reg_magic == MAGIC_ALL); 2267 } 2268 else if (paren == REG_NOPAREN && peekchr() != NUL) 2269 { 2270 if (peekchr() == Magic(')')) 2271 EMSG2_RET_FAIL(_(e_unmatchedpar), reg_magic == MAGIC_ALL); 2272 else 2273 EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error")); 2274 } 2275 /* 2276 * Here we set the flag allowing back references to this set of 2277 * parentheses. 2278 */ 2279 if (paren == REG_PAREN) 2280 { 2281 had_endbrace[parno] = TRUE; /* have seen the close paren */ 2282 EMIT(NFA_MOPEN + parno); 2283 } 2284 #ifdef FEAT_SYN_HL 2285 else if (paren == REG_ZPAREN) 2286 EMIT(NFA_ZOPEN + parno); 2287 #endif 2288 2289 return OK; 2290 } 2291 2292 #ifdef DEBUG 2293 static char_u code[50]; 2294 2295 static void 2296 nfa_set_code(c) 2297 int c; 2298 { 2299 int addnl = FALSE; 2300 2301 if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL) 2302 { 2303 addnl = TRUE; 2304 c -= NFA_ADD_NL; 2305 } 2306 2307 STRCPY(code, ""); 2308 switch (c) 2309 { 2310 case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break; 2311 case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break; 2312 case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break; 2313 case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break; 2314 case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break; 2315 case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break; 2316 2317 case NFA_BACKREF1: STRCPY(code, "NFA_BACKREF1"); break; 2318 case NFA_BACKREF2: STRCPY(code, "NFA_BACKREF2"); break; 2319 case NFA_BACKREF3: STRCPY(code, "NFA_BACKREF3"); break; 2320 case NFA_BACKREF4: STRCPY(code, "NFA_BACKREF4"); break; 2321 case NFA_BACKREF5: STRCPY(code, "NFA_BACKREF5"); break; 2322 case NFA_BACKREF6: STRCPY(code, "NFA_BACKREF6"); break; 2323 case NFA_BACKREF7: STRCPY(code, "NFA_BACKREF7"); break; 2324 case NFA_BACKREF8: STRCPY(code, "NFA_BACKREF8"); break; 2325 case NFA_BACKREF9: STRCPY(code, "NFA_BACKREF9"); break; 2326 #ifdef FEAT_SYN_HL 2327 case NFA_ZREF1: STRCPY(code, "NFA_ZREF1"); break; 2328 case NFA_ZREF2: STRCPY(code, "NFA_ZREF2"); break; 2329 case NFA_ZREF3: STRCPY(code, "NFA_ZREF3"); break; 2330 case NFA_ZREF4: STRCPY(code, "NFA_ZREF4"); break; 2331 case NFA_ZREF5: STRCPY(code, "NFA_ZREF5"); break; 2332 case NFA_ZREF6: STRCPY(code, "NFA_ZREF6"); break; 2333 case NFA_ZREF7: STRCPY(code, "NFA_ZREF7"); break; 2334 case NFA_ZREF8: STRCPY(code, "NFA_ZREF8"); break; 2335 case NFA_ZREF9: STRCPY(code, "NFA_ZREF9"); break; 2336 #endif 2337 case NFA_SKIP: STRCPY(code, "NFA_SKIP"); break; 2338 2339 case NFA_PREV_ATOM_NO_WIDTH: 2340 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break; 2341 case NFA_PREV_ATOM_NO_WIDTH_NEG: 2342 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break; 2343 case NFA_PREV_ATOM_JUST_BEFORE: 2344 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break; 2345 case NFA_PREV_ATOM_JUST_BEFORE_NEG: 2346 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break; 2347 case NFA_PREV_ATOM_LIKE_PATTERN: 2348 STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break; 2349 2350 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break; 2351 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break; 2352 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break; 2353 case NFA_START_INVISIBLE_FIRST: 2354 STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break; 2355 case NFA_START_INVISIBLE_NEG: 2356 STRCPY(code, "NFA_START_INVISIBLE_NEG"); break; 2357 case NFA_START_INVISIBLE_NEG_FIRST: 2358 STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break; 2359 case NFA_START_INVISIBLE_BEFORE: 2360 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break; 2361 case NFA_START_INVISIBLE_BEFORE_FIRST: 2362 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break; 2363 case NFA_START_INVISIBLE_BEFORE_NEG: 2364 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break; 2365 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST: 2366 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break; 2367 case NFA_START_PATTERN: STRCPY(code, "NFA_START_PATTERN"); break; 2368 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break; 2369 case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break; 2370 case NFA_END_PATTERN: STRCPY(code, "NFA_END_PATTERN"); break; 2371 2372 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break; 2373 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break; 2374 case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break; 2375 2376 case NFA_MOPEN: 2377 case NFA_MOPEN1: 2378 case NFA_MOPEN2: 2379 case NFA_MOPEN3: 2380 case NFA_MOPEN4: 2381 case NFA_MOPEN5: 2382 case NFA_MOPEN6: 2383 case NFA_MOPEN7: 2384 case NFA_MOPEN8: 2385 case NFA_MOPEN9: 2386 STRCPY(code, "NFA_MOPEN(x)"); 2387 code[10] = c - NFA_MOPEN + '0'; 2388 break; 2389 case NFA_MCLOSE: 2390 case NFA_MCLOSE1: 2391 case NFA_MCLOSE2: 2392 case NFA_MCLOSE3: 2393 case NFA_MCLOSE4: 2394 case NFA_MCLOSE5: 2395 case NFA_MCLOSE6: 2396 case NFA_MCLOSE7: 2397 case NFA_MCLOSE8: 2398 case NFA_MCLOSE9: 2399 STRCPY(code, "NFA_MCLOSE(x)"); 2400 code[11] = c - NFA_MCLOSE + '0'; 2401 break; 2402 #ifdef FEAT_SYN_HL 2403 case NFA_ZOPEN: 2404 case NFA_ZOPEN1: 2405 case NFA_ZOPEN2: 2406 case NFA_ZOPEN3: 2407 case NFA_ZOPEN4: 2408 case NFA_ZOPEN5: 2409 case NFA_ZOPEN6: 2410 case NFA_ZOPEN7: 2411 case NFA_ZOPEN8: 2412 case NFA_ZOPEN9: 2413 STRCPY(code, "NFA_ZOPEN(x)"); 2414 code[10] = c - NFA_ZOPEN + '0'; 2415 break; 2416 case NFA_ZCLOSE: 2417 case NFA_ZCLOSE1: 2418 case NFA_ZCLOSE2: 2419 case NFA_ZCLOSE3: 2420 case NFA_ZCLOSE4: 2421 case NFA_ZCLOSE5: 2422 case NFA_ZCLOSE6: 2423 case NFA_ZCLOSE7: 2424 case NFA_ZCLOSE8: 2425 case NFA_ZCLOSE9: 2426 STRCPY(code, "NFA_ZCLOSE(x)"); 2427 code[11] = c - NFA_ZCLOSE + '0'; 2428 break; 2429 #endif 2430 case NFA_EOL: STRCPY(code, "NFA_EOL "); break; 2431 case NFA_BOL: STRCPY(code, "NFA_BOL "); break; 2432 case NFA_EOW: STRCPY(code, "NFA_EOW "); break; 2433 case NFA_BOW: STRCPY(code, "NFA_BOW "); break; 2434 case NFA_EOF: STRCPY(code, "NFA_EOF "); break; 2435 case NFA_BOF: STRCPY(code, "NFA_BOF "); break; 2436 case NFA_LNUM: STRCPY(code, "NFA_LNUM "); break; 2437 case NFA_LNUM_GT: STRCPY(code, "NFA_LNUM_GT "); break; 2438 case NFA_LNUM_LT: STRCPY(code, "NFA_LNUM_LT "); break; 2439 case NFA_COL: STRCPY(code, "NFA_COL "); break; 2440 case NFA_COL_GT: STRCPY(code, "NFA_COL_GT "); break; 2441 case NFA_COL_LT: STRCPY(code, "NFA_COL_LT "); break; 2442 case NFA_VCOL: STRCPY(code, "NFA_VCOL "); break; 2443 case NFA_VCOL_GT: STRCPY(code, "NFA_VCOL_GT "); break; 2444 case NFA_VCOL_LT: STRCPY(code, "NFA_VCOL_LT "); break; 2445 case NFA_MARK: STRCPY(code, "NFA_MARK "); break; 2446 case NFA_MARK_GT: STRCPY(code, "NFA_MARK_GT "); break; 2447 case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break; 2448 case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break; 2449 case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break; 2450 case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break; 2451 2452 case NFA_STAR: STRCPY(code, "NFA_STAR "); break; 2453 case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break; 2454 case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break; 2455 case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break; 2456 case NFA_EMPTY: STRCPY(code, "NFA_EMPTY"); break; 2457 case NFA_OR: STRCPY(code, "NFA_OR"); break; 2458 2459 case NFA_START_COLL: STRCPY(code, "NFA_START_COLL"); break; 2460 case NFA_END_COLL: STRCPY(code, "NFA_END_COLL"); break; 2461 case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break; 2462 case NFA_END_NEG_COLL: STRCPY(code, "NFA_END_NEG_COLL"); break; 2463 case NFA_RANGE: STRCPY(code, "NFA_RANGE"); break; 2464 case NFA_RANGE_MIN: STRCPY(code, "NFA_RANGE_MIN"); break; 2465 case NFA_RANGE_MAX: STRCPY(code, "NFA_RANGE_MAX"); break; 2466 2467 case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break; 2468 case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break; 2469 case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break; 2470 case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break; 2471 case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break; 2472 case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break; 2473 case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break; 2474 case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break; 2475 case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break; 2476 case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break; 2477 case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break; 2478 case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break; 2479 case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break; 2480 case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break; 2481 case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break; 2482 case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break; 2483 2484 case NFA_ANY: STRCPY(code, "NFA_ANY"); break; 2485 case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break; 2486 case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break; 2487 case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break; 2488 case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break; 2489 case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break; 2490 case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break; 2491 case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break; 2492 case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break; 2493 case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break; 2494 case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break; 2495 case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break; 2496 case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break; 2497 case NFA_HEX: STRCPY(code, "NFA_HEX"); break; 2498 case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break; 2499 case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break; 2500 case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break; 2501 case NFA_WORD: STRCPY(code, "NFA_WORD"); break; 2502 case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break; 2503 case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break; 2504 case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break; 2505 case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break; 2506 case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break; 2507 case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break; 2508 case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break; 2509 case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break; 2510 case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break; 2511 case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break; 2512 case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break; 2513 case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break; 2514 case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break; 2515 2516 default: 2517 STRCPY(code, "CHAR(x)"); 2518 code[5] = c; 2519 } 2520 2521 if (addnl == TRUE) 2522 STRCAT(code, " + NEWLINE "); 2523 2524 } 2525 2526 #ifdef ENABLE_LOG 2527 static FILE *log_fd; 2528 2529 /* 2530 * Print the postfix notation of the current regexp. 2531 */ 2532 static void 2533 nfa_postfix_dump(expr, retval) 2534 char_u *expr; 2535 int retval; 2536 { 2537 int *p; 2538 FILE *f; 2539 2540 f = fopen(NFA_REGEXP_DUMP_LOG, "a"); 2541 if (f != NULL) 2542 { 2543 fprintf(f, "\n-------------------------\n"); 2544 if (retval == FAIL) 2545 fprintf(f, ">>> NFA engine failed ... \n"); 2546 else if (retval == OK) 2547 fprintf(f, ">>> NFA engine succeeded !\n"); 2548 fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr); 2549 for (p = post_start; *p && p < post_ptr; p++) 2550 { 2551 nfa_set_code(*p); 2552 fprintf(f, "%s, ", code); 2553 } 2554 fprintf(f, "\"\nPostfix notation (int): "); 2555 for (p = post_start; *p && p < post_ptr; p++) 2556 fprintf(f, "%d ", *p); 2557 fprintf(f, "\n\n"); 2558 fclose(f); 2559 } 2560 } 2561 2562 /* 2563 * Print the NFA starting with a root node "state". 2564 */ 2565 static void 2566 nfa_print_state(debugf, state) 2567 FILE *debugf; 2568 nfa_state_T *state; 2569 { 2570 garray_T indent; 2571 2572 ga_init2(&indent, 1, 64); 2573 ga_append(&indent, '\0'); 2574 nfa_print_state2(debugf, state, &indent); 2575 ga_clear(&indent); 2576 } 2577 2578 static void 2579 nfa_print_state2(debugf, state, indent) 2580 FILE *debugf; 2581 nfa_state_T *state; 2582 garray_T *indent; 2583 { 2584 char_u *p; 2585 2586 if (state == NULL) 2587 return; 2588 2589 fprintf(debugf, "(%2d)", abs(state->id)); 2590 2591 /* Output indent */ 2592 p = (char_u *)indent->ga_data; 2593 if (indent->ga_len >= 3) 2594 { 2595 int last = indent->ga_len - 3; 2596 char_u save[2]; 2597 2598 STRNCPY(save, &p[last], 2); 2599 STRNCPY(&p[last], "+-", 2); 2600 fprintf(debugf, " %s", p); 2601 STRNCPY(&p[last], save, 2); 2602 } 2603 else 2604 fprintf(debugf, " %s", p); 2605 2606 nfa_set_code(state->c); 2607 fprintf(debugf, "%s (%d) (id=%d) val=%d\n", 2608 code, 2609 state->c, 2610 abs(state->id), 2611 state->val); 2612 if (state->id < 0) 2613 return; 2614 2615 state->id = abs(state->id) * -1; 2616 2617 /* grow indent for state->out */ 2618 indent->ga_len -= 1; 2619 if (state->out1) 2620 ga_concat(indent, (char_u *)"| "); 2621 else 2622 ga_concat(indent, (char_u *)" "); 2623 ga_append(indent, '\0'); 2624 2625 nfa_print_state2(debugf, state->out, indent); 2626 2627 /* replace last part of indent for state->out1 */ 2628 indent->ga_len -= 3; 2629 ga_concat(indent, (char_u *)" "); 2630 ga_append(indent, '\0'); 2631 2632 nfa_print_state2(debugf, state->out1, indent); 2633 2634 /* shrink indent */ 2635 indent->ga_len -= 3; 2636 ga_append(indent, '\0'); 2637 } 2638 2639 /* 2640 * Print the NFA state machine. 2641 */ 2642 static void 2643 nfa_dump(prog) 2644 nfa_regprog_T *prog; 2645 { 2646 FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a"); 2647 2648 if (debugf != NULL) 2649 { 2650 nfa_print_state(debugf, prog->start); 2651 2652 if (prog->reganch) 2653 fprintf(debugf, "reganch: %d\n", prog->reganch); 2654 if (prog->regstart != NUL) 2655 fprintf(debugf, "regstart: %c (decimal: %d)\n", 2656 prog->regstart, prog->regstart); 2657 if (prog->match_text != NULL) 2658 fprintf(debugf, "match_text: \"%s\"\n", prog->match_text); 2659 2660 fclose(debugf); 2661 } 2662 } 2663 #endif /* ENABLE_LOG */ 2664 #endif /* DEBUG */ 2665 2666 /* 2667 * Parse r.e. @expr and convert it into postfix form. 2668 * Return the postfix string on success, NULL otherwise. 2669 */ 2670 static int * 2671 re2post() 2672 { 2673 if (nfa_reg(REG_NOPAREN) == FAIL) 2674 return NULL; 2675 EMIT(NFA_MOPEN); 2676 return post_start; 2677 } 2678 2679 /* NB. Some of the code below is inspired by Russ's. */ 2680 2681 /* 2682 * Represents an NFA state plus zero or one or two arrows exiting. 2683 * if c == MATCH, no arrows out; matching state. 2684 * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL). 2685 * If c < 256, labeled arrow with character c to out. 2686 */ 2687 2688 static nfa_state_T *state_ptr; /* points to nfa_prog->state */ 2689 2690 /* 2691 * Allocate and initialize nfa_state_T. 2692 */ 2693 static nfa_state_T * 2694 alloc_state(c, out, out1) 2695 int c; 2696 nfa_state_T *out; 2697 nfa_state_T *out1; 2698 { 2699 nfa_state_T *s; 2700 2701 if (istate >= nstate) 2702 return NULL; 2703 2704 s = &state_ptr[istate++]; 2705 2706 s->c = c; 2707 s->out = out; 2708 s->out1 = out1; 2709 s->val = 0; 2710 2711 s->id = istate; 2712 s->lastlist[0] = 0; 2713 s->lastlist[1] = 0; 2714 2715 return s; 2716 } 2717 2718 /* 2719 * A partially built NFA without the matching state filled in. 2720 * Frag_T.start points at the start state. 2721 * Frag_T.out is a list of places that need to be set to the 2722 * next state for this fragment. 2723 */ 2724 2725 /* Since the out pointers in the list are always 2726 * uninitialized, we use the pointers themselves 2727 * as storage for the Ptrlists. */ 2728 typedef union Ptrlist Ptrlist; 2729 union Ptrlist 2730 { 2731 Ptrlist *next; 2732 nfa_state_T *s; 2733 }; 2734 2735 struct Frag 2736 { 2737 nfa_state_T *start; 2738 Ptrlist *out; 2739 }; 2740 typedef struct Frag Frag_T; 2741 2742 static Frag_T frag __ARGS((nfa_state_T *start, Ptrlist *out)); 2743 static Ptrlist *list1 __ARGS((nfa_state_T **outp)); 2744 static void patch __ARGS((Ptrlist *l, nfa_state_T *s)); 2745 static Ptrlist *append __ARGS((Ptrlist *l1, Ptrlist *l2)); 2746 static void st_push __ARGS((Frag_T s, Frag_T **p, Frag_T *stack_end)); 2747 static Frag_T st_pop __ARGS((Frag_T **p, Frag_T *stack)); 2748 2749 /* 2750 * Initialize a Frag_T struct and return it. 2751 */ 2752 static Frag_T 2753 frag(start, out) 2754 nfa_state_T *start; 2755 Ptrlist *out; 2756 { 2757 Frag_T n; 2758 2759 n.start = start; 2760 n.out = out; 2761 return n; 2762 } 2763 2764 /* 2765 * Create singleton list containing just outp. 2766 */ 2767 static Ptrlist * 2768 list1(outp) 2769 nfa_state_T **outp; 2770 { 2771 Ptrlist *l; 2772 2773 l = (Ptrlist *)outp; 2774 l->next = NULL; 2775 return l; 2776 } 2777 2778 /* 2779 * Patch the list of states at out to point to start. 2780 */ 2781 static void 2782 patch(l, s) 2783 Ptrlist *l; 2784 nfa_state_T *s; 2785 { 2786 Ptrlist *next; 2787 2788 for (; l; l = next) 2789 { 2790 next = l->next; 2791 l->s = s; 2792 } 2793 } 2794 2795 2796 /* 2797 * Join the two lists l1 and l2, returning the combination. 2798 */ 2799 static Ptrlist * 2800 append(l1, l2) 2801 Ptrlist *l1; 2802 Ptrlist *l2; 2803 { 2804 Ptrlist *oldl1; 2805 2806 oldl1 = l1; 2807 while (l1->next) 2808 l1 = l1->next; 2809 l1->next = l2; 2810 return oldl1; 2811 } 2812 2813 /* 2814 * Stack used for transforming postfix form into NFA. 2815 */ 2816 static Frag_T empty; 2817 2818 static void 2819 st_error(postfix, end, p) 2820 int *postfix UNUSED; 2821 int *end UNUSED; 2822 int *p UNUSED; 2823 { 2824 #ifdef NFA_REGEXP_ERROR_LOG 2825 FILE *df; 2826 int *p2; 2827 2828 df = fopen(NFA_REGEXP_ERROR_LOG, "a"); 2829 if (df) 2830 { 2831 fprintf(df, "Error popping the stack!\n"); 2832 #ifdef DEBUG 2833 fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr); 2834 #endif 2835 fprintf(df, "Postfix form is: "); 2836 #ifdef DEBUG 2837 for (p2 = postfix; p2 < end; p2++) 2838 { 2839 nfa_set_code(*p2); 2840 fprintf(df, "%s, ", code); 2841 } 2842 nfa_set_code(*p); 2843 fprintf(df, "\nCurrent position is: "); 2844 for (p2 = postfix; p2 <= p; p2 ++) 2845 { 2846 nfa_set_code(*p2); 2847 fprintf(df, "%s, ", code); 2848 } 2849 #else 2850 for (p2 = postfix; p2 < end; p2++) 2851 { 2852 fprintf(df, "%d, ", *p2); 2853 } 2854 fprintf(df, "\nCurrent position is: "); 2855 for (p2 = postfix; p2 <= p; p2 ++) 2856 { 2857 fprintf(df, "%d, ", *p2); 2858 } 2859 #endif 2860 fprintf(df, "\n--------------------------\n"); 2861 fclose(df); 2862 } 2863 #endif 2864 EMSG(_("E874: (NFA) Could not pop the stack !")); 2865 } 2866 2867 /* 2868 * Push an item onto the stack. 2869 */ 2870 static void 2871 st_push(s, p, stack_end) 2872 Frag_T s; 2873 Frag_T **p; 2874 Frag_T *stack_end; 2875 { 2876 Frag_T *stackp = *p; 2877 2878 if (stackp >= stack_end) 2879 return; 2880 *stackp = s; 2881 *p = *p + 1; 2882 } 2883 2884 /* 2885 * Pop an item from the stack. 2886 */ 2887 static Frag_T 2888 st_pop(p, stack) 2889 Frag_T **p; 2890 Frag_T *stack; 2891 { 2892 Frag_T *stackp; 2893 2894 *p = *p - 1; 2895 stackp = *p; 2896 if (stackp < stack) 2897 return empty; 2898 return **p; 2899 } 2900 2901 /* 2902 * Estimate the maximum byte length of anything matching "state". 2903 * When unknown or unlimited return -1. 2904 */ 2905 static int 2906 nfa_max_width(startstate, depth) 2907 nfa_state_T *startstate; 2908 int depth; 2909 { 2910 int l, r; 2911 nfa_state_T *state = startstate; 2912 int len = 0; 2913 2914 /* detect looping in a NFA_SPLIT */ 2915 if (depth > 4) 2916 return -1; 2917 2918 while (state != NULL) 2919 { 2920 switch (state->c) 2921 { 2922 case NFA_END_INVISIBLE: 2923 case NFA_END_INVISIBLE_NEG: 2924 /* the end, return what we have */ 2925 return len; 2926 2927 case NFA_SPLIT: 2928 /* two alternatives, use the maximum */ 2929 l = nfa_max_width(state->out, depth + 1); 2930 r = nfa_max_width(state->out1, depth + 1); 2931 if (l < 0 || r < 0) 2932 return -1; 2933 return len + (l > r ? l : r); 2934 2935 case NFA_ANY: 2936 case NFA_START_COLL: 2937 case NFA_START_NEG_COLL: 2938 /* matches some character, including composing chars */ 2939 #ifdef FEAT_MBYTE 2940 if (enc_utf8) 2941 len += MB_MAXBYTES; 2942 else if (has_mbyte) 2943 len += 2; 2944 else 2945 #endif 2946 ++len; 2947 if (state->c != NFA_ANY) 2948 { 2949 /* skip over the characters */ 2950 state = state->out1->out; 2951 continue; 2952 } 2953 break; 2954 2955 case NFA_DIGIT: 2956 case NFA_WHITE: 2957 case NFA_HEX: 2958 case NFA_OCTAL: 2959 /* ascii */ 2960 ++len; 2961 break; 2962 2963 case NFA_IDENT: 2964 case NFA_SIDENT: 2965 case NFA_KWORD: 2966 case NFA_SKWORD: 2967 case NFA_FNAME: 2968 case NFA_SFNAME: 2969 case NFA_PRINT: 2970 case NFA_SPRINT: 2971 case NFA_NWHITE: 2972 case NFA_NDIGIT: 2973 case NFA_NHEX: 2974 case NFA_NOCTAL: 2975 case NFA_WORD: 2976 case NFA_NWORD: 2977 case NFA_HEAD: 2978 case NFA_NHEAD: 2979 case NFA_ALPHA: 2980 case NFA_NALPHA: 2981 case NFA_LOWER: 2982 case NFA_NLOWER: 2983 case NFA_UPPER: 2984 case NFA_NUPPER: 2985 case NFA_LOWER_IC: 2986 case NFA_NLOWER_IC: 2987 case NFA_UPPER_IC: 2988 case NFA_NUPPER_IC: 2989 case NFA_ANY_COMPOSING: 2990 /* possibly non-ascii */ 2991 #ifdef FEAT_MBYTE 2992 if (has_mbyte) 2993 len += 3; 2994 else 2995 #endif 2996 ++len; 2997 break; 2998 2999 case NFA_START_INVISIBLE: 3000 case NFA_START_INVISIBLE_NEG: 3001 case NFA_START_INVISIBLE_BEFORE: 3002 case NFA_START_INVISIBLE_BEFORE_NEG: 3003 /* zero-width, out1 points to the END state */ 3004 state = state->out1->out; 3005 continue; 3006 3007 case NFA_BACKREF1: 3008 case NFA_BACKREF2: 3009 case NFA_BACKREF3: 3010 case NFA_BACKREF4: 3011 case NFA_BACKREF5: 3012 case NFA_BACKREF6: 3013 case NFA_BACKREF7: 3014 case NFA_BACKREF8: 3015 case NFA_BACKREF9: 3016 #ifdef FEAT_SYN_HL 3017 case NFA_ZREF1: 3018 case NFA_ZREF2: 3019 case NFA_ZREF3: 3020 case NFA_ZREF4: 3021 case NFA_ZREF5: 3022 case NFA_ZREF6: 3023 case NFA_ZREF7: 3024 case NFA_ZREF8: 3025 case NFA_ZREF9: 3026 #endif 3027 case NFA_NEWL: 3028 case NFA_SKIP: 3029 /* unknown width */ 3030 return -1; 3031 3032 case NFA_BOL: 3033 case NFA_EOL: 3034 case NFA_BOF: 3035 case NFA_EOF: 3036 case NFA_BOW: 3037 case NFA_EOW: 3038 case NFA_MOPEN: 3039 case NFA_MOPEN1: 3040 case NFA_MOPEN2: 3041 case NFA_MOPEN3: 3042 case NFA_MOPEN4: 3043 case NFA_MOPEN5: 3044 case NFA_MOPEN6: 3045 case NFA_MOPEN7: 3046 case NFA_MOPEN8: 3047 case NFA_MOPEN9: 3048 #ifdef FEAT_SYN_HL 3049 case NFA_ZOPEN: 3050 case NFA_ZOPEN1: 3051 case NFA_ZOPEN2: 3052 case NFA_ZOPEN3: 3053 case NFA_ZOPEN4: 3054 case NFA_ZOPEN5: 3055 case NFA_ZOPEN6: 3056 case NFA_ZOPEN7: 3057 case NFA_ZOPEN8: 3058 case NFA_ZOPEN9: 3059 case NFA_ZCLOSE: 3060 case NFA_ZCLOSE1: 3061 case NFA_ZCLOSE2: 3062 case NFA_ZCLOSE3: 3063 case NFA_ZCLOSE4: 3064 case NFA_ZCLOSE5: 3065 case NFA_ZCLOSE6: 3066 case NFA_ZCLOSE7: 3067 case NFA_ZCLOSE8: 3068 case NFA_ZCLOSE9: 3069 #endif 3070 case NFA_MCLOSE: 3071 case NFA_MCLOSE1: 3072 case NFA_MCLOSE2: 3073 case NFA_MCLOSE3: 3074 case NFA_MCLOSE4: 3075 case NFA_MCLOSE5: 3076 case NFA_MCLOSE6: 3077 case NFA_MCLOSE7: 3078 case NFA_MCLOSE8: 3079 case NFA_MCLOSE9: 3080 case NFA_NOPEN: 3081 case NFA_NCLOSE: 3082 3083 case NFA_LNUM_GT: 3084 case NFA_LNUM_LT: 3085 case NFA_COL_GT: 3086 case NFA_COL_LT: 3087 case NFA_VCOL_GT: 3088 case NFA_VCOL_LT: 3089 case NFA_MARK_GT: 3090 case NFA_MARK_LT: 3091 case NFA_VISUAL: 3092 case NFA_LNUM: 3093 case NFA_CURSOR: 3094 case NFA_COL: 3095 case NFA_VCOL: 3096 case NFA_MARK: 3097 3098 case NFA_ZSTART: 3099 case NFA_ZEND: 3100 case NFA_OPT_CHARS: 3101 case NFA_EMPTY: 3102 case NFA_START_PATTERN: 3103 case NFA_END_PATTERN: 3104 case NFA_COMPOSING: 3105 case NFA_END_COMPOSING: 3106 /* zero-width */ 3107 break; 3108 3109 default: 3110 if (state->c < 0) 3111 /* don't know what this is */ 3112 return -1; 3113 /* normal character */ 3114 len += MB_CHAR2LEN(state->c); 3115 break; 3116 } 3117 3118 /* normal way to continue */ 3119 state = state->out; 3120 } 3121 3122 /* unrecognized, "cannot happen" */ 3123 return -1; 3124 } 3125 3126 /* 3127 * Convert a postfix form into its equivalent NFA. 3128 * Return the NFA start state on success, NULL otherwise. 3129 */ 3130 static nfa_state_T * 3131 post2nfa(postfix, end, nfa_calc_size) 3132 int *postfix; 3133 int *end; 3134 int nfa_calc_size; 3135 { 3136 int *p; 3137 int mopen; 3138 int mclose; 3139 Frag_T *stack = NULL; 3140 Frag_T *stackp = NULL; 3141 Frag_T *stack_end = NULL; 3142 Frag_T e1; 3143 Frag_T e2; 3144 Frag_T e; 3145 nfa_state_T *s; 3146 nfa_state_T *s1; 3147 nfa_state_T *matchstate; 3148 nfa_state_T *ret = NULL; 3149 3150 if (postfix == NULL) 3151 return NULL; 3152 3153 #define PUSH(s) st_push((s), &stackp, stack_end) 3154 #define POP() st_pop(&stackp, stack); \ 3155 if (stackp < stack) \ 3156 { \ 3157 st_error(postfix, end, p); \ 3158 vim_free(stack); \ 3159 return NULL; \ 3160 } 3161 3162 if (nfa_calc_size == FALSE) 3163 { 3164 /* Allocate space for the stack. Max states on the stack : nstate */ 3165 stack = (Frag_T *)lalloc((nstate + 1) * sizeof(Frag_T), TRUE); 3166 stackp = stack; 3167 stack_end = stack + (nstate + 1); 3168 } 3169 3170 for (p = postfix; p < end; ++p) 3171 { 3172 switch (*p) 3173 { 3174 case NFA_CONCAT: 3175 /* Concatenation. 3176 * Pay attention: this operator does not exist in the r.e. itself 3177 * (it is implicit, really). It is added when r.e. is translated 3178 * to postfix form in re2post(). */ 3179 if (nfa_calc_size == TRUE) 3180 { 3181 /* nstate += 0; */ 3182 break; 3183 } 3184 e2 = POP(); 3185 e1 = POP(); 3186 patch(e1.out, e2.start); 3187 PUSH(frag(e1.start, e2.out)); 3188 break; 3189 3190 case NFA_OR: 3191 /* Alternation */ 3192 if (nfa_calc_size == TRUE) 3193 { 3194 nstate++; 3195 break; 3196 } 3197 e2 = POP(); 3198 e1 = POP(); 3199 s = alloc_state(NFA_SPLIT, e1.start, e2.start); 3200 if (s == NULL) 3201 goto theend; 3202 PUSH(frag(s, append(e1.out, e2.out))); 3203 break; 3204 3205 case NFA_STAR: 3206 /* Zero or more, prefer more */ 3207 if (nfa_calc_size == TRUE) 3208 { 3209 nstate++; 3210 break; 3211 } 3212 e = POP(); 3213 s = alloc_state(NFA_SPLIT, e.start, NULL); 3214 if (s == NULL) 3215 goto theend; 3216 patch(e.out, s); 3217 PUSH(frag(s, list1(&s->out1))); 3218 break; 3219 3220 case NFA_STAR_NONGREEDY: 3221 /* Zero or more, prefer zero */ 3222 if (nfa_calc_size == TRUE) 3223 { 3224 nstate++; 3225 break; 3226 } 3227 e = POP(); 3228 s = alloc_state(NFA_SPLIT, NULL, e.start); 3229 if (s == NULL) 3230 goto theend; 3231 patch(e.out, s); 3232 PUSH(frag(s, list1(&s->out))); 3233 break; 3234 3235 case NFA_QUEST: 3236 /* one or zero atoms=> greedy match */ 3237 if (nfa_calc_size == TRUE) 3238 { 3239 nstate++; 3240 break; 3241 } 3242 e = POP(); 3243 s = alloc_state(NFA_SPLIT, e.start, NULL); 3244 if (s == NULL) 3245 goto theend; 3246 PUSH(frag(s, append(e.out, list1(&s->out1)))); 3247 break; 3248 3249 case NFA_QUEST_NONGREEDY: 3250 /* zero or one atoms => non-greedy match */ 3251 if (nfa_calc_size == TRUE) 3252 { 3253 nstate++; 3254 break; 3255 } 3256 e = POP(); 3257 s = alloc_state(NFA_SPLIT, NULL, e.start); 3258 if (s == NULL) 3259 goto theend; 3260 PUSH(frag(s, append(e.out, list1(&s->out)))); 3261 break; 3262 3263 case NFA_END_COLL: 3264 case NFA_END_NEG_COLL: 3265 /* On the stack is the sequence starting with NFA_START_COLL or 3266 * NFA_START_NEG_COLL and all possible characters. Patch it to 3267 * add the output to the start. */ 3268 if (nfa_calc_size == TRUE) 3269 { 3270 nstate++; 3271 break; 3272 } 3273 e = POP(); 3274 s = alloc_state(NFA_END_COLL, NULL, NULL); 3275 if (s == NULL) 3276 goto theend; 3277 patch(e.out, s); 3278 e.start->out1 = s; 3279 PUSH(frag(e.start, list1(&s->out))); 3280 break; 3281 3282 case NFA_RANGE: 3283 /* Before this are two characters, the low and high end of a 3284 * range. Turn them into two states with MIN and MAX. */ 3285 if (nfa_calc_size == TRUE) 3286 { 3287 /* nstate += 0; */ 3288 break; 3289 } 3290 e2 = POP(); 3291 e1 = POP(); 3292 e2.start->val = e2.start->c; 3293 e2.start->c = NFA_RANGE_MAX; 3294 e1.start->val = e1.start->c; 3295 e1.start->c = NFA_RANGE_MIN; 3296 patch(e1.out, e2.start); 3297 PUSH(frag(e1.start, e2.out)); 3298 break; 3299 3300 case NFA_EMPTY: 3301 /* 0-length, used in a repetition with max/min count of 0 */ 3302 if (nfa_calc_size == TRUE) 3303 { 3304 nstate++; 3305 break; 3306 } 3307 s = alloc_state(NFA_EMPTY, NULL, NULL); 3308 if (s == NULL) 3309 goto theend; 3310 PUSH(frag(s, list1(&s->out))); 3311 break; 3312 3313 case NFA_OPT_CHARS: 3314 { 3315 int n; 3316 3317 /* \%[abc] implemented as: 3318 * NFA_SPLIT 3319 * +-CHAR(a) 3320 * | +-NFA_SPLIT 3321 * | +-CHAR(b) 3322 * | | +-NFA_SPLIT 3323 * | | +-CHAR(c) 3324 * | | | +-next 3325 * | | +- next 3326 * | +- next 3327 * +- next 3328 */ 3329 n = *++p; /* get number of characters */ 3330 if (nfa_calc_size == TRUE) 3331 { 3332 nstate += n; 3333 break; 3334 } 3335 s = NULL; /* avoid compiler warning */ 3336 e1.out = NULL; /* stores list with out1's */ 3337 s1 = NULL; /* previous NFA_SPLIT to connect to */ 3338 while (n-- > 0) 3339 { 3340 e = POP(); /* get character */ 3341 s = alloc_state(NFA_SPLIT, e.start, NULL); 3342 if (s == NULL) 3343 goto theend; 3344 if (e1.out == NULL) 3345 e1 = e; 3346 patch(e.out, s1); 3347 append(e1.out, list1(&s->out1)); 3348 s1 = s; 3349 } 3350 PUSH(frag(s, e1.out)); 3351 break; 3352 } 3353 3354 case NFA_PREV_ATOM_NO_WIDTH: 3355 case NFA_PREV_ATOM_NO_WIDTH_NEG: 3356 case NFA_PREV_ATOM_JUST_BEFORE: 3357 case NFA_PREV_ATOM_JUST_BEFORE_NEG: 3358 case NFA_PREV_ATOM_LIKE_PATTERN: 3359 { 3360 int before = (*p == NFA_PREV_ATOM_JUST_BEFORE 3361 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG); 3362 int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN); 3363 int start_state; 3364 int end_state; 3365 int n = 0; 3366 nfa_state_T *zend; 3367 nfa_state_T *skip; 3368 3369 switch (*p) 3370 { 3371 case NFA_PREV_ATOM_NO_WIDTH: 3372 start_state = NFA_START_INVISIBLE; 3373 end_state = NFA_END_INVISIBLE; 3374 break; 3375 case NFA_PREV_ATOM_NO_WIDTH_NEG: 3376 start_state = NFA_START_INVISIBLE_NEG; 3377 end_state = NFA_END_INVISIBLE_NEG; 3378 break; 3379 case NFA_PREV_ATOM_JUST_BEFORE: 3380 start_state = NFA_START_INVISIBLE_BEFORE; 3381 end_state = NFA_END_INVISIBLE; 3382 break; 3383 case NFA_PREV_ATOM_JUST_BEFORE_NEG: 3384 start_state = NFA_START_INVISIBLE_BEFORE_NEG; 3385 end_state = NFA_END_INVISIBLE_NEG; 3386 break; 3387 default: /* NFA_PREV_ATOM_LIKE_PATTERN: */ 3388 start_state = NFA_START_PATTERN; 3389 end_state = NFA_END_PATTERN; 3390 break; 3391 } 3392 3393 if (before) 3394 n = *++p; /* get the count */ 3395 3396 /* The \@= operator: match the preceding atom with zero width. 3397 * The \@! operator: no match for the preceding atom. 3398 * The \@<= operator: match for the preceding atom. 3399 * The \@<! operator: no match for the preceding atom. 3400 * Surrounds the preceding atom with START_INVISIBLE and 3401 * END_INVISIBLE, similarly to MOPEN. */ 3402 3403 if (nfa_calc_size == TRUE) 3404 { 3405 nstate += pattern ? 4 : 2; 3406 break; 3407 } 3408 e = POP(); 3409 s1 = alloc_state(end_state, NULL, NULL); 3410 if (s1 == NULL) 3411 goto theend; 3412 3413 s = alloc_state(start_state, e.start, s1); 3414 if (s == NULL) 3415 goto theend; 3416 if (pattern) 3417 { 3418 /* NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows. */ 3419 skip = alloc_state(NFA_SKIP, NULL, NULL); 3420 zend = alloc_state(NFA_ZEND, s1, NULL); 3421 s1->out= skip; 3422 patch(e.out, zend); 3423 PUSH(frag(s, list1(&skip->out))); 3424 } 3425 else 3426 { 3427 patch(e.out, s1); 3428 PUSH(frag(s, list1(&s1->out))); 3429 if (before) 3430 { 3431 if (n <= 0) 3432 /* See if we can guess the maximum width, it avoids a 3433 * lot of pointless tries. */ 3434 n = nfa_max_width(e.start, 0); 3435 s->val = n; /* store the count */ 3436 } 3437 } 3438 break; 3439 } 3440 3441 #ifdef FEAT_MBYTE 3442 case NFA_COMPOSING: /* char with composing char */ 3443 #if 0 3444 /* TODO */ 3445 if (regflags & RF_ICOMBINE) 3446 { 3447 /* use the base character only */ 3448 } 3449 #endif 3450 /* FALLTHROUGH */ 3451 #endif 3452 3453 case NFA_MOPEN: /* \( \) Submatch */ 3454 case NFA_MOPEN1: 3455 case NFA_MOPEN2: 3456 case NFA_MOPEN3: 3457 case NFA_MOPEN4: 3458 case NFA_MOPEN5: 3459 case NFA_MOPEN6: 3460 case NFA_MOPEN7: 3461 case NFA_MOPEN8: 3462 case NFA_MOPEN9: 3463 #ifdef FEAT_SYN_HL 3464 case NFA_ZOPEN: /* \z( \) Submatch */ 3465 case NFA_ZOPEN1: 3466 case NFA_ZOPEN2: 3467 case NFA_ZOPEN3: 3468 case NFA_ZOPEN4: 3469 case NFA_ZOPEN5: 3470 case NFA_ZOPEN6: 3471 case NFA_ZOPEN7: 3472 case NFA_ZOPEN8: 3473 case NFA_ZOPEN9: 3474 #endif 3475 case NFA_NOPEN: /* \%( \) "Invisible Submatch" */ 3476 if (nfa_calc_size == TRUE) 3477 { 3478 nstate += 2; 3479 break; 3480 } 3481 3482 mopen = *p; 3483 switch (*p) 3484 { 3485 case NFA_NOPEN: mclose = NFA_NCLOSE; break; 3486 #ifdef FEAT_SYN_HL 3487 case NFA_ZOPEN: mclose = NFA_ZCLOSE; break; 3488 case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break; 3489 case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break; 3490 case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break; 3491 case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break; 3492 case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break; 3493 case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break; 3494 case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break; 3495 case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break; 3496 case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break; 3497 #endif 3498 #ifdef FEAT_MBYTE 3499 case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break; 3500 #endif 3501 default: 3502 /* NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9 */ 3503 mclose = *p + NSUBEXP; 3504 break; 3505 } 3506 3507 /* Allow "NFA_MOPEN" as a valid postfix representation for 3508 * the empty regexp "". In this case, the NFA will be 3509 * NFA_MOPEN -> NFA_MCLOSE. Note that this also allows 3510 * empty groups of parenthesis, and empty mbyte chars */ 3511 if (stackp == stack) 3512 { 3513 s = alloc_state(mopen, NULL, NULL); 3514 if (s == NULL) 3515 goto theend; 3516 s1 = alloc_state(mclose, NULL, NULL); 3517 if (s1 == NULL) 3518 goto theend; 3519 patch(list1(&s->out), s1); 3520 PUSH(frag(s, list1(&s1->out))); 3521 break; 3522 } 3523 3524 /* At least one node was emitted before NFA_MOPEN, so 3525 * at least one node will be between NFA_MOPEN and NFA_MCLOSE */ 3526 e = POP(); 3527 s = alloc_state(mopen, e.start, NULL); /* `(' */ 3528 if (s == NULL) 3529 goto theend; 3530 3531 s1 = alloc_state(mclose, NULL, NULL); /* `)' */ 3532 if (s1 == NULL) 3533 goto theend; 3534 patch(e.out, s1); 3535 3536 #ifdef FEAT_MBYTE 3537 if (mopen == NFA_COMPOSING) 3538 /* COMPOSING->out1 = END_COMPOSING */ 3539 patch(list1(&s->out1), s1); 3540 #endif 3541 3542 PUSH(frag(s, list1(&s1->out))); 3543 break; 3544 3545 case NFA_BACKREF1: 3546 case NFA_BACKREF2: 3547 case NFA_BACKREF3: 3548 case NFA_BACKREF4: 3549 case NFA_BACKREF5: 3550 case NFA_BACKREF6: 3551 case NFA_BACKREF7: 3552 case NFA_BACKREF8: 3553 case NFA_BACKREF9: 3554 #ifdef FEAT_SYN_HL 3555 case NFA_ZREF1: 3556 case NFA_ZREF2: 3557 case NFA_ZREF3: 3558 case NFA_ZREF4: 3559 case NFA_ZREF5: 3560 case NFA_ZREF6: 3561 case NFA_ZREF7: 3562 case NFA_ZREF8: 3563 case NFA_ZREF9: 3564 #endif 3565 if (nfa_calc_size == TRUE) 3566 { 3567 nstate += 2; 3568 break; 3569 } 3570 s = alloc_state(*p, NULL, NULL); 3571 if (s == NULL) 3572 goto theend; 3573 s1 = alloc_state(NFA_SKIP, NULL, NULL); 3574 if (s1 == NULL) 3575 goto theend; 3576 patch(list1(&s->out), s1); 3577 PUSH(frag(s, list1(&s1->out))); 3578 break; 3579 3580 case NFA_LNUM: 3581 case NFA_LNUM_GT: 3582 case NFA_LNUM_LT: 3583 case NFA_VCOL: 3584 case NFA_VCOL_GT: 3585 case NFA_VCOL_LT: 3586 case NFA_COL: 3587 case NFA_COL_GT: 3588 case NFA_COL_LT: 3589 case NFA_MARK: 3590 case NFA_MARK_GT: 3591 case NFA_MARK_LT: 3592 { 3593 int n = *++p; /* lnum, col or mark name */ 3594 3595 if (nfa_calc_size == TRUE) 3596 { 3597 nstate += 1; 3598 break; 3599 } 3600 s = alloc_state(p[-1], NULL, NULL); 3601 if (s == NULL) 3602 goto theend; 3603 s->val = n; 3604 PUSH(frag(s, list1(&s->out))); 3605 break; 3606 } 3607 3608 case NFA_ZSTART: 3609 case NFA_ZEND: 3610 default: 3611 /* Operands */ 3612 if (nfa_calc_size == TRUE) 3613 { 3614 nstate++; 3615 break; 3616 } 3617 s = alloc_state(*p, NULL, NULL); 3618 if (s == NULL) 3619 goto theend; 3620 PUSH(frag(s, list1(&s->out))); 3621 break; 3622 3623 } /* switch(*p) */ 3624 3625 } /* for(p = postfix; *p; ++p) */ 3626 3627 if (nfa_calc_size == TRUE) 3628 { 3629 nstate++; 3630 goto theend; /* Return value when counting size is ignored anyway */ 3631 } 3632 3633 e = POP(); 3634 if (stackp != stack) 3635 { 3636 vim_free(stack); 3637 EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA), too many states left on stack")); 3638 } 3639 3640 if (istate >= nstate) 3641 { 3642 vim_free(stack); 3643 EMSG_RET_NULL(_("E876: (NFA regexp) Not enough space to store the whole NFA ")); 3644 } 3645 3646 matchstate = &state_ptr[istate++]; /* the match state */ 3647 matchstate->c = NFA_MATCH; 3648 matchstate->out = matchstate->out1 = NULL; 3649 matchstate->id = 0; 3650 3651 patch(e.out, matchstate); 3652 ret = e.start; 3653 3654 theend: 3655 vim_free(stack); 3656 return ret; 3657 3658 #undef POP1 3659 #undef PUSH1 3660 #undef POP2 3661 #undef PUSH2 3662 #undef POP 3663 #undef PUSH 3664 } 3665 3666 /* 3667 * After building the NFA program, inspect it to add optimization hints. 3668 */ 3669 static void 3670 nfa_postprocess(prog) 3671 nfa_regprog_T *prog; 3672 { 3673 int i; 3674 int c; 3675 3676 for (i = 0; i < prog->nstate; ++i) 3677 { 3678 c = prog->state[i].c; 3679 if (c == NFA_START_INVISIBLE 3680 || c == NFA_START_INVISIBLE_NEG 3681 || c == NFA_START_INVISIBLE_BEFORE 3682 || c == NFA_START_INVISIBLE_BEFORE_NEG) 3683 { 3684 int directly; 3685 3686 /* Do it directly when what follows is possibly the end of the 3687 * match. */ 3688 if (match_follows(prog->state[i].out1->out, 0)) 3689 directly = TRUE; 3690 else 3691 { 3692 int ch_invisible = failure_chance(prog->state[i].out, 0); 3693 int ch_follows = failure_chance(prog->state[i].out1->out, 0); 3694 3695 /* Postpone when the invisible match is expensive or has a 3696 * lower chance of failing. */ 3697 if (c == NFA_START_INVISIBLE_BEFORE 3698 || c == NFA_START_INVISIBLE_BEFORE_NEG) 3699 { 3700 /* "before" matches are very expensive when 3701 * unbounded, always prefer what follows then, 3702 * unless what follows will always match. 3703 * Otherwise strongly prefer what follows. */ 3704 if (prog->state[i].val <= 0 && ch_follows > 0) 3705 directly = FALSE; 3706 else 3707 directly = ch_follows * 10 < ch_invisible; 3708 } 3709 else 3710 { 3711 /* normal invisible, first do the one with the 3712 * highest failure chance */ 3713 directly = ch_follows < ch_invisible; 3714 } 3715 } 3716 if (directly) 3717 /* switch to the _FIRST state */ 3718 ++prog->state[i].c; 3719 } 3720 } 3721 } 3722 3723 /**************************************************************** 3724 * NFA execution code. 3725 ****************************************************************/ 3726 3727 typedef struct 3728 { 3729 int in_use; /* number of subexpr with useful info */ 3730 3731 /* When REG_MULTI is TRUE list.multi is used, otherwise list.line. */ 3732 union 3733 { 3734 struct multipos 3735 { 3736 linenr_T start_lnum; 3737 linenr_T end_lnum; 3738 colnr_T start_col; 3739 colnr_T end_col; 3740 } multi[NSUBEXP]; 3741 struct linepos 3742 { 3743 char_u *start; 3744 char_u *end; 3745 } line[NSUBEXP]; 3746 } list; 3747 } regsub_T; 3748 3749 typedef struct 3750 { 3751 regsub_T norm; /* \( .. \) matches */ 3752 #ifdef FEAT_SYN_HL 3753 regsub_T synt; /* \z( .. \) matches */ 3754 #endif 3755 } regsubs_T; 3756 3757 /* nfa_pim_T stores a Postponed Invisible Match. */ 3758 typedef struct nfa_pim_S nfa_pim_T; 3759 struct nfa_pim_S 3760 { 3761 int result; /* NFA_PIM_*, see below */ 3762 nfa_state_T *state; /* the invisible match start state */ 3763 regsubs_T subs; /* submatch info, only party used */ 3764 union 3765 { 3766 lpos_T pos; 3767 char_u *ptr; 3768 } end; /* where the match must end */ 3769 }; 3770 3771 /* Values for done in nfa_pim_T. */ 3772 #define NFA_PIM_UNUSED 0 /* pim not used */ 3773 #define NFA_PIM_TODO 1 /* pim not done yet */ 3774 #define NFA_PIM_MATCH 2 /* pim executed, matches */ 3775 #define NFA_PIM_NOMATCH 3 /* pim executed, no match */ 3776 3777 3778 /* nfa_thread_T contains execution information of a NFA state */ 3779 typedef struct 3780 { 3781 nfa_state_T *state; 3782 int count; 3783 nfa_pim_T pim; /* if pim.result != NFA_PIM_UNUSED: postponed 3784 * invisible match */ 3785 regsubs_T subs; /* submatch info, only party used */ 3786 } nfa_thread_T; 3787 3788 /* nfa_list_T contains the alternative NFA execution states. */ 3789 typedef struct 3790 { 3791 nfa_thread_T *t; /* allocated array of states */ 3792 int n; /* nr of states currently in "t" */ 3793 int len; /* max nr of states in "t" */ 3794 int id; /* ID of the list */ 3795 int has_pim; /* TRUE when any state has a PIM */ 3796 } nfa_list_T; 3797 3798 #ifdef ENABLE_LOG 3799 static void log_subsexpr __ARGS((regsubs_T *subs)); 3800 static void log_subexpr __ARGS((regsub_T *sub)); 3801 static char *pim_info __ARGS((nfa_pim_T *pim)); 3802 3803 static void 3804 log_subsexpr(subs) 3805 regsubs_T *subs; 3806 { 3807 log_subexpr(&subs->norm); 3808 # ifdef FEAT_SYN_HL 3809 if (nfa_has_zsubexpr) 3810 log_subexpr(&subs->synt); 3811 # endif 3812 } 3813 3814 static void 3815 log_subexpr(sub) 3816 regsub_T *sub; 3817 { 3818 int j; 3819 3820 for (j = 0; j < sub->in_use; j++) 3821 if (REG_MULTI) 3822 fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n", 3823 j, 3824 sub->list.multi[j].start_col, 3825 (int)sub->list.multi[j].start_lnum, 3826 sub->list.multi[j].end_col, 3827 (int)sub->list.multi[j].end_lnum); 3828 else 3829 { 3830 char *s = (char *)sub->list.line[j].start; 3831 char *e = (char *)sub->list.line[j].end; 3832 3833 fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n", 3834 j, 3835 s == NULL ? "NULL" : s, 3836 e == NULL ? "NULL" : e); 3837 } 3838 } 3839 3840 static char * 3841 pim_info(pim) 3842 nfa_pim_T *pim; 3843 { 3844 static char buf[30]; 3845 3846 if (pim == NULL || pim->result == NFA_PIM_UNUSED) 3847 buf[0] = NUL; 3848 else 3849 { 3850 sprintf(buf, " PIM col %d", REG_MULTI ? (int)pim->end.pos.col 3851 : (int)(pim->end.ptr - reginput)); 3852 } 3853 return buf; 3854 } 3855 3856 #endif 3857 3858 /* Used during execution: whether a match has been found. */ 3859 static int nfa_match; 3860 #ifdef FEAT_RELTIME 3861 static proftime_T *nfa_time_limit; 3862 static int nfa_time_count; 3863 #endif 3864 3865 static void copy_pim __ARGS((nfa_pim_T *to, nfa_pim_T *from)); 3866 static void clear_sub __ARGS((regsub_T *sub)); 3867 static void copy_sub __ARGS((regsub_T *to, regsub_T *from)); 3868 static void copy_sub_off __ARGS((regsub_T *to, regsub_T *from)); 3869 static void copy_ze_off __ARGS((regsub_T *to, regsub_T *from)); 3870 static int sub_equal __ARGS((regsub_T *sub1, regsub_T *sub2)); 3871 static int match_backref __ARGS((regsub_T *sub, int subidx, int *bytelen)); 3872 static int has_state_with_pos __ARGS((nfa_list_T *l, nfa_state_T *state, regsubs_T *subs, nfa_pim_T *pim)); 3873 static int pim_equal __ARGS((nfa_pim_T *one, nfa_pim_T *two)); 3874 static int state_in_list __ARGS((nfa_list_T *l, nfa_state_T *state, regsubs_T *subs)); 3875 static regsubs_T *addstate __ARGS((nfa_list_T *l, nfa_state_T *state, regsubs_T *subs_arg, nfa_pim_T *pim, int off)); 3876 static void addstate_here __ARGS((nfa_list_T *l, nfa_state_T *state, regsubs_T *subs, nfa_pim_T *pim, int *ip)); 3877 3878 /* 3879 * Copy postponed invisible match info from "from" to "to". 3880 */ 3881 static void 3882 copy_pim(to, from) 3883 nfa_pim_T *to; 3884 nfa_pim_T *from; 3885 { 3886 to->result = from->result; 3887 to->state = from->state; 3888 copy_sub(&to->subs.norm, &from->subs.norm); 3889 #ifdef FEAT_SYN_HL 3890 if (nfa_has_zsubexpr) 3891 copy_sub(&to->subs.synt, &from->subs.synt); 3892 #endif 3893 to->end = from->end; 3894 } 3895 3896 static void 3897 clear_sub(sub) 3898 regsub_T *sub; 3899 { 3900 if (REG_MULTI) 3901 /* Use 0xff to set lnum to -1 */ 3902 vim_memset(sub->list.multi, 0xff, 3903 sizeof(struct multipos) * nfa_nsubexpr); 3904 else 3905 vim_memset(sub->list.line, 0, sizeof(struct linepos) * nfa_nsubexpr); 3906 sub->in_use = 0; 3907 } 3908 3909 /* 3910 * Copy the submatches from "from" to "to". 3911 */ 3912 static void 3913 copy_sub(to, from) 3914 regsub_T *to; 3915 regsub_T *from; 3916 { 3917 to->in_use = from->in_use; 3918 if (from->in_use > 0) 3919 { 3920 /* Copy the match start and end positions. */ 3921 if (REG_MULTI) 3922 mch_memmove(&to->list.multi[0], 3923 &from->list.multi[0], 3924 sizeof(struct multipos) * from->in_use); 3925 else 3926 mch_memmove(&to->list.line[0], 3927 &from->list.line[0], 3928 sizeof(struct linepos) * from->in_use); 3929 } 3930 } 3931 3932 /* 3933 * Like copy_sub() but exclude the main match. 3934 */ 3935 static void 3936 copy_sub_off(to, from) 3937 regsub_T *to; 3938 regsub_T *from; 3939 { 3940 if (to->in_use < from->in_use) 3941 to->in_use = from->in_use; 3942 if (from->in_use > 1) 3943 { 3944 /* Copy the match start and end positions. */ 3945 if (REG_MULTI) 3946 mch_memmove(&to->list.multi[1], 3947 &from->list.multi[1], 3948 sizeof(struct multipos) * (from->in_use - 1)); 3949 else 3950 mch_memmove(&to->list.line[1], 3951 &from->list.line[1], 3952 sizeof(struct linepos) * (from->in_use - 1)); 3953 } 3954 } 3955 3956 /* 3957 * Like copy_sub() but only do the end of the main match if \ze is present. 3958 */ 3959 static void 3960 copy_ze_off(to, from) 3961 regsub_T *to; 3962 regsub_T *from; 3963 { 3964 if (nfa_has_zend) 3965 { 3966 if (REG_MULTI) 3967 { 3968 if (from->list.multi[0].end_lnum >= 0) 3969 { 3970 to->list.multi[0].end_lnum = from->list.multi[0].end_lnum; 3971 to->list.multi[0].end_col = from->list.multi[0].end_col; 3972 } 3973 } 3974 else 3975 { 3976 if (from->list.line[0].end != NULL) 3977 to->list.line[0].end = from->list.line[0].end; 3978 } 3979 } 3980 } 3981 3982 /* 3983 * Return TRUE if "sub1" and "sub2" have the same start positions. 3984 * When using back-references also check the end position. 3985 */ 3986 static int 3987 sub_equal(sub1, sub2) 3988 regsub_T *sub1; 3989 regsub_T *sub2; 3990 { 3991 int i; 3992 int todo; 3993 linenr_T s1; 3994 linenr_T s2; 3995 char_u *sp1; 3996 char_u *sp2; 3997 3998 todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use; 3999 if (REG_MULTI) 4000 { 4001 for (i = 0; i < todo; ++i) 4002 { 4003 if (i < sub1->in_use) 4004 s1 = sub1->list.multi[i].start_lnum; 4005 else 4006 s1 = -1; 4007 if (i < sub2->in_use) 4008 s2 = sub2->list.multi[i].start_lnum; 4009 else 4010 s2 = -1; 4011 if (s1 != s2) 4012 return FALSE; 4013 if (s1 != -1 && sub1->list.multi[i].start_col 4014 != sub2->list.multi[i].start_col) 4015 return FALSE; 4016 4017 if (nfa_has_backref) 4018 { 4019 if (i < sub1->in_use) 4020 s1 = sub1->list.multi[i].end_lnum; 4021 else 4022 s1 = -1; 4023 if (i < sub2->in_use) 4024 s2 = sub2->list.multi[i].end_lnum; 4025 else 4026 s2 = -1; 4027 if (s1 != s2) 4028 return FALSE; 4029 if (s1 != -1 && sub1->list.multi[i].end_col 4030 != sub2->list.multi[i].end_col) 4031 return FALSE; 4032 } 4033 } 4034 } 4035 else 4036 { 4037 for (i = 0; i < todo; ++i) 4038 { 4039 if (i < sub1->in_use) 4040 sp1 = sub1->list.line[i].start; 4041 else 4042 sp1 = NULL; 4043 if (i < sub2->in_use) 4044 sp2 = sub2->list.line[i].start; 4045 else 4046 sp2 = NULL; 4047 if (sp1 != sp2) 4048 return FALSE; 4049 if (nfa_has_backref) 4050 { 4051 if (i < sub1->in_use) 4052 sp1 = sub1->list.line[i].end; 4053 else 4054 sp1 = NULL; 4055 if (i < sub2->in_use) 4056 sp2 = sub2->list.line[i].end; 4057 else 4058 sp2 = NULL; 4059 if (sp1 != sp2) 4060 return FALSE; 4061 } 4062 } 4063 } 4064 4065 return TRUE; 4066 } 4067 4068 #ifdef ENABLE_LOG 4069 static void 4070 report_state(char *action, 4071 regsub_T *sub, 4072 nfa_state_T *state, 4073 int lid, 4074 nfa_pim_T *pim) 4075 { 4076 int col; 4077 4078 if (sub->in_use <= 0) 4079 col = -1; 4080 else if (REG_MULTI) 4081 col = sub->list.multi[0].start_col; 4082 else 4083 col = (int)(sub->list.line[0].start - regline); 4084 nfa_set_code(state->c); 4085 fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n", 4086 action, abs(state->id), lid, state->c, code, col, 4087 pim_info(pim)); 4088 } 4089 #endif 4090 4091 /* 4092 * Return TRUE if the same state is already in list "l" with the same 4093 * positions as "subs". 4094 */ 4095 static int 4096 has_state_with_pos(l, state, subs, pim) 4097 nfa_list_T *l; /* runtime state list */ 4098 nfa_state_T *state; /* state to update */ 4099 regsubs_T *subs; /* pointers to subexpressions */ 4100 nfa_pim_T *pim; /* postponed match or NULL */ 4101 { 4102 nfa_thread_T *thread; 4103 int i; 4104 4105 for (i = 0; i < l->n; ++i) 4106 { 4107 thread = &l->t[i]; 4108 if (thread->state->id == state->id 4109 && sub_equal(&thread->subs.norm, &subs->norm) 4110 #ifdef FEAT_SYN_HL 4111 && (!nfa_has_zsubexpr 4112 || sub_equal(&thread->subs.synt, &subs->synt)) 4113 #endif 4114 && pim_equal(&thread->pim, pim)) 4115 return TRUE; 4116 } 4117 return FALSE; 4118 } 4119 4120 /* 4121 * Return TRUE if "one" and "two" are equal. That includes when both are not 4122 * set. 4123 */ 4124 static int 4125 pim_equal(one, two) 4126 nfa_pim_T *one; 4127 nfa_pim_T *two; 4128 { 4129 int one_unused = (one == NULL || one->result == NFA_PIM_UNUSED); 4130 int two_unused = (two == NULL || two->result == NFA_PIM_UNUSED); 4131 4132 if (one_unused) 4133 /* one is unused: equal when two is also unused */ 4134 return two_unused; 4135 if (two_unused) 4136 /* one is used and two is not: not equal */ 4137 return FALSE; 4138 /* compare the state id */ 4139 if (one->state->id != two->state->id) 4140 return FALSE; 4141 /* compare the position */ 4142 if (REG_MULTI) 4143 return one->end.pos.lnum == two->end.pos.lnum 4144 && one->end.pos.col == two->end.pos.col; 4145 return one->end.ptr == two->end.ptr; 4146 } 4147 4148 /* 4149 * Return TRUE if "state" leads to a NFA_MATCH without advancing the input. 4150 */ 4151 static int 4152 match_follows(startstate, depth) 4153 nfa_state_T *startstate; 4154 int depth; 4155 { 4156 nfa_state_T *state = startstate; 4157 4158 /* avoid too much recursion */ 4159 if (depth > 10) 4160 return FALSE; 4161 4162 while (state != NULL) 4163 { 4164 switch (state->c) 4165 { 4166 case NFA_MATCH: 4167 case NFA_MCLOSE: 4168 case NFA_END_INVISIBLE: 4169 case NFA_END_INVISIBLE_NEG: 4170 case NFA_END_PATTERN: 4171 return TRUE; 4172 4173 case NFA_SPLIT: 4174 return match_follows(state->out, depth + 1) 4175 || match_follows(state->out1, depth + 1); 4176 4177 case NFA_START_INVISIBLE: 4178 case NFA_START_INVISIBLE_FIRST: 4179 case NFA_START_INVISIBLE_BEFORE: 4180 case NFA_START_INVISIBLE_BEFORE_FIRST: 4181 case NFA_START_INVISIBLE_NEG: 4182 case NFA_START_INVISIBLE_NEG_FIRST: 4183 case NFA_START_INVISIBLE_BEFORE_NEG: 4184 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST: 4185 case NFA_COMPOSING: 4186 /* skip ahead to next state */ 4187 state = state->out1->out; 4188 continue; 4189 4190 case NFA_ANY: 4191 case NFA_ANY_COMPOSING: 4192 case NFA_IDENT: 4193 case NFA_SIDENT: 4194 case NFA_KWORD: 4195 case NFA_SKWORD: 4196 case NFA_FNAME: 4197 case NFA_SFNAME: 4198 case NFA_PRINT: 4199 case NFA_SPRINT: 4200 case NFA_WHITE: 4201 case NFA_NWHITE: 4202 case NFA_DIGIT: 4203 case NFA_NDIGIT: 4204 case NFA_HEX: 4205 case NFA_NHEX: 4206 case NFA_OCTAL: 4207 case NFA_NOCTAL: 4208 case NFA_WORD: 4209 case NFA_NWORD: 4210 case NFA_HEAD: 4211 case NFA_NHEAD: 4212 case NFA_ALPHA: 4213 case NFA_NALPHA: 4214 case NFA_LOWER: 4215 case NFA_NLOWER: 4216 case NFA_UPPER: 4217 case NFA_NUPPER: 4218 case NFA_LOWER_IC: 4219 case NFA_NLOWER_IC: 4220 case NFA_UPPER_IC: 4221 case NFA_NUPPER_IC: 4222 case NFA_START_COLL: 4223 case NFA_START_NEG_COLL: 4224 case NFA_NEWL: 4225 /* state will advance input */ 4226 return FALSE; 4227 4228 default: 4229 if (state->c > 0) 4230 /* state will advance input */ 4231 return FALSE; 4232 4233 /* Others: zero-width or possibly zero-width, might still find 4234 * a match at the same position, keep looking. */ 4235 break; 4236 } 4237 state = state->out; 4238 } 4239 return FALSE; 4240 } 4241 4242 4243 /* 4244 * Return TRUE if "state" is already in list "l". 4245 */ 4246 static int 4247 state_in_list(l, state, subs) 4248 nfa_list_T *l; /* runtime state list */ 4249 nfa_state_T *state; /* state to update */ 4250 regsubs_T *subs; /* pointers to subexpressions */ 4251 { 4252 if (state->lastlist[nfa_ll_index] == l->id) 4253 { 4254 if (!nfa_has_backref || has_state_with_pos(l, state, subs, NULL)) 4255 return TRUE; 4256 } 4257 return FALSE; 4258 } 4259 4260 /* 4261 * Add "state" and possibly what follows to state list ".". 4262 * Returns "subs_arg", possibly copied into temp_subs. 4263 */ 4264 static regsubs_T * 4265 addstate(l, state, subs_arg, pim, off) 4266 nfa_list_T *l; /* runtime state list */ 4267 nfa_state_T *state; /* state to update */ 4268 regsubs_T *subs_arg; /* pointers to subexpressions */ 4269 nfa_pim_T *pim; /* postponed look-behind match */ 4270 int off; /* byte offset, when -1 go to next line */ 4271 { 4272 int subidx; 4273 nfa_thread_T *thread; 4274 lpos_T save_lpos; 4275 int save_in_use; 4276 char_u *save_ptr; 4277 int i; 4278 regsub_T *sub; 4279 regsubs_T *subs = subs_arg; 4280 static regsubs_T temp_subs; 4281 #ifdef ENABLE_LOG 4282 int did_print = FALSE; 4283 #endif 4284 4285 switch (state->c) 4286 { 4287 case NFA_NCLOSE: 4288 case NFA_MCLOSE: 4289 case NFA_MCLOSE1: 4290 case NFA_MCLOSE2: 4291 case NFA_MCLOSE3: 4292 case NFA_MCLOSE4: 4293 case NFA_MCLOSE5: 4294 case NFA_MCLOSE6: 4295 case NFA_MCLOSE7: 4296 case NFA_MCLOSE8: 4297 case NFA_MCLOSE9: 4298 #ifdef FEAT_SYN_HL 4299 case NFA_ZCLOSE: 4300 case NFA_ZCLOSE1: 4301 case NFA_ZCLOSE2: 4302 case NFA_ZCLOSE3: 4303 case NFA_ZCLOSE4: 4304 case NFA_ZCLOSE5: 4305 case NFA_ZCLOSE6: 4306 case NFA_ZCLOSE7: 4307 case NFA_ZCLOSE8: 4308 case NFA_ZCLOSE9: 4309 #endif 4310 case NFA_MOPEN: 4311 case NFA_ZEND: 4312 case NFA_SPLIT: 4313 case NFA_EMPTY: 4314 /* These nodes are not added themselves but their "out" and/or 4315 * "out1" may be added below. */ 4316 break; 4317 4318 case NFA_BOL: 4319 case NFA_BOF: 4320 /* "^" won't match past end-of-line, don't bother trying. 4321 * Except when at the end of the line, or when we are going to the 4322 * next line for a look-behind match. */ 4323 if (reginput > regline 4324 && *reginput != NUL 4325 && (nfa_endp == NULL 4326 || !REG_MULTI 4327 || reglnum == nfa_endp->se_u.pos.lnum)) 4328 goto skip_add; 4329 /* FALLTHROUGH */ 4330 4331 case NFA_MOPEN1: 4332 case NFA_MOPEN2: 4333 case NFA_MOPEN3: 4334 case NFA_MOPEN4: 4335 case NFA_MOPEN5: 4336 case NFA_MOPEN6: 4337 case NFA_MOPEN7: 4338 case NFA_MOPEN8: 4339 case NFA_MOPEN9: 4340 #ifdef FEAT_SYN_HL 4341 case NFA_ZOPEN: 4342 case NFA_ZOPEN1: 4343 case NFA_ZOPEN2: 4344 case NFA_ZOPEN3: 4345 case NFA_ZOPEN4: 4346 case NFA_ZOPEN5: 4347 case NFA_ZOPEN6: 4348 case NFA_ZOPEN7: 4349 case NFA_ZOPEN8: 4350 case NFA_ZOPEN9: 4351 #endif 4352 case NFA_NOPEN: 4353 case NFA_ZSTART: 4354 /* These nodes need to be added so that we can bail out when it 4355 * was added to this list before at the same position to avoid an 4356 * endless loop for "\(\)*" */ 4357 4358 default: 4359 if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP) 4360 { 4361 /* This state is already in the list, don't add it again, 4362 * unless it is an MOPEN that is used for a backreference or 4363 * when there is a PIM. For NFA_MATCH check the position, 4364 * lower position is preferred. */ 4365 if (!nfa_has_backref && pim == NULL && !l->has_pim 4366 && state->c != NFA_MATCH) 4367 { 4368 skip_add: 4369 #ifdef ENABLE_LOG 4370 nfa_set_code(state->c); 4371 fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s\n", 4372 abs(state->id), l->id, state->c, code); 4373 #endif 4374 return subs; 4375 } 4376 4377 /* Do not add the state again when it exists with the same 4378 * positions. */ 4379 if (has_state_with_pos(l, state, subs, pim)) 4380 goto skip_add; 4381 } 4382 4383 /* When there are backreferences or PIMs the number of states may 4384 * be (a lot) bigger than anticipated. */ 4385 if (l->n == l->len) 4386 { 4387 int newlen = l->len * 3 / 2 + 50; 4388 4389 if (subs != &temp_subs) 4390 { 4391 /* "subs" may point into the current array, need to make a 4392 * copy before it becomes invalid. */ 4393 copy_sub(&temp_subs.norm, &subs->norm); 4394 #ifdef FEAT_SYN_HL 4395 if (nfa_has_zsubexpr) 4396 copy_sub(&temp_subs.synt, &subs->synt); 4397 #endif 4398 subs = &temp_subs; 4399 } 4400 4401 /* TODO: check for vim_realloc() returning NULL. */ 4402 l->t = vim_realloc(l->t, newlen * sizeof(nfa_thread_T)); 4403 l->len = newlen; 4404 } 4405 4406 /* add the state to the list */ 4407 state->lastlist[nfa_ll_index] = l->id; 4408 thread = &l->t[l->n++]; 4409 thread->state = state; 4410 if (pim == NULL) 4411 thread->pim.result = NFA_PIM_UNUSED; 4412 else 4413 { 4414 copy_pim(&thread->pim, pim); 4415 l->has_pim = TRUE; 4416 } 4417 copy_sub(&thread->subs.norm, &subs->norm); 4418 #ifdef FEAT_SYN_HL 4419 if (nfa_has_zsubexpr) 4420 copy_sub(&thread->subs.synt, &subs->synt); 4421 #endif 4422 #ifdef ENABLE_LOG 4423 report_state("Adding", &thread->subs.norm, state, l->id, pim); 4424 did_print = TRUE; 4425 #endif 4426 } 4427 4428 #ifdef ENABLE_LOG 4429 if (!did_print) 4430 report_state("Processing", &subs->norm, state, l->id, pim); 4431 #endif 4432 switch (state->c) 4433 { 4434 case NFA_MATCH: 4435 break; 4436 4437 case NFA_SPLIT: 4438 /* order matters here */ 4439 subs = addstate(l, state->out, subs, pim, off); 4440 subs = addstate(l, state->out1, subs, pim, off); 4441 break; 4442 4443 case NFA_EMPTY: 4444 case NFA_NOPEN: 4445 case NFA_NCLOSE: 4446 subs = addstate(l, state->out, subs, pim, off); 4447 break; 4448 4449 case NFA_MOPEN: 4450 case NFA_MOPEN1: 4451 case NFA_MOPEN2: 4452 case NFA_MOPEN3: 4453 case NFA_MOPEN4: 4454 case NFA_MOPEN5: 4455 case NFA_MOPEN6: 4456 case NFA_MOPEN7: 4457 case NFA_MOPEN8: 4458 case NFA_MOPEN9: 4459 #ifdef FEAT_SYN_HL 4460 case NFA_ZOPEN: 4461 case NFA_ZOPEN1: 4462 case NFA_ZOPEN2: 4463 case NFA_ZOPEN3: 4464 case NFA_ZOPEN4: 4465 case NFA_ZOPEN5: 4466 case NFA_ZOPEN6: 4467 case NFA_ZOPEN7: 4468 case NFA_ZOPEN8: 4469 case NFA_ZOPEN9: 4470 #endif 4471 case NFA_ZSTART: 4472 if (state->c == NFA_ZSTART) 4473 { 4474 subidx = 0; 4475 sub = &subs->norm; 4476 } 4477 #ifdef FEAT_SYN_HL 4478 else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9) 4479 { 4480 subidx = state->c - NFA_ZOPEN; 4481 sub = &subs->synt; 4482 } 4483 #endif 4484 else 4485 { 4486 subidx = state->c - NFA_MOPEN; 4487 sub = &subs->norm; 4488 } 4489 4490 /* avoid compiler warnings */ 4491 save_ptr = NULL; 4492 save_lpos.lnum = 0; 4493 save_lpos.col = 0; 4494 4495 /* Set the position (with "off" added) in the subexpression. Save 4496 * and restore it when it was in use. Otherwise fill any gap. */ 4497 if (REG_MULTI) 4498 { 4499 if (subidx < sub->in_use) 4500 { 4501 save_lpos.lnum = sub->list.multi[subidx].start_lnum; 4502 save_lpos.col = sub->list.multi[subidx].start_col; 4503 save_in_use = -1; 4504 } 4505 else 4506 { 4507 save_in_use = sub->in_use; 4508 for (i = sub->in_use; i < subidx; ++i) 4509 { 4510 sub->list.multi[i].start_lnum = -1; 4511 sub->list.multi[i].end_lnum = -1; 4512 } 4513 sub->in_use = subidx + 1; 4514 } 4515 if (off == -1) 4516 { 4517 sub->list.multi[subidx].start_lnum = reglnum + 1; 4518 sub->list.multi[subidx].start_col = 0; 4519 } 4520 else 4521 { 4522 sub->list.multi[subidx].start_lnum = reglnum; 4523 sub->list.multi[subidx].start_col = 4524 (colnr_T)(reginput - regline + off); 4525 } 4526 sub->list.multi[subidx].end_lnum = -1; 4527 } 4528 else 4529 { 4530 if (subidx < sub->in_use) 4531 { 4532 save_ptr = sub->list.line[subidx].start; 4533 save_in_use = -1; 4534 } 4535 else 4536 { 4537 save_in_use = sub->in_use; 4538 for (i = sub->in_use; i < subidx; ++i) 4539 { 4540 sub->list.line[i].start = NULL; 4541 sub->list.line[i].end = NULL; 4542 } 4543 sub->in_use = subidx + 1; 4544 } 4545 sub->list.line[subidx].start = reginput + off; 4546 } 4547 4548 subs = addstate(l, state->out, subs, pim, off); 4549 /* "subs" may have changed, need to set "sub" again */ 4550 #ifdef FEAT_SYN_HL 4551 if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9) 4552 sub = &subs->synt; 4553 else 4554 #endif 4555 sub = &subs->norm; 4556 4557 if (save_in_use == -1) 4558 { 4559 if (REG_MULTI) 4560 { 4561 sub->list.multi[subidx].start_lnum = save_lpos.lnum; 4562 sub->list.multi[subidx].start_col = save_lpos.col; 4563 } 4564 else 4565 sub->list.line[subidx].start = save_ptr; 4566 } 4567 else 4568 sub->in_use = save_in_use; 4569 break; 4570 4571 case NFA_MCLOSE: 4572 if (nfa_has_zend && (REG_MULTI 4573 ? subs->norm.list.multi[0].end_lnum >= 0 4574 : subs->norm.list.line[0].end != NULL)) 4575 { 4576 /* Do not overwrite the position set by \ze. */ 4577 subs = addstate(l, state->out, subs, pim, off); 4578 break; 4579 } 4580 case NFA_MCLOSE1: 4581 case NFA_MCLOSE2: 4582 case NFA_MCLOSE3: 4583 case NFA_MCLOSE4: 4584 case NFA_MCLOSE5: 4585 case NFA_MCLOSE6: 4586 case NFA_MCLOSE7: 4587 case NFA_MCLOSE8: 4588 case NFA_MCLOSE9: 4589 #ifdef FEAT_SYN_HL 4590 case NFA_ZCLOSE: 4591 case NFA_ZCLOSE1: 4592 case NFA_ZCLOSE2: 4593 case NFA_ZCLOSE3: 4594 case NFA_ZCLOSE4: 4595 case NFA_ZCLOSE5: 4596 case NFA_ZCLOSE6: 4597 case NFA_ZCLOSE7: 4598 case NFA_ZCLOSE8: 4599 case NFA_ZCLOSE9: 4600 #endif 4601 case NFA_ZEND: 4602 if (state->c == NFA_ZEND) 4603 { 4604 subidx = 0; 4605 sub = &subs->norm; 4606 } 4607 #ifdef FEAT_SYN_HL 4608 else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9) 4609 { 4610 subidx = state->c - NFA_ZCLOSE; 4611 sub = &subs->synt; 4612 } 4613 #endif 4614 else 4615 { 4616 subidx = state->c - NFA_MCLOSE; 4617 sub = &subs->norm; 4618 } 4619 4620 /* We don't fill in gaps here, there must have been an MOPEN that 4621 * has done that. */ 4622 save_in_use = sub->in_use; 4623 if (sub->in_use <= subidx) 4624 sub->in_use = subidx + 1; 4625 if (REG_MULTI) 4626 { 4627 save_lpos.lnum = sub->list.multi[subidx].end_lnum; 4628 save_lpos.col = sub->list.multi[subidx].end_col; 4629 if (off == -1) 4630 { 4631 sub->list.multi[subidx].end_lnum = reglnum + 1; 4632 sub->list.multi[subidx].end_col = 0; 4633 } 4634 else 4635 { 4636 sub->list.multi[subidx].end_lnum = reglnum; 4637 sub->list.multi[subidx].end_col = 4638 (colnr_T)(reginput - regline + off); 4639 } 4640 /* avoid compiler warnings */ 4641 save_ptr = NULL; 4642 } 4643 else 4644 { 4645 save_ptr = sub->list.line[subidx].end; 4646 sub->list.line[subidx].end = reginput + off; 4647 /* avoid compiler warnings */ 4648 save_lpos.lnum = 0; 4649 save_lpos.col = 0; 4650 } 4651 4652 subs = addstate(l, state->out, subs, pim, off); 4653 /* "subs" may have changed, need to set "sub" again */ 4654 #ifdef FEAT_SYN_HL 4655 if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9) 4656 sub = &subs->synt; 4657 else 4658 #endif 4659 sub = &subs->norm; 4660 4661 if (REG_MULTI) 4662 { 4663 sub->list.multi[subidx].end_lnum = save_lpos.lnum; 4664 sub->list.multi[subidx].end_col = save_lpos.col; 4665 } 4666 else 4667 sub->list.line[subidx].end = save_ptr; 4668 sub->in_use = save_in_use; 4669 break; 4670 } 4671 return subs; 4672 } 4673 4674 /* 4675 * Like addstate(), but the new state(s) are put at position "*ip". 4676 * Used for zero-width matches, next state to use is the added one. 4677 * This makes sure the order of states to be tried does not change, which 4678 * matters for alternatives. 4679 */ 4680 static void 4681 addstate_here(l, state, subs, pim, ip) 4682 nfa_list_T *l; /* runtime state list */ 4683 nfa_state_T *state; /* state to update */ 4684 regsubs_T *subs; /* pointers to subexpressions */ 4685 nfa_pim_T *pim; /* postponed look-behind match */ 4686 int *ip; 4687 { 4688 int tlen = l->n; 4689 int count; 4690 int listidx = *ip; 4691 4692 /* first add the state(s) at the end, so that we know how many there are */ 4693 addstate(l, state, subs, pim, 0); 4694 4695 /* when "*ip" was at the end of the list, nothing to do */ 4696 if (listidx + 1 == tlen) 4697 return; 4698 4699 /* re-order to put the new state at the current position */ 4700 count = l->n - tlen; 4701 if (count == 0) 4702 return; /* no state got added */ 4703 if (count == 1) 4704 { 4705 /* overwrite the current state */ 4706 l->t[listidx] = l->t[l->n - 1]; 4707 } 4708 else if (count > 1) 4709 { 4710 if (l->n + count - 1 >= l->len) 4711 { 4712 /* not enough space to move the new states, reallocate the list 4713 * and move the states to the right position */ 4714 nfa_thread_T *newl; 4715 4716 l->len = l->len * 3 / 2 + 50; 4717 newl = (nfa_thread_T *)alloc(l->len * sizeof(nfa_thread_T)); 4718 if (newl == NULL) 4719 return; 4720 mch_memmove(&(newl[0]), 4721 &(l->t[0]), 4722 sizeof(nfa_thread_T) * listidx); 4723 mch_memmove(&(newl[listidx]), 4724 &(l->t[l->n - count]), 4725 sizeof(nfa_thread_T) * count); 4726 mch_memmove(&(newl[listidx + count]), 4727 &(l->t[listidx + 1]), 4728 sizeof(nfa_thread_T) * (l->n - count - listidx - 1)); 4729 vim_free(l->t); 4730 l->t = newl; 4731 } 4732 else 4733 { 4734 /* make space for new states, then move them from the 4735 * end to the current position */ 4736 mch_memmove(&(l->t[listidx + count]), 4737 &(l->t[listidx + 1]), 4738 sizeof(nfa_thread_T) * (l->n - listidx - 1)); 4739 mch_memmove(&(l->t[listidx]), 4740 &(l->t[l->n - 1]), 4741 sizeof(nfa_thread_T) * count); 4742 } 4743 } 4744 --l->n; 4745 *ip = listidx - 1; 4746 } 4747 4748 /* 4749 * Check character class "class" against current character c. 4750 */ 4751 static int 4752 check_char_class(class, c) 4753 int class; 4754 int c; 4755 { 4756 switch (class) 4757 { 4758 case NFA_CLASS_ALNUM: 4759 if (c >= 1 && c <= 255 && isalnum(c)) 4760 return OK; 4761 break; 4762 case NFA_CLASS_ALPHA: 4763 if (c >= 1 && c <= 255 && isalpha(c)) 4764 return OK; 4765 break; 4766 case NFA_CLASS_BLANK: 4767 if (c == ' ' || c == '\t') 4768 return OK; 4769 break; 4770 case NFA_CLASS_CNTRL: 4771 if (c >= 1 && c <= 255 && iscntrl(c)) 4772 return OK; 4773 break; 4774 case NFA_CLASS_DIGIT: 4775 if (VIM_ISDIGIT(c)) 4776 return OK; 4777 break; 4778 case NFA_CLASS_GRAPH: 4779 if (c >= 1 && c <= 255 && isgraph(c)) 4780 return OK; 4781 break; 4782 case NFA_CLASS_LOWER: 4783 if (MB_ISLOWER(c)) 4784 return OK; 4785 break; 4786 case NFA_CLASS_PRINT: 4787 if (vim_isprintc(c)) 4788 return OK; 4789 break; 4790 case NFA_CLASS_PUNCT: 4791 if (c >= 1 && c <= 255 && ispunct(c)) 4792 return OK; 4793 break; 4794 case NFA_CLASS_SPACE: 4795 if ((c >= 9 && c <= 13) || (c == ' ')) 4796 return OK; 4797 break; 4798 case NFA_CLASS_UPPER: 4799 if (MB_ISUPPER(c)) 4800 return OK; 4801 break; 4802 case NFA_CLASS_XDIGIT: 4803 if (vim_isxdigit(c)) 4804 return OK; 4805 break; 4806 case NFA_CLASS_TAB: 4807 if (c == '\t') 4808 return OK; 4809 break; 4810 case NFA_CLASS_RETURN: 4811 if (c == '\r') 4812 return OK; 4813 break; 4814 case NFA_CLASS_BACKSPACE: 4815 if (c == '\b') 4816 return OK; 4817 break; 4818 case NFA_CLASS_ESCAPE: 4819 if (c == '\033') 4820 return OK; 4821 break; 4822 4823 default: 4824 /* should not be here :P */ 4825 EMSGN(_(e_ill_char_class), class); 4826 return FAIL; 4827 } 4828 return FAIL; 4829 } 4830 4831 /* 4832 * Check for a match with subexpression "subidx". 4833 * Return TRUE if it matches. 4834 */ 4835 static int 4836 match_backref(sub, subidx, bytelen) 4837 regsub_T *sub; /* pointers to subexpressions */ 4838 int subidx; 4839 int *bytelen; /* out: length of match in bytes */ 4840 { 4841 int len; 4842 4843 if (sub->in_use <= subidx) 4844 { 4845 retempty: 4846 /* backref was not set, match an empty string */ 4847 *bytelen = 0; 4848 return TRUE; 4849 } 4850 4851 if (REG_MULTI) 4852 { 4853 if (sub->list.multi[subidx].start_lnum < 0 4854 || sub->list.multi[subidx].end_lnum < 0) 4855 goto retempty; 4856 if (sub->list.multi[subidx].start_lnum == reglnum 4857 && sub->list.multi[subidx].end_lnum == reglnum) 4858 { 4859 len = sub->list.multi[subidx].end_col 4860 - sub->list.multi[subidx].start_col; 4861 if (cstrncmp(regline + sub->list.multi[subidx].start_col, 4862 reginput, &len) == 0) 4863 { 4864 *bytelen = len; 4865 return TRUE; 4866 } 4867 } 4868 else 4869 { 4870 if (match_with_backref( 4871 sub->list.multi[subidx].start_lnum, 4872 sub->list.multi[subidx].start_col, 4873 sub->list.multi[subidx].end_lnum, 4874 sub->list.multi[subidx].end_col, 4875 bytelen) == RA_MATCH) 4876 return TRUE; 4877 } 4878 } 4879 else 4880 { 4881 if (sub->list.line[subidx].start == NULL 4882 || sub->list.line[subidx].end == NULL) 4883 goto retempty; 4884 len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start); 4885 if (cstrncmp(sub->list.line[subidx].start, reginput, &len) == 0) 4886 { 4887 *bytelen = len; 4888 return TRUE; 4889 } 4890 } 4891 return FALSE; 4892 } 4893 4894 #ifdef FEAT_SYN_HL 4895 4896 static int match_zref __ARGS((int subidx, int *bytelen)); 4897 4898 /* 4899 * Check for a match with \z subexpression "subidx". 4900 * Return TRUE if it matches. 4901 */ 4902 static int 4903 match_zref(subidx, bytelen) 4904 int subidx; 4905 int *bytelen; /* out: length of match in bytes */ 4906 { 4907 int len; 4908 4909 cleanup_zsubexpr(); 4910 if (re_extmatch_in == NULL || re_extmatch_in->matches[subidx] == NULL) 4911 { 4912 /* backref was not set, match an empty string */ 4913 *bytelen = 0; 4914 return TRUE; 4915 } 4916 4917 len = (int)STRLEN(re_extmatch_in->matches[subidx]); 4918 if (cstrncmp(re_extmatch_in->matches[subidx], reginput, &len) == 0) 4919 { 4920 *bytelen = len; 4921 return TRUE; 4922 } 4923 return FALSE; 4924 } 4925 #endif 4926 4927 /* 4928 * Save list IDs for all NFA states of "prog" into "list". 4929 * Also reset the IDs to zero. 4930 * Only used for the recursive value lastlist[1]. 4931 */ 4932 static void 4933 nfa_save_listids(prog, list) 4934 nfa_regprog_T *prog; 4935 int *list; 4936 { 4937 int i; 4938 nfa_state_T *p; 4939 4940 /* Order in the list is reverse, it's a bit faster that way. */ 4941 p = &prog->state[0]; 4942 for (i = prog->nstate; --i >= 0; ) 4943 { 4944 list[i] = p->lastlist[1]; 4945 p->lastlist[1] = 0; 4946 ++p; 4947 } 4948 } 4949 4950 /* 4951 * Restore list IDs from "list" to all NFA states. 4952 */ 4953 static void 4954 nfa_restore_listids(prog, list) 4955 nfa_regprog_T *prog; 4956 int *list; 4957 { 4958 int i; 4959 nfa_state_T *p; 4960 4961 p = &prog->state[0]; 4962 for (i = prog->nstate; --i >= 0; ) 4963 { 4964 p->lastlist[1] = list[i]; 4965 ++p; 4966 } 4967 } 4968 4969 static int 4970 nfa_re_num_cmp(val, op, pos) 4971 long_u val; 4972 int op; 4973 long_u pos; 4974 { 4975 if (op == 1) return pos > val; 4976 if (op == 2) return pos < val; 4977 return val == pos; 4978 } 4979 4980 static int recursive_regmatch __ARGS((nfa_state_T *state, nfa_pim_T *pim, nfa_regprog_T *prog, regsubs_T *submatch, regsubs_T *m, int **listids)); 4981 static int nfa_regmatch __ARGS((nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *submatch, regsubs_T *m)); 4982 4983 /* 4984 * Recursively call nfa_regmatch() 4985 * "pim" is NULL or contains info about a Postponed Invisible Match (start 4986 * position). 4987 */ 4988 static int 4989 recursive_regmatch(state, pim, prog, submatch, m, listids) 4990 nfa_state_T *state; 4991 nfa_pim_T *pim; 4992 nfa_regprog_T *prog; 4993 regsubs_T *submatch; 4994 regsubs_T *m; 4995 int **listids; 4996 { 4997 int save_reginput_col = (int)(reginput - regline); 4998 int save_reglnum = reglnum; 4999 int save_nfa_match = nfa_match; 5000 int save_nfa_listid = nfa_listid; 5001 save_se_T *save_nfa_endp = nfa_endp; 5002 save_se_T endpos; 5003 save_se_T *endposp = NULL; 5004 int result; 5005 int need_restore = FALSE; 5006 5007 if (pim != NULL) 5008 { 5009 /* start at the position where the postponed match was */ 5010 if (REG_MULTI) 5011 reginput = regline + pim->end.pos.col; 5012 else 5013 reginput = pim->end.ptr; 5014 } 5015 5016 if (state->c == NFA_START_INVISIBLE_BEFORE 5017 || state->c == NFA_START_INVISIBLE_BEFORE_FIRST 5018 || state->c == NFA_START_INVISIBLE_BEFORE_NEG 5019 || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST) 5020 { 5021 /* The recursive match must end at the current position. When "pim" is 5022 * not NULL it specifies the current position. */ 5023 endposp = &endpos; 5024 if (REG_MULTI) 5025 { 5026 if (pim == NULL) 5027 { 5028 endpos.se_u.pos.col = (int)(reginput - regline); 5029 endpos.se_u.pos.lnum = reglnum; 5030 } 5031 else 5032 endpos.se_u.pos = pim->end.pos; 5033 } 5034 else 5035 { 5036 if (pim == NULL) 5037 endpos.se_u.ptr = reginput; 5038 else 5039 endpos.se_u.ptr = pim->end.ptr; 5040 } 5041 5042 /* Go back the specified number of bytes, or as far as the 5043 * start of the previous line, to try matching "\@<=" or 5044 * not matching "\@<!". This is very inefficient, limit the number of 5045 * bytes if possible. */ 5046 if (state->val <= 0) 5047 { 5048 if (REG_MULTI) 5049 { 5050 regline = reg_getline(--reglnum); 5051 if (regline == NULL) 5052 /* can't go before the first line */ 5053 regline = reg_getline(++reglnum); 5054 } 5055 reginput = regline; 5056 } 5057 else 5058 { 5059 if (REG_MULTI && (int)(reginput - regline) < state->val) 5060 { 5061 /* Not enough bytes in this line, go to end of 5062 * previous line. */ 5063 regline = reg_getline(--reglnum); 5064 if (regline == NULL) 5065 { 5066 /* can't go before the first line */ 5067 regline = reg_getline(++reglnum); 5068 reginput = regline; 5069 } 5070 else 5071 reginput = regline + STRLEN(regline); 5072 } 5073 if ((int)(reginput - regline) >= state->val) 5074 { 5075 reginput -= state->val; 5076 #ifdef FEAT_MBYTE 5077 if (has_mbyte) 5078 reginput -= mb_head_off(regline, reginput); 5079 #endif 5080 } 5081 else 5082 reginput = regline; 5083 } 5084 } 5085 5086 #ifdef ENABLE_LOG 5087 if (log_fd != stderr) 5088 fclose(log_fd); 5089 log_fd = NULL; 5090 #endif 5091 /* Have to clear the lastlist field of the NFA nodes, so that 5092 * nfa_regmatch() and addstate() can run properly after recursion. */ 5093 if (nfa_ll_index == 1) 5094 { 5095 /* Already calling nfa_regmatch() recursively. Save the lastlist[1] 5096 * values and clear them. */ 5097 if (*listids == NULL) 5098 { 5099 *listids = (int *)lalloc(sizeof(int) * nstate, TRUE); 5100 if (*listids == NULL) 5101 { 5102 EMSG(_("E878: (NFA) Could not allocate memory for branch traversal!")); 5103 return 0; 5104 } 5105 } 5106 nfa_save_listids(prog, *listids); 5107 need_restore = TRUE; 5108 /* any value of nfa_listid will do */ 5109 } 5110 else 5111 { 5112 /* First recursive nfa_regmatch() call, switch to the second lastlist 5113 * entry. Make sure nfa_listid is different from a previous recursive 5114 * call, because some states may still have this ID. */ 5115 ++nfa_ll_index; 5116 if (nfa_listid <= nfa_alt_listid) 5117 nfa_listid = nfa_alt_listid; 5118 } 5119 5120 /* Call nfa_regmatch() to check if the current concat matches at this 5121 * position. The concat ends with the node NFA_END_INVISIBLE */ 5122 nfa_endp = endposp; 5123 result = nfa_regmatch(prog, state->out, submatch, m); 5124 5125 if (need_restore) 5126 nfa_restore_listids(prog, *listids); 5127 else 5128 { 5129 --nfa_ll_index; 5130 nfa_alt_listid = nfa_listid; 5131 } 5132 5133 /* restore position in input text */ 5134 reglnum = save_reglnum; 5135 if (REG_MULTI) 5136 regline = reg_getline(reglnum); 5137 reginput = regline + save_reginput_col; 5138 nfa_match = save_nfa_match; 5139 nfa_endp = save_nfa_endp; 5140 nfa_listid = save_nfa_listid; 5141 5142 #ifdef ENABLE_LOG 5143 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a"); 5144 if (log_fd != NULL) 5145 { 5146 fprintf(log_fd, "****************************\n"); 5147 fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n"); 5148 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE"); 5149 fprintf(log_fd, "****************************\n"); 5150 } 5151 else 5152 { 5153 EMSG(_("Could not open temporary log file for writing, displaying on stderr ... ")); 5154 log_fd = stderr; 5155 } 5156 #endif 5157 5158 return result; 5159 } 5160 5161 static int skip_to_start __ARGS((int c, colnr_T *colp)); 5162 static long find_match_text __ARGS((colnr_T startcol, int regstart, char_u *match_text)); 5163 5164 /* 5165 * Estimate the chance of a match with "state" failing. 5166 * empty match: 0 5167 * NFA_ANY: 1 5168 * specific character: 99 5169 */ 5170 static int 5171 failure_chance(state, depth) 5172 nfa_state_T *state; 5173 int depth; 5174 { 5175 int c = state->c; 5176 int l, r; 5177 5178 /* detect looping */ 5179 if (depth > 4) 5180 return 1; 5181 5182 switch (c) 5183 { 5184 case NFA_SPLIT: 5185 if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT) 5186 /* avoid recursive stuff */ 5187 return 1; 5188 /* two alternatives, use the lowest failure chance */ 5189 l = failure_chance(state->out, depth + 1); 5190 r = failure_chance(state->out1, depth + 1); 5191 return l < r ? l : r; 5192 5193 case NFA_ANY: 5194 /* matches anything, unlikely to fail */ 5195 return 1; 5196 5197 case NFA_MATCH: 5198 case NFA_MCLOSE: 5199 case NFA_ANY_COMPOSING: 5200 /* empty match works always */ 5201 return 0; 5202 5203 case NFA_START_INVISIBLE: 5204 case NFA_START_INVISIBLE_FIRST: 5205 case NFA_START_INVISIBLE_NEG: 5206 case NFA_START_INVISIBLE_NEG_FIRST: 5207 case NFA_START_INVISIBLE_BEFORE: 5208 case NFA_START_INVISIBLE_BEFORE_FIRST: 5209 case NFA_START_INVISIBLE_BEFORE_NEG: 5210 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST: 5211 case NFA_START_PATTERN: 5212 /* recursive regmatch is expensive, use low failure chance */ 5213 return 5; 5214 5215 case NFA_BOL: 5216 case NFA_EOL: 5217 case NFA_BOF: 5218 case NFA_EOF: 5219 case NFA_NEWL: 5220 return 99; 5221 5222 case NFA_BOW: 5223 case NFA_EOW: 5224 return 90; 5225 5226 case NFA_MOPEN: 5227 case NFA_MOPEN1: 5228 case NFA_MOPEN2: 5229 case NFA_MOPEN3: 5230 case NFA_MOPEN4: 5231 case NFA_MOPEN5: 5232 case NFA_MOPEN6: 5233 case NFA_MOPEN7: 5234 case NFA_MOPEN8: 5235 case NFA_MOPEN9: 5236 #ifdef FEAT_SYN_HL 5237 case NFA_ZOPEN: 5238 case NFA_ZOPEN1: 5239 case NFA_ZOPEN2: 5240 case NFA_ZOPEN3: 5241 case NFA_ZOPEN4: 5242 case NFA_ZOPEN5: 5243 case NFA_ZOPEN6: 5244 case NFA_ZOPEN7: 5245 case NFA_ZOPEN8: 5246 case NFA_ZOPEN9: 5247 case NFA_ZCLOSE: 5248 case NFA_ZCLOSE1: 5249 case NFA_ZCLOSE2: 5250 case NFA_ZCLOSE3: 5251 case NFA_ZCLOSE4: 5252 case NFA_ZCLOSE5: 5253 case NFA_ZCLOSE6: 5254 case NFA_ZCLOSE7: 5255 case NFA_ZCLOSE8: 5256 case NFA_ZCLOSE9: 5257 #endif 5258 case NFA_NOPEN: 5259 case NFA_MCLOSE1: 5260 case NFA_MCLOSE2: 5261 case NFA_MCLOSE3: 5262 case NFA_MCLOSE4: 5263 case NFA_MCLOSE5: 5264 case NFA_MCLOSE6: 5265 case NFA_MCLOSE7: 5266 case NFA_MCLOSE8: 5267 case NFA_MCLOSE9: 5268 case NFA_NCLOSE: 5269 return failure_chance(state->out, depth + 1); 5270 5271 case NFA_BACKREF1: 5272 case NFA_BACKREF2: 5273 case NFA_BACKREF3: 5274 case NFA_BACKREF4: 5275 case NFA_BACKREF5: 5276 case NFA_BACKREF6: 5277 case NFA_BACKREF7: 5278 case NFA_BACKREF8: 5279 case NFA_BACKREF9: 5280 #ifdef FEAT_SYN_HL 5281 case NFA_ZREF1: 5282 case NFA_ZREF2: 5283 case NFA_ZREF3: 5284 case NFA_ZREF4: 5285 case NFA_ZREF5: 5286 case NFA_ZREF6: 5287 case NFA_ZREF7: 5288 case NFA_ZREF8: 5289 case NFA_ZREF9: 5290 #endif 5291 /* backreferences don't match in many places */ 5292 return 94; 5293 5294 case NFA_LNUM_GT: 5295 case NFA_LNUM_LT: 5296 case NFA_COL_GT: 5297 case NFA_COL_LT: 5298 case NFA_VCOL_GT: 5299 case NFA_VCOL_LT: 5300 case NFA_MARK_GT: 5301 case NFA_MARK_LT: 5302 case NFA_VISUAL: 5303 /* before/after positions don't match very often */ 5304 return 85; 5305 5306 case NFA_LNUM: 5307 return 90; 5308 5309 case NFA_CURSOR: 5310 case NFA_COL: 5311 case NFA_VCOL: 5312 case NFA_MARK: 5313 /* specific positions rarely match */ 5314 return 98; 5315 5316 case NFA_COMPOSING: 5317 return 95; 5318 5319 default: 5320 if (c > 0) 5321 /* character match fails often */ 5322 return 95; 5323 } 5324 5325 /* something else, includes character classes */ 5326 return 50; 5327 } 5328 5329 /* 5330 * Skip until the char "c" we know a match must start with. 5331 */ 5332 static int 5333 skip_to_start(c, colp) 5334 int c; 5335 colnr_T *colp; 5336 { 5337 char_u *s; 5338 5339 /* Used often, do some work to avoid call overhead. */ 5340 if (!ireg_ic 5341 #ifdef FEAT_MBYTE 5342 && !has_mbyte 5343 #endif 5344 ) 5345 s = vim_strbyte(regline + *colp, c); 5346 else 5347 s = cstrchr(regline + *colp, c); 5348 if (s == NULL) 5349 return FAIL; 5350 *colp = (int)(s - regline); 5351 return OK; 5352 } 5353 5354 /* 5355 * Check for a match with match_text. 5356 * Called after skip_to_start() has found regstart. 5357 * Returns zero for no match, 1 for a match. 5358 */ 5359 static long 5360 find_match_text(startcol, regstart, match_text) 5361 colnr_T startcol; 5362 int regstart; 5363 char_u *match_text; 5364 { 5365 colnr_T col = startcol; 5366 int c1, c2; 5367 int len1, len2; 5368 int match; 5369 5370 for (;;) 5371 { 5372 match = TRUE; 5373 len2 = MB_CHAR2LEN(regstart); /* skip regstart */ 5374 for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1)) 5375 { 5376 c1 = PTR2CHAR(match_text + len1); 5377 c2 = PTR2CHAR(regline + col + len2); 5378 if (c1 != c2 && (!ireg_ic || MB_TOLOWER(c1) != MB_TOLOWER(c2))) 5379 { 5380 match = FALSE; 5381 break; 5382 } 5383 len2 += MB_CHAR2LEN(c2); 5384 } 5385 if (match 5386 #ifdef FEAT_MBYTE 5387 /* check that no composing char follows */ 5388 && !(enc_utf8 5389 && utf_iscomposing(PTR2CHAR(regline + col + len2))) 5390 #endif 5391 ) 5392 { 5393 cleanup_subexpr(); 5394 if (REG_MULTI) 5395 { 5396 reg_startpos[0].lnum = reglnum; 5397 reg_startpos[0].col = col; 5398 reg_endpos[0].lnum = reglnum; 5399 reg_endpos[0].col = col + len2; 5400 } 5401 else 5402 { 5403 reg_startp[0] = regline + col; 5404 reg_endp[0] = regline + col + len2; 5405 } 5406 return 1L; 5407 } 5408 5409 /* Try finding regstart after the current match. */ 5410 col += MB_CHAR2LEN(regstart); /* skip regstart */ 5411 if (skip_to_start(regstart, &col) == FAIL) 5412 break; 5413 } 5414 return 0L; 5415 } 5416 5417 /* 5418 * Main matching routine. 5419 * 5420 * Run NFA to determine whether it matches reginput. 5421 * 5422 * When "nfa_endp" is not NULL it is a required end-of-match position. 5423 * 5424 * Return TRUE if there is a match, FALSE otherwise. 5425 * When there is a match "submatch" contains the positions. 5426 * Note: Caller must ensure that: start != NULL. 5427 */ 5428 static int 5429 nfa_regmatch(prog, start, submatch, m) 5430 nfa_regprog_T *prog; 5431 nfa_state_T *start; 5432 regsubs_T *submatch; 5433 regsubs_T *m; 5434 { 5435 int result; 5436 size_t size = 0; 5437 int flag = 0; 5438 int go_to_nextline = FALSE; 5439 nfa_thread_T *t; 5440 nfa_list_T list[2]; 5441 int listidx; 5442 nfa_list_T *thislist; 5443 nfa_list_T *nextlist; 5444 int *listids = NULL; 5445 nfa_state_T *add_state; 5446 int add_here; 5447 int add_count; 5448 int add_off = 0; 5449 int toplevel = start->c == NFA_MOPEN; 5450 #ifdef NFA_REGEXP_DEBUG_LOG 5451 FILE *debug = fopen(NFA_REGEXP_DEBUG_LOG, "a"); 5452 5453 if (debug == NULL) 5454 { 5455 EMSG2(_("(NFA) COULD NOT OPEN %s !"), NFA_REGEXP_DEBUG_LOG); 5456 return FALSE; 5457 } 5458 #endif 5459 /* Some patterns may take a long time to match, especially when using 5460 * recursive_regmatch(). Allow interrupting them with CTRL-C. */ 5461 fast_breakcheck(); 5462 if (got_int) 5463 return FALSE; 5464 #ifdef FEAT_RELTIME 5465 if (nfa_time_limit != NULL && profile_passed_limit(nfa_time_limit)) 5466 return FALSE; 5467 #endif 5468 5469 nfa_match = FALSE; 5470 5471 /* Allocate memory for the lists of nodes. */ 5472 size = (nstate + 1) * sizeof(nfa_thread_T); 5473 5474 list[0].t = (nfa_thread_T *)lalloc(size, TRUE); 5475 list[0].len = nstate + 1; 5476 list[1].t = (nfa_thread_T *)lalloc(size, TRUE); 5477 list[1].len = nstate + 1; 5478 if (list[0].t == NULL || list[1].t == NULL) 5479 goto theend; 5480 5481 #ifdef ENABLE_LOG 5482 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a"); 5483 if (log_fd != NULL) 5484 { 5485 fprintf(log_fd, "**********************************\n"); 5486 nfa_set_code(start->c); 5487 fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n", 5488 abs(start->id), code); 5489 fprintf(log_fd, "**********************************\n"); 5490 } 5491 else 5492 { 5493 EMSG(_("Could not open temporary log file for writing, displaying on stderr ... ")); 5494 log_fd = stderr; 5495 } 5496 #endif 5497 5498 thislist = &list[0]; 5499 thislist->n = 0; 5500 thislist->has_pim = FALSE; 5501 nextlist = &list[1]; 5502 nextlist->n = 0; 5503 nextlist->has_pim = FALSE; 5504 #ifdef ENABLE_LOG 5505 fprintf(log_fd, "(---) STARTSTATE first\n"); 5506 #endif 5507 thislist->id = nfa_listid + 1; 5508 5509 /* Inline optimized code for addstate(thislist, start, m, 0) if we know 5510 * it's the first MOPEN. */ 5511 if (toplevel) 5512 { 5513 if (REG_MULTI) 5514 { 5515 m->norm.list.multi[0].start_lnum = reglnum; 5516 m->norm.list.multi[0].start_col = (colnr_T)(reginput - regline); 5517 } 5518 else 5519 m->norm.list.line[0].start = reginput; 5520 m->norm.in_use = 1; 5521 addstate(thislist, start->out, m, NULL, 0); 5522 } 5523 else 5524 addstate(thislist, start, m, NULL, 0); 5525 5526 #define ADD_STATE_IF_MATCH(state) \ 5527 if (result) { \ 5528 add_state = state->out; \ 5529 add_off = clen; \ 5530 } 5531 5532 /* 5533 * Run for each character. 5534 */ 5535 for (;;) 5536 { 5537 int curc; 5538 int clen; 5539 5540 #ifdef FEAT_MBYTE 5541 if (has_mbyte) 5542 { 5543 curc = (*mb_ptr2char)(reginput); 5544 clen = (*mb_ptr2len)(reginput); 5545 } 5546 else 5547 #endif 5548 { 5549 curc = *reginput; 5550 clen = 1; 5551 } 5552 if (curc == NUL) 5553 { 5554 clen = 0; 5555 go_to_nextline = FALSE; 5556 } 5557 5558 /* swap lists */ 5559 thislist = &list[flag]; 5560 nextlist = &list[flag ^= 1]; 5561 nextlist->n = 0; /* clear nextlist */ 5562 nextlist->has_pim = FALSE; 5563 ++nfa_listid; 5564 if (prog->re_engine == AUTOMATIC_ENGINE && nfa_listid >= NFA_MAX_STATES) 5565 { 5566 /* too many states, retry with old engine */ 5567 nfa_match = NFA_TOO_EXPENSIVE; 5568 goto theend; 5569 } 5570 5571 thislist->id = nfa_listid; 5572 nextlist->id = nfa_listid + 1; 5573 5574 #ifdef ENABLE_LOG 5575 fprintf(log_fd, "------------------------------------------\n"); 5576 fprintf(log_fd, ">>> Reginput is \"%s\"\n", reginput); 5577 fprintf(log_fd, ">>> Advanced one character ... Current char is %c (code %d) \n", curc, (int)curc); 5578 fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n); 5579 { 5580 int i; 5581 5582 for (i = 0; i < thislist->n; i++) 5583 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id)); 5584 } 5585 fprintf(log_fd, "\n"); 5586 #endif 5587 5588 #ifdef NFA_REGEXP_DEBUG_LOG 5589 fprintf(debug, "\n-------------------\n"); 5590 #endif 5591 /* 5592 * If the state lists are empty we can stop. 5593 */ 5594 if (thislist->n == 0) 5595 break; 5596 5597 /* compute nextlist */ 5598 for (listidx = 0; listidx < thislist->n; ++listidx) 5599 { 5600 t = &thislist->t[listidx]; 5601 5602 #ifdef NFA_REGEXP_DEBUG_LOG 5603 nfa_set_code(t->state->c); 5604 fprintf(debug, "%s, ", code); 5605 #endif 5606 #ifdef ENABLE_LOG 5607 { 5608 int col; 5609 5610 if (t->subs.norm.in_use <= 0) 5611 col = -1; 5612 else if (REG_MULTI) 5613 col = t->subs.norm.list.multi[0].start_col; 5614 else 5615 col = (int)(t->subs.norm.list.line[0].start - regline); 5616 nfa_set_code(t->state->c); 5617 fprintf(log_fd, "(%d) char %d %s (start col %d)%s ... \n", 5618 abs(t->state->id), (int)t->state->c, code, col, 5619 pim_info(&t->pim)); 5620 } 5621 #endif 5622 5623 /* 5624 * Handle the possible codes of the current state. 5625 * The most important is NFA_MATCH. 5626 */ 5627 add_state = NULL; 5628 add_here = FALSE; 5629 add_count = 0; 5630 switch (t->state->c) 5631 { 5632 case NFA_MATCH: 5633 { 5634 #ifdef FEAT_MBYTE 5635 /* If the match ends before a composing characters and 5636 * ireg_icombine is not set, that is not really a match. */ 5637 if (enc_utf8 && !ireg_icombine && utf_iscomposing(curc)) 5638 break; 5639 #endif 5640 nfa_match = TRUE; 5641 copy_sub(&submatch->norm, &t->subs.norm); 5642 #ifdef FEAT_SYN_HL 5643 if (nfa_has_zsubexpr) 5644 copy_sub(&submatch->synt, &t->subs.synt); 5645 #endif 5646 #ifdef ENABLE_LOG 5647 log_subsexpr(&t->subs); 5648 #endif 5649 /* Found the left-most longest match, do not look at any other 5650 * states at this position. When the list of states is going 5651 * to be empty quit without advancing, so that "reginput" is 5652 * correct. */ 5653 if (nextlist->n == 0) 5654 clen = 0; 5655 goto nextchar; 5656 } 5657 5658 case NFA_END_INVISIBLE: 5659 case NFA_END_INVISIBLE_NEG: 5660 case NFA_END_PATTERN: 5661 /* 5662 * This is only encountered after a NFA_START_INVISIBLE or 5663 * NFA_START_INVISIBLE_BEFORE node. 5664 * They surround a zero-width group, used with "\@=", "\&", 5665 * "\@!", "\@<=" and "\@<!". 5666 * If we got here, it means that the current "invisible" group 5667 * finished successfully, so return control to the parent 5668 * nfa_regmatch(). For a look-behind match only when it ends 5669 * in the position in "nfa_endp". 5670 * Submatches are stored in *m, and used in the parent call. 5671 */ 5672 #ifdef ENABLE_LOG 5673 if (nfa_endp != NULL) 5674 { 5675 if (REG_MULTI) 5676 fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n", 5677 (int)reglnum, 5678 (int)nfa_endp->se_u.pos.lnum, 5679 (int)(reginput - regline), 5680 nfa_endp->se_u.pos.col); 5681 else 5682 fprintf(log_fd, "Current col: %d, endp col: %d\n", 5683 (int)(reginput - regline), 5684 (int)(nfa_endp->se_u.ptr - reginput)); 5685 } 5686 #endif 5687 /* If "nfa_endp" is set it's only a match if it ends at 5688 * "nfa_endp" */ 5689 if (nfa_endp != NULL && (REG_MULTI 5690 ? (reglnum != nfa_endp->se_u.pos.lnum 5691 || (int)(reginput - regline) 5692 != nfa_endp->se_u.pos.col) 5693 : reginput != nfa_endp->se_u.ptr)) 5694 break; 5695 5696 /* do not set submatches for \@! */ 5697 if (t->state->c != NFA_END_INVISIBLE_NEG) 5698 { 5699 copy_sub(&m->norm, &t->subs.norm); 5700 #ifdef FEAT_SYN_HL 5701 if (nfa_has_zsubexpr) 5702 copy_sub(&m->synt, &t->subs.synt); 5703 #endif 5704 } 5705 #ifdef ENABLE_LOG 5706 fprintf(log_fd, "Match found:\n"); 5707 log_subsexpr(m); 5708 #endif 5709 nfa_match = TRUE; 5710 /* See comment above at "goto nextchar". */ 5711 if (nextlist->n == 0) 5712 clen = 0; 5713 goto nextchar; 5714 5715 case NFA_START_INVISIBLE: 5716 case NFA_START_INVISIBLE_FIRST: 5717 case NFA_START_INVISIBLE_NEG: 5718 case NFA_START_INVISIBLE_NEG_FIRST: 5719 case NFA_START_INVISIBLE_BEFORE: 5720 case NFA_START_INVISIBLE_BEFORE_FIRST: 5721 case NFA_START_INVISIBLE_BEFORE_NEG: 5722 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST: 5723 { 5724 #ifdef ENABLE_LOG 5725 fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n", 5726 failure_chance(t->state->out, 0), 5727 failure_chance(t->state->out1->out, 0)); 5728 #endif 5729 /* Do it directly if there already is a PIM or when 5730 * nfa_postprocess() detected it will work better. */ 5731 if (t->pim.result != NFA_PIM_UNUSED 5732 || t->state->c == NFA_START_INVISIBLE_FIRST 5733 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST 5734 || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST 5735 || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST) 5736 { 5737 int in_use = m->norm.in_use; 5738 5739 /* Copy submatch info for the recursive call, opposite 5740 * of what happens on success below. */ 5741 copy_sub_off(&m->norm, &t->subs.norm); 5742 #ifdef FEAT_SYN_HL 5743 if (nfa_has_zsubexpr) 5744 copy_sub_off(&m->synt, &t->subs.synt); 5745 #endif 5746 5747 /* 5748 * First try matching the invisible match, then what 5749 * follows. 5750 */ 5751 result = recursive_regmatch(t->state, NULL, prog, 5752 submatch, m, &listids); 5753 if (result == NFA_TOO_EXPENSIVE) 5754 { 5755 nfa_match = result; 5756 goto theend; 5757 } 5758 5759 /* for \@! and \@<! it is a match when the result is 5760 * FALSE */ 5761 if (result != (t->state->c == NFA_START_INVISIBLE_NEG 5762 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST 5763 || t->state->c 5764 == NFA_START_INVISIBLE_BEFORE_NEG 5765 || t->state->c 5766 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) 5767 { 5768 /* Copy submatch info from the recursive call */ 5769 copy_sub_off(&t->subs.norm, &m->norm); 5770 #ifdef FEAT_SYN_HL 5771 if (nfa_has_zsubexpr) 5772 copy_sub_off(&t->subs.synt, &m->synt); 5773 #endif 5774 /* If the pattern has \ze and it matched in the 5775 * sub pattern, use it. */ 5776 copy_ze_off(&t->subs.norm, &m->norm); 5777 5778 /* t->state->out1 is the corresponding 5779 * END_INVISIBLE node; Add its out to the current 5780 * list (zero-width match). */ 5781 add_here = TRUE; 5782 add_state = t->state->out1->out; 5783 } 5784 m->norm.in_use = in_use; 5785 } 5786 else 5787 { 5788 nfa_pim_T pim; 5789 5790 /* 5791 * First try matching what follows. Only if a match 5792 * is found verify the invisible match matches. Add a 5793 * nfa_pim_T to the following states, it contains info 5794 * about the invisible match. 5795 */ 5796 pim.state = t->state; 5797 pim.result = NFA_PIM_TODO; 5798 pim.subs.norm.in_use = 0; 5799 #ifdef FEAT_SYN_HL 5800 pim.subs.synt.in_use = 0; 5801 #endif 5802 if (REG_MULTI) 5803 { 5804 pim.end.pos.col = (int)(reginput - regline); 5805 pim.end.pos.lnum = reglnum; 5806 } 5807 else 5808 pim.end.ptr = reginput; 5809 5810 /* t->state->out1 is the corresponding END_INVISIBLE 5811 * node; Add its out to the current list (zero-width 5812 * match). */ 5813 addstate_here(thislist, t->state->out1->out, &t->subs, 5814 &pim, &listidx); 5815 } 5816 } 5817 break; 5818 5819 case NFA_START_PATTERN: 5820 { 5821 nfa_state_T *skip = NULL; 5822 #ifdef ENABLE_LOG 5823 int skip_lid = 0; 5824 #endif 5825 5826 /* There is no point in trying to match the pattern if the 5827 * output state is not going to be added to the list. */ 5828 if (state_in_list(nextlist, t->state->out1->out, &t->subs)) 5829 { 5830 skip = t->state->out1->out; 5831 #ifdef ENABLE_LOG 5832 skip_lid = nextlist->id; 5833 #endif 5834 } 5835 else if (state_in_list(nextlist, 5836 t->state->out1->out->out, &t->subs)) 5837 { 5838 skip = t->state->out1->out->out; 5839 #ifdef ENABLE_LOG 5840 skip_lid = nextlist->id; 5841 #endif 5842 } 5843 else if (state_in_list(thislist, 5844 t->state->out1->out->out, &t->subs)) 5845 { 5846 skip = t->state->out1->out->out; 5847 #ifdef ENABLE_LOG 5848 skip_lid = thislist->id; 5849 #endif 5850 } 5851 if (skip != NULL) 5852 { 5853 #ifdef ENABLE_LOG 5854 nfa_set_code(skip->c); 5855 fprintf(log_fd, "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n", 5856 abs(skip->id), skip_lid, skip->c, code); 5857 #endif 5858 break; 5859 } 5860 /* Copy submatch info to the recursive call, opposite of what 5861 * happens afterwards. */ 5862 copy_sub_off(&m->norm, &t->subs.norm); 5863 #ifdef FEAT_SYN_HL 5864 if (nfa_has_zsubexpr) 5865 copy_sub_off(&m->synt, &t->subs.synt); 5866 #endif 5867 5868 /* First try matching the pattern. */ 5869 result = recursive_regmatch(t->state, NULL, prog, 5870 submatch, m, &listids); 5871 if (result == NFA_TOO_EXPENSIVE) 5872 { 5873 nfa_match = result; 5874 goto theend; 5875 } 5876 if (result) 5877 { 5878 int bytelen; 5879 5880 #ifdef ENABLE_LOG 5881 fprintf(log_fd, "NFA_START_PATTERN matches:\n"); 5882 log_subsexpr(m); 5883 #endif 5884 /* Copy submatch info from the recursive call */ 5885 copy_sub_off(&t->subs.norm, &m->norm); 5886 #ifdef FEAT_SYN_HL 5887 if (nfa_has_zsubexpr) 5888 copy_sub_off(&t->subs.synt, &m->synt); 5889 #endif 5890 /* Now we need to skip over the matched text and then 5891 * continue with what follows. */ 5892 if (REG_MULTI) 5893 /* TODO: multi-line match */ 5894 bytelen = m->norm.list.multi[0].end_col 5895 - (int)(reginput - regline); 5896 else 5897 bytelen = (int)(m->norm.list.line[0].end - reginput); 5898 5899 #ifdef ENABLE_LOG 5900 fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen); 5901 #endif 5902 if (bytelen == 0) 5903 { 5904 /* empty match, output of corresponding 5905 * NFA_END_PATTERN/NFA_SKIP to be used at current 5906 * position */ 5907 add_here = TRUE; 5908 add_state = t->state->out1->out->out; 5909 } 5910 else if (bytelen <= clen) 5911 { 5912 /* match current character, output of corresponding 5913 * NFA_END_PATTERN to be used at next position. */ 5914 add_state = t->state->out1->out->out; 5915 add_off = clen; 5916 } 5917 else 5918 { 5919 /* skip over the matched characters, set character 5920 * count in NFA_SKIP */ 5921 add_state = t->state->out1->out; 5922 add_off = bytelen; 5923 add_count = bytelen - clen; 5924 } 5925 } 5926 break; 5927 } 5928 5929 case NFA_BOL: 5930 if (reginput == regline) 5931 { 5932 add_here = TRUE; 5933 add_state = t->state->out; 5934 } 5935 break; 5936 5937 case NFA_EOL: 5938 if (curc == NUL) 5939 { 5940 add_here = TRUE; 5941 add_state = t->state->out; 5942 } 5943 break; 5944 5945 case NFA_BOW: 5946 result = TRUE; 5947 5948 if (curc == NUL) 5949 result = FALSE; 5950 #ifdef FEAT_MBYTE 5951 else if (has_mbyte) 5952 { 5953 int this_class; 5954 5955 /* Get class of current and previous char (if it exists). */ 5956 this_class = mb_get_class_buf(reginput, reg_buf); 5957 if (this_class <= 1) 5958 result = FALSE; 5959 else if (reg_prev_class() == this_class) 5960 result = FALSE; 5961 } 5962 #endif 5963 else if (!vim_iswordc_buf(curc, reg_buf) 5964 || (reginput > regline 5965 && vim_iswordc_buf(reginput[-1], reg_buf))) 5966 result = FALSE; 5967 if (result) 5968 { 5969 add_here = TRUE; 5970 add_state = t->state->out; 5971 } 5972 break; 5973 5974 case NFA_EOW: 5975 result = TRUE; 5976 if (reginput == regline) 5977 result = FALSE; 5978 #ifdef FEAT_MBYTE 5979 else if (has_mbyte) 5980 { 5981 int this_class, prev_class; 5982 5983 /* Get class of current and previous char (if it exists). */ 5984 this_class = mb_get_class_buf(reginput, reg_buf); 5985 prev_class = reg_prev_class(); 5986 if (this_class == prev_class 5987 || prev_class == 0 || prev_class == 1) 5988 result = FALSE; 5989 } 5990 #endif 5991 else if (!vim_iswordc_buf(reginput[-1], reg_buf) 5992 || (reginput[0] != NUL 5993 && vim_iswordc_buf(curc, reg_buf))) 5994 result = FALSE; 5995 if (result) 5996 { 5997 add_here = TRUE; 5998 add_state = t->state->out; 5999 } 6000 break; 6001 6002 case NFA_BOF: 6003 if (reglnum == 0 && reginput == regline 6004 && (!REG_MULTI || reg_firstlnum == 1)) 6005 { 6006 add_here = TRUE; 6007 add_state = t->state->out; 6008 } 6009 break; 6010 6011 case NFA_EOF: 6012 if (reglnum == reg_maxline && curc == NUL) 6013 { 6014 add_here = TRUE; 6015 add_state = t->state->out; 6016 } 6017 break; 6018 6019 #ifdef FEAT_MBYTE 6020 case NFA_COMPOSING: 6021 { 6022 int mc = curc; 6023 int len = 0; 6024 nfa_state_T *end; 6025 nfa_state_T *sta; 6026 int cchars[MAX_MCO]; 6027 int ccount = 0; 6028 int j; 6029 6030 sta = t->state->out; 6031 len = 0; 6032 if (utf_iscomposing(sta->c)) 6033 { 6034 /* Only match composing character(s), ignore base 6035 * character. Used for ".{composing}" and "{composing}" 6036 * (no preceding character). */ 6037 len += mb_char2len(mc); 6038 } 6039 if (ireg_icombine && len == 0) 6040 { 6041 /* If \Z was present, then ignore composing characters. 6042 * When ignoring the base character this always matches. */ 6043 if (len == 0 && sta->c != curc) 6044 result = FAIL; 6045 else 6046 result = OK; 6047 while (sta->c != NFA_END_COMPOSING) 6048 sta = sta->out; 6049 } 6050 6051 /* Check base character matches first, unless ignored. */ 6052 else if (len > 0 || mc == sta->c) 6053 { 6054 if (len == 0) 6055 { 6056 len += mb_char2len(mc); 6057 sta = sta->out; 6058 } 6059 6060 /* We don't care about the order of composing characters. 6061 * Get them into cchars[] first. */ 6062 while (len < clen) 6063 { 6064 mc = mb_ptr2char(reginput + len); 6065 cchars[ccount++] = mc; 6066 len += mb_char2len(mc); 6067 if (ccount == MAX_MCO) 6068 break; 6069 } 6070 6071 /* Check that each composing char in the pattern matches a 6072 * composing char in the text. We do not check if all 6073 * composing chars are matched. */ 6074 result = OK; 6075 while (sta->c != NFA_END_COMPOSING) 6076 { 6077 for (j = 0; j < ccount; ++j) 6078 if (cchars[j] == sta->c) 6079 break; 6080 if (j == ccount) 6081 { 6082 result = FAIL; 6083 break; 6084 } 6085 sta = sta->out; 6086 } 6087 } 6088 else 6089 result = FAIL; 6090 6091 end = t->state->out1; /* NFA_END_COMPOSING */ 6092 ADD_STATE_IF_MATCH(end); 6093 break; 6094 } 6095 #endif 6096 6097 case NFA_NEWL: 6098 if (curc == NUL && !reg_line_lbr && REG_MULTI 6099 && reglnum <= reg_maxline) 6100 { 6101 go_to_nextline = TRUE; 6102 /* Pass -1 for the offset, which means taking the position 6103 * at the start of the next line. */ 6104 add_state = t->state->out; 6105 add_off = -1; 6106 } 6107 else if (curc == '\n' && reg_line_lbr) 6108 { 6109 /* match \n as if it is an ordinary character */ 6110 add_state = t->state->out; 6111 add_off = 1; 6112 } 6113 break; 6114 6115 case NFA_START_COLL: 6116 case NFA_START_NEG_COLL: 6117 { 6118 /* What follows is a list of characters, until NFA_END_COLL. 6119 * One of them must match or none of them must match. */ 6120 nfa_state_T *state; 6121 int result_if_matched; 6122 int c1, c2; 6123 6124 /* Never match EOL. If it's part of the collection it is added 6125 * as a separate state with an OR. */ 6126 if (curc == NUL) 6127 break; 6128 6129 state = t->state->out; 6130 result_if_matched = (t->state->c == NFA_START_COLL); 6131 for (;;) 6132 { 6133 if (state->c == NFA_END_COLL) 6134 { 6135 result = !result_if_matched; 6136 break; 6137 } 6138 if (state->c == NFA_RANGE_MIN) 6139 { 6140 c1 = state->val; 6141 state = state->out; /* advance to NFA_RANGE_MAX */ 6142 c2 = state->val; 6143 #ifdef ENABLE_LOG 6144 fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n", 6145 curc, c1, c2); 6146 #endif 6147 if (curc >= c1 && curc <= c2) 6148 { 6149 result = result_if_matched; 6150 break; 6151 } 6152 if (ireg_ic) 6153 { 6154 int curc_low = MB_TOLOWER(curc); 6155 int done = FALSE; 6156 6157 for ( ; c1 <= c2; ++c1) 6158 if (MB_TOLOWER(c1) == curc_low) 6159 { 6160 result = result_if_matched; 6161 done = TRUE; 6162 break; 6163 } 6164 if (done) 6165 break; 6166 } 6167 } 6168 else if (state->c < 0 ? check_char_class(state->c, curc) 6169 : (curc == state->c 6170 || (ireg_ic && MB_TOLOWER(curc) 6171 == MB_TOLOWER(state->c)))) 6172 { 6173 result = result_if_matched; 6174 break; 6175 } 6176 state = state->out; 6177 } 6178 if (result) 6179 { 6180 /* next state is in out of the NFA_END_COLL, out1 of 6181 * START points to the END state */ 6182 add_state = t->state->out1->out; 6183 add_off = clen; 6184 } 6185 break; 6186 } 6187 6188 case NFA_ANY: 6189 /* Any char except '\0', (end of input) does not match. */ 6190 if (curc > 0) 6191 { 6192 add_state = t->state->out; 6193 add_off = clen; 6194 } 6195 break; 6196 6197 case NFA_ANY_COMPOSING: 6198 /* On a composing character skip over it. Otherwise do 6199 * nothing. Always matches. */ 6200 #ifdef FEAT_MBYTE 6201 if (enc_utf8 && utf_iscomposing(curc)) 6202 { 6203 add_off = clen; 6204 } 6205 else 6206 #endif 6207 { 6208 add_here = TRUE; 6209 add_off = 0; 6210 } 6211 add_state = t->state->out; 6212 break; 6213 6214 /* 6215 * Character classes like \a for alpha, \d for digit etc. 6216 */ 6217 case NFA_IDENT: /* \i */ 6218 result = vim_isIDc(curc); 6219 ADD_STATE_IF_MATCH(t->state); 6220 break; 6221 6222 case NFA_SIDENT: /* \I */ 6223 result = !VIM_ISDIGIT(curc) && vim_isIDc(curc); 6224 ADD_STATE_IF_MATCH(t->state); 6225 break; 6226 6227 case NFA_KWORD: /* \k */ 6228 result = vim_iswordp_buf(reginput, reg_buf); 6229 ADD_STATE_IF_MATCH(t->state); 6230 break; 6231 6232 case NFA_SKWORD: /* \K */ 6233 result = !VIM_ISDIGIT(curc) 6234 && vim_iswordp_buf(reginput, reg_buf); 6235 ADD_STATE_IF_MATCH(t->state); 6236 break; 6237 6238 case NFA_FNAME: /* \f */ 6239 result = vim_isfilec(curc); 6240 ADD_STATE_IF_MATCH(t->state); 6241 break; 6242 6243 case NFA_SFNAME: /* \F */ 6244 result = !VIM_ISDIGIT(curc) && vim_isfilec(curc); 6245 ADD_STATE_IF_MATCH(t->state); 6246 break; 6247 6248 case NFA_PRINT: /* \p */ 6249 result = vim_isprintc(PTR2CHAR(reginput)); 6250 ADD_STATE_IF_MATCH(t->state); 6251 break; 6252 6253 case NFA_SPRINT: /* \P */ 6254 result = !VIM_ISDIGIT(curc) && vim_isprintc(PTR2CHAR(reginput)); 6255 ADD_STATE_IF_MATCH(t->state); 6256 break; 6257 6258 case NFA_WHITE: /* \s */ 6259 result = vim_iswhite(curc); 6260 ADD_STATE_IF_MATCH(t->state); 6261 break; 6262 6263 case NFA_NWHITE: /* \S */ 6264 result = curc != NUL && !vim_iswhite(curc); 6265 ADD_STATE_IF_MATCH(t->state); 6266 break; 6267 6268 case NFA_DIGIT: /* \d */ 6269 result = ri_digit(curc); 6270 ADD_STATE_IF_MATCH(t->state); 6271 break; 6272 6273 case NFA_NDIGIT: /* \D */ 6274 result = curc != NUL && !ri_digit(curc); 6275 ADD_STATE_IF_MATCH(t->state); 6276 break; 6277 6278 case NFA_HEX: /* \x */ 6279 result = ri_hex(curc); 6280 ADD_STATE_IF_MATCH(t->state); 6281 break; 6282 6283 case NFA_NHEX: /* \X */ 6284 result = curc != NUL && !ri_hex(curc); 6285 ADD_STATE_IF_MATCH(t->state); 6286 break; 6287 6288 case NFA_OCTAL: /* \o */ 6289 result = ri_octal(curc); 6290 ADD_STATE_IF_MATCH(t->state); 6291 break; 6292 6293 case NFA_NOCTAL: /* \O */ 6294 result = curc != NUL && !ri_octal(curc); 6295 ADD_STATE_IF_MATCH(t->state); 6296 break; 6297 6298 case NFA_WORD: /* \w */ 6299 result = ri_word(curc); 6300 ADD_STATE_IF_MATCH(t->state); 6301 break; 6302 6303 case NFA_NWORD: /* \W */ 6304 result = curc != NUL && !ri_word(curc); 6305 ADD_STATE_IF_MATCH(t->state); 6306 break; 6307 6308 case NFA_HEAD: /* \h */ 6309 result = ri_head(curc); 6310 ADD_STATE_IF_MATCH(t->state); 6311 break; 6312 6313 case NFA_NHEAD: /* \H */ 6314 result = curc != NUL && !ri_head(curc); 6315 ADD_STATE_IF_MATCH(t->state); 6316 break; 6317 6318 case NFA_ALPHA: /* \a */ 6319 result = ri_alpha(curc); 6320 ADD_STATE_IF_MATCH(t->state); 6321 break; 6322 6323 case NFA_NALPHA: /* \A */ 6324 result = curc != NUL && !ri_alpha(curc); 6325 ADD_STATE_IF_MATCH(t->state); 6326 break; 6327 6328 case NFA_LOWER: /* \l */ 6329 result = ri_lower(curc); 6330 ADD_STATE_IF_MATCH(t->state); 6331 break; 6332 6333 case NFA_NLOWER: /* \L */ 6334 result = curc != NUL && !ri_lower(curc); 6335 ADD_STATE_IF_MATCH(t->state); 6336 break; 6337 6338 case NFA_UPPER: /* \u */ 6339 result = ri_upper(curc); 6340 ADD_STATE_IF_MATCH(t->state); 6341 break; 6342 6343 case NFA_NUPPER: /* \U */ 6344 result = curc != NUL && !ri_upper(curc); 6345 ADD_STATE_IF_MATCH(t->state); 6346 break; 6347 6348 case NFA_LOWER_IC: /* [a-z] */ 6349 result = ri_lower(curc) || (ireg_ic && ri_upper(curc)); 6350 ADD_STATE_IF_MATCH(t->state); 6351 break; 6352 6353 case NFA_NLOWER_IC: /* [^a-z] */ 6354 result = curc != NUL 6355 && !(ri_lower(curc) || (ireg_ic && ri_upper(curc))); 6356 ADD_STATE_IF_MATCH(t->state); 6357 break; 6358 6359 case NFA_UPPER_IC: /* [A-Z] */ 6360 result = ri_upper(curc) || (ireg_ic && ri_lower(curc)); 6361 ADD_STATE_IF_MATCH(t->state); 6362 break; 6363 6364 case NFA_NUPPER_IC: /* ^[A-Z] */ 6365 result = curc != NUL 6366 && !(ri_upper(curc) || (ireg_ic && ri_lower(curc))); 6367 ADD_STATE_IF_MATCH(t->state); 6368 break; 6369 6370 case NFA_BACKREF1: 6371 case NFA_BACKREF2: 6372 case NFA_BACKREF3: 6373 case NFA_BACKREF4: 6374 case NFA_BACKREF5: 6375 case NFA_BACKREF6: 6376 case NFA_BACKREF7: 6377 case NFA_BACKREF8: 6378 case NFA_BACKREF9: 6379 #ifdef FEAT_SYN_HL 6380 case NFA_ZREF1: 6381 case NFA_ZREF2: 6382 case NFA_ZREF3: 6383 case NFA_ZREF4: 6384 case NFA_ZREF5: 6385 case NFA_ZREF6: 6386 case NFA_ZREF7: 6387 case NFA_ZREF8: 6388 case NFA_ZREF9: 6389 #endif 6390 /* \1 .. \9 \z1 .. \z9 */ 6391 { 6392 int subidx; 6393 int bytelen; 6394 6395 if (t->state->c <= NFA_BACKREF9) 6396 { 6397 subidx = t->state->c - NFA_BACKREF1 + 1; 6398 result = match_backref(&t->subs.norm, subidx, &bytelen); 6399 } 6400 #ifdef FEAT_SYN_HL 6401 else 6402 { 6403 subidx = t->state->c - NFA_ZREF1 + 1; 6404 result = match_zref(subidx, &bytelen); 6405 } 6406 #endif 6407 6408 if (result) 6409 { 6410 if (bytelen == 0) 6411 { 6412 /* empty match always works, output of NFA_SKIP to be 6413 * used next */ 6414 add_here = TRUE; 6415 add_state = t->state->out->out; 6416 } 6417 else if (bytelen <= clen) 6418 { 6419 /* match current character, jump ahead to out of 6420 * NFA_SKIP */ 6421 add_state = t->state->out->out; 6422 add_off = clen; 6423 } 6424 else 6425 { 6426 /* skip over the matched characters, set character 6427 * count in NFA_SKIP */ 6428 add_state = t->state->out; 6429 add_off = bytelen; 6430 add_count = bytelen - clen; 6431 } 6432 } 6433 break; 6434 } 6435 case NFA_SKIP: 6436 /* character of previous matching \1 .. \9 or \@> */ 6437 if (t->count - clen <= 0) 6438 { 6439 /* end of match, go to what follows */ 6440 add_state = t->state->out; 6441 add_off = clen; 6442 } 6443 else 6444 { 6445 /* add state again with decremented count */ 6446 add_state = t->state; 6447 add_off = 0; 6448 add_count = t->count - clen; 6449 } 6450 break; 6451 6452 case NFA_LNUM: 6453 case NFA_LNUM_GT: 6454 case NFA_LNUM_LT: 6455 result = (REG_MULTI && 6456 nfa_re_num_cmp(t->state->val, t->state->c - NFA_LNUM, 6457 (long_u)(reglnum + reg_firstlnum))); 6458 if (result) 6459 { 6460 add_here = TRUE; 6461 add_state = t->state->out; 6462 } 6463 break; 6464 6465 case NFA_COL: 6466 case NFA_COL_GT: 6467 case NFA_COL_LT: 6468 result = nfa_re_num_cmp(t->state->val, t->state->c - NFA_COL, 6469 (long_u)(reginput - regline) + 1); 6470 if (result) 6471 { 6472 add_here = TRUE; 6473 add_state = t->state->out; 6474 } 6475 break; 6476 6477 case NFA_VCOL: 6478 case NFA_VCOL_GT: 6479 case NFA_VCOL_LT: 6480 { 6481 int op = t->state->c - NFA_VCOL; 6482 colnr_T col = (colnr_T)(reginput - regline); 6483 win_T *wp = reg_win == NULL ? curwin : reg_win; 6484 6485 /* Bail out quickly when there can't be a match, avoid the 6486 * overhead of win_linetabsize() on long lines. */ 6487 if (op != 1 && col > t->state->val 6488 #ifdef FEAT_MBYTE 6489 * (has_mbyte ? MB_MAXBYTES : 1) 6490 #endif 6491 ) 6492 break; 6493 result = FALSE; 6494 if (op == 1 && col - 1 > t->state->val && col > 100) 6495 { 6496 int ts = wp->w_buffer->b_p_ts; 6497 6498 /* Guess that a character won't use more columns than 6499 * 'tabstop', with a minimum of 4. */ 6500 if (ts < 4) 6501 ts = 4; 6502 result = col > t->state->val * ts; 6503 } 6504 if (!result) 6505 result = nfa_re_num_cmp(t->state->val, op, 6506 (long_u)win_linetabsize(wp, regline, col) + 1); 6507 if (result) 6508 { 6509 add_here = TRUE; 6510 add_state = t->state->out; 6511 } 6512 } 6513 break; 6514 6515 case NFA_MARK: 6516 case NFA_MARK_GT: 6517 case NFA_MARK_LT: 6518 { 6519 pos_T *pos = getmark_buf(reg_buf, t->state->val, FALSE); 6520 6521 /* Compare the mark position to the match position. */ 6522 result = (pos != NULL /* mark doesn't exist */ 6523 && pos->lnum > 0 /* mark isn't set in reg_buf */ 6524 && (pos->lnum == reglnum + reg_firstlnum 6525 ? (pos->col == (colnr_T)(reginput - regline) 6526 ? t->state->c == NFA_MARK 6527 : (pos->col < (colnr_T)(reginput - regline) 6528 ? t->state->c == NFA_MARK_GT 6529 : t->state->c == NFA_MARK_LT)) 6530 : (pos->lnum < reglnum + reg_firstlnum 6531 ? t->state->c == NFA_MARK_GT 6532 : t->state->c == NFA_MARK_LT))); 6533 if (result) 6534 { 6535 add_here = TRUE; 6536 add_state = t->state->out; 6537 } 6538 break; 6539 } 6540 6541 case NFA_CURSOR: 6542 result = (reg_win != NULL 6543 && (reglnum + reg_firstlnum == reg_win->w_cursor.lnum) 6544 && ((colnr_T)(reginput - regline) 6545 == reg_win->w_cursor.col)); 6546 if (result) 6547 { 6548 add_here = TRUE; 6549 add_state = t->state->out; 6550 } 6551 break; 6552 6553 case NFA_VISUAL: 6554 result = reg_match_visual(); 6555 if (result) 6556 { 6557 add_here = TRUE; 6558 add_state = t->state->out; 6559 } 6560 break; 6561 6562 case NFA_MOPEN1: 6563 case NFA_MOPEN2: 6564 case NFA_MOPEN3: 6565 case NFA_MOPEN4: 6566 case NFA_MOPEN5: 6567 case NFA_MOPEN6: 6568 case NFA_MOPEN7: 6569 case NFA_MOPEN8: 6570 case NFA_MOPEN9: 6571 #ifdef FEAT_SYN_HL 6572 case NFA_ZOPEN: 6573 case NFA_ZOPEN1: 6574 case NFA_ZOPEN2: 6575 case NFA_ZOPEN3: 6576 case NFA_ZOPEN4: 6577 case NFA_ZOPEN5: 6578 case NFA_ZOPEN6: 6579 case NFA_ZOPEN7: 6580 case NFA_ZOPEN8: 6581 case NFA_ZOPEN9: 6582 #endif 6583 case NFA_NOPEN: 6584 case NFA_ZSTART: 6585 /* These states are only added to be able to bail out when 6586 * they are added again, nothing is to be done. */ 6587 break; 6588 6589 default: /* regular character */ 6590 { 6591 int c = t->state->c; 6592 6593 #ifdef DEBUG 6594 if (c < 0) 6595 EMSGN("INTERNAL: Negative state char: %ld", c); 6596 #endif 6597 result = (c == curc); 6598 6599 if (!result && ireg_ic) 6600 result = MB_TOLOWER(c) == MB_TOLOWER(curc); 6601 #ifdef FEAT_MBYTE 6602 /* If ireg_icombine is not set only skip over the character 6603 * itself. When it is set skip over composing characters. */ 6604 if (result && enc_utf8 && !ireg_icombine) 6605 clen = utf_ptr2len(reginput); 6606 #endif 6607 ADD_STATE_IF_MATCH(t->state); 6608 break; 6609 } 6610 6611 } /* switch (t->state->c) */ 6612 6613 if (add_state != NULL) 6614 { 6615 nfa_pim_T *pim; 6616 nfa_pim_T pim_copy; 6617 6618 if (t->pim.result == NFA_PIM_UNUSED) 6619 pim = NULL; 6620 else 6621 pim = &t->pim; 6622 6623 /* Handle the postponed invisible match if the match might end 6624 * without advancing and before the end of the line. */ 6625 if (pim != NULL && (clen == 0 || match_follows(add_state, 0))) 6626 { 6627 if (pim->result == NFA_PIM_TODO) 6628 { 6629 #ifdef ENABLE_LOG 6630 fprintf(log_fd, "\n"); 6631 fprintf(log_fd, "==================================\n"); 6632 fprintf(log_fd, "Postponed recursive nfa_regmatch()\n"); 6633 fprintf(log_fd, "\n"); 6634 #endif 6635 result = recursive_regmatch(pim->state, pim, 6636 prog, submatch, m, &listids); 6637 pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH; 6638 /* for \@! and \@<! it is a match when the result is 6639 * FALSE */ 6640 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG 6641 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST 6642 || pim->state->c 6643 == NFA_START_INVISIBLE_BEFORE_NEG 6644 || pim->state->c 6645 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) 6646 { 6647 /* Copy submatch info from the recursive call */ 6648 copy_sub_off(&pim->subs.norm, &m->norm); 6649 #ifdef FEAT_SYN_HL 6650 if (nfa_has_zsubexpr) 6651 copy_sub_off(&pim->subs.synt, &m->synt); 6652 #endif 6653 } 6654 } 6655 else 6656 { 6657 result = (pim->result == NFA_PIM_MATCH); 6658 #ifdef ENABLE_LOG 6659 fprintf(log_fd, "\n"); 6660 fprintf(log_fd, "Using previous recursive nfa_regmatch() result, result == %d\n", pim->result); 6661 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE"); 6662 fprintf(log_fd, "\n"); 6663 #endif 6664 } 6665 6666 /* for \@! and \@<! it is a match when result is FALSE */ 6667 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG 6668 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST 6669 || pim->state->c 6670 == NFA_START_INVISIBLE_BEFORE_NEG 6671 || pim->state->c 6672 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) 6673 { 6674 /* Copy submatch info from the recursive call */ 6675 copy_sub_off(&t->subs.norm, &pim->subs.norm); 6676 #ifdef FEAT_SYN_HL 6677 if (nfa_has_zsubexpr) 6678 copy_sub_off(&t->subs.synt, &pim->subs.synt); 6679 #endif 6680 } 6681 else 6682 /* look-behind match failed, don't add the state */ 6683 continue; 6684 6685 /* Postponed invisible match was handled, don't add it to 6686 * following states. */ 6687 pim = NULL; 6688 } 6689 6690 /* If "pim" points into l->t it will become invalid when 6691 * adding the state causes the list to be reallocated. Make a 6692 * local copy to avoid that. */ 6693 if (pim == &t->pim) 6694 { 6695 copy_pim(&pim_copy, pim); 6696 pim = &pim_copy; 6697 } 6698 6699 if (add_here) 6700 addstate_here(thislist, add_state, &t->subs, pim, &listidx); 6701 else 6702 { 6703 addstate(nextlist, add_state, &t->subs, pim, add_off); 6704 if (add_count > 0) 6705 nextlist->t[nextlist->n - 1].count = add_count; 6706 } 6707 } 6708 6709 } /* for (thislist = thislist; thislist->state; thislist++) */ 6710 6711 /* Look for the start of a match in the current position by adding the 6712 * start state to the list of states. 6713 * The first found match is the leftmost one, thus the order of states 6714 * matters! 6715 * Do not add the start state in recursive calls of nfa_regmatch(), 6716 * because recursive calls should only start in the first position. 6717 * Unless "nfa_endp" is not NULL, then we match the end position. 6718 * Also don't start a match past the first line. */ 6719 if (nfa_match == FALSE 6720 && ((toplevel 6721 && reglnum == 0 6722 && clen != 0 6723 && (ireg_maxcol == 0 6724 || (colnr_T)(reginput - regline) < ireg_maxcol)) 6725 || (nfa_endp != NULL 6726 && (REG_MULTI 6727 ? (reglnum < nfa_endp->se_u.pos.lnum 6728 || (reglnum == nfa_endp->se_u.pos.lnum 6729 && (int)(reginput - regline) 6730 < nfa_endp->se_u.pos.col)) 6731 : reginput < nfa_endp->se_u.ptr)))) 6732 { 6733 #ifdef ENABLE_LOG 6734 fprintf(log_fd, "(---) STARTSTATE\n"); 6735 #endif 6736 /* Inline optimized code for addstate() if we know the state is 6737 * the first MOPEN. */ 6738 if (toplevel) 6739 { 6740 int add = TRUE; 6741 int c; 6742 6743 if (prog->regstart != NUL && clen != 0) 6744 { 6745 if (nextlist->n == 0) 6746 { 6747 colnr_T col = (colnr_T)(reginput - regline) + clen; 6748 6749 /* Nextlist is empty, we can skip ahead to the 6750 * character that must appear at the start. */ 6751 if (skip_to_start(prog->regstart, &col) == FAIL) 6752 break; 6753 #ifdef ENABLE_LOG 6754 fprintf(log_fd, " Skipping ahead %d bytes to regstart\n", 6755 col - ((colnr_T)(reginput - regline) + clen)); 6756 #endif 6757 reginput = regline + col - clen; 6758 } 6759 else 6760 { 6761 /* Checking if the required start character matches is 6762 * cheaper than adding a state that won't match. */ 6763 c = PTR2CHAR(reginput + clen); 6764 if (c != prog->regstart && (!ireg_ic || MB_TOLOWER(c) 6765 != MB_TOLOWER(prog->regstart))) 6766 { 6767 #ifdef ENABLE_LOG 6768 fprintf(log_fd, " Skipping start state, regstart does not match\n"); 6769 #endif 6770 add = FALSE; 6771 } 6772 } 6773 } 6774 6775 if (add) 6776 { 6777 if (REG_MULTI) 6778 m->norm.list.multi[0].start_col = 6779 (colnr_T)(reginput - regline) + clen; 6780 else 6781 m->norm.list.line[0].start = reginput + clen; 6782 addstate(nextlist, start->out, m, NULL, clen); 6783 } 6784 } 6785 else 6786 addstate(nextlist, start, m, NULL, clen); 6787 } 6788 6789 #ifdef ENABLE_LOG 6790 fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n); 6791 { 6792 int i; 6793 6794 for (i = 0; i < thislist->n; i++) 6795 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id)); 6796 } 6797 fprintf(log_fd, "\n"); 6798 #endif 6799 6800 nextchar: 6801 /* Advance to the next character, or advance to the next line, or 6802 * finish. */ 6803 if (clen != 0) 6804 reginput += clen; 6805 else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI 6806 && reglnum < nfa_endp->se_u.pos.lnum)) 6807 reg_nextline(); 6808 else 6809 break; 6810 6811 /* Allow interrupting with CTRL-C. */ 6812 line_breakcheck(); 6813 if (got_int) 6814 break; 6815 #ifdef FEAT_RELTIME 6816 /* Check for timeout once in a twenty times to avoid overhead. */ 6817 if (nfa_time_limit != NULL && ++nfa_time_count == 20) 6818 { 6819 nfa_time_count = 0; 6820 if (profile_passed_limit(nfa_time_limit)) 6821 break; 6822 } 6823 #endif 6824 } 6825 6826 #ifdef ENABLE_LOG 6827 if (log_fd != stderr) 6828 fclose(log_fd); 6829 log_fd = NULL; 6830 #endif 6831 6832 theend: 6833 /* Free memory */ 6834 vim_free(list[0].t); 6835 vim_free(list[1].t); 6836 vim_free(listids); 6837 #undef ADD_STATE_IF_MATCH 6838 #ifdef NFA_REGEXP_DEBUG_LOG 6839 fclose(debug); 6840 #endif 6841 6842 return nfa_match; 6843 } 6844 6845 /* 6846 * Try match of "prog" with at regline["col"]. 6847 * Returns <= 0 for failure, number of lines contained in the match otherwise. 6848 */ 6849 static long 6850 nfa_regtry(prog, col, tm) 6851 nfa_regprog_T *prog; 6852 colnr_T col; 6853 proftime_T *tm UNUSED; /* timeout limit or NULL */ 6854 { 6855 int i; 6856 regsubs_T subs, m; 6857 nfa_state_T *start = prog->start; 6858 int result; 6859 #ifdef ENABLE_LOG 6860 FILE *f; 6861 #endif 6862 6863 reginput = regline + col; 6864 #ifdef FEAT_RELTIME 6865 nfa_time_limit = tm; 6866 nfa_time_count = 0; 6867 #endif 6868 6869 #ifdef ENABLE_LOG 6870 f = fopen(NFA_REGEXP_RUN_LOG, "a"); 6871 if (f != NULL) 6872 { 6873 fprintf(f, "\n\n\t=======================================================\n"); 6874 #ifdef DEBUG 6875 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr); 6876 #endif 6877 fprintf(f, "\tInput text is \"%s\" \n", reginput); 6878 fprintf(f, "\t=======================================================\n\n"); 6879 nfa_print_state(f, start); 6880 fprintf(f, "\n\n"); 6881 fclose(f); 6882 } 6883 else 6884 EMSG(_("Could not open temporary log file for writing ")); 6885 #endif 6886 6887 clear_sub(&subs.norm); 6888 clear_sub(&m.norm); 6889 #ifdef FEAT_SYN_HL 6890 clear_sub(&subs.synt); 6891 clear_sub(&m.synt); 6892 #endif 6893 6894 result = nfa_regmatch(prog, start, &subs, &m); 6895 if (result == FALSE) 6896 return 0; 6897 else if (result == NFA_TOO_EXPENSIVE) 6898 return result; 6899 6900 cleanup_subexpr(); 6901 if (REG_MULTI) 6902 { 6903 for (i = 0; i < subs.norm.in_use; i++) 6904 { 6905 reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum; 6906 reg_startpos[i].col = subs.norm.list.multi[i].start_col; 6907 6908 reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum; 6909 reg_endpos[i].col = subs.norm.list.multi[i].end_col; 6910 } 6911 6912 if (reg_startpos[0].lnum < 0) 6913 { 6914 reg_startpos[0].lnum = 0; 6915 reg_startpos[0].col = col; 6916 } 6917 if (reg_endpos[0].lnum < 0) 6918 { 6919 /* pattern has a \ze but it didn't match, use current end */ 6920 reg_endpos[0].lnum = reglnum; 6921 reg_endpos[0].col = (int)(reginput - regline); 6922 } 6923 else 6924 /* Use line number of "\ze". */ 6925 reglnum = reg_endpos[0].lnum; 6926 } 6927 else 6928 { 6929 for (i = 0; i < subs.norm.in_use; i++) 6930 { 6931 reg_startp[i] = subs.norm.list.line[i].start; 6932 reg_endp[i] = subs.norm.list.line[i].end; 6933 } 6934 6935 if (reg_startp[0] == NULL) 6936 reg_startp[0] = regline + col; 6937 if (reg_endp[0] == NULL) 6938 reg_endp[0] = reginput; 6939 } 6940 6941 #ifdef FEAT_SYN_HL 6942 /* Package any found \z(...\) matches for export. Default is none. */ 6943 unref_extmatch(re_extmatch_out); 6944 re_extmatch_out = NULL; 6945 6946 if (prog->reghasz == REX_SET) 6947 { 6948 cleanup_zsubexpr(); 6949 re_extmatch_out = make_extmatch(); 6950 /* Loop over \z1, \z2, etc. There is no \z0. */ 6951 for (i = 1; i < subs.synt.in_use; i++) 6952 { 6953 if (REG_MULTI) 6954 { 6955 struct multipos *mpos = &subs.synt.list.multi[i]; 6956 6957 /* Only accept single line matches that are valid. */ 6958 if (mpos->start_lnum >= 0 6959 && mpos->start_lnum == mpos->end_lnum 6960 && mpos->end_col >= mpos->start_col) 6961 re_extmatch_out->matches[i] = 6962 vim_strnsave(reg_getline(mpos->start_lnum) 6963 + mpos->start_col, 6964 mpos->end_col - mpos->start_col); 6965 } 6966 else 6967 { 6968 struct linepos *lpos = &subs.synt.list.line[i]; 6969 6970 if (lpos->start != NULL && lpos->end != NULL) 6971 re_extmatch_out->matches[i] = 6972 vim_strnsave(lpos->start, 6973 (int)(lpos->end - lpos->start)); 6974 } 6975 } 6976 } 6977 #endif 6978 6979 return 1 + reglnum; 6980 } 6981 6982 /* 6983 * Match a regexp against a string ("line" points to the string) or multiple 6984 * lines ("line" is NULL, use reg_getline()). 6985 * 6986 * Returns <= 0 for failure, number of lines contained in the match otherwise. 6987 */ 6988 static long 6989 nfa_regexec_both(line, startcol, tm) 6990 char_u *line; 6991 colnr_T startcol; /* column to start looking for match */ 6992 proftime_T *tm; /* timeout limit or NULL */ 6993 { 6994 nfa_regprog_T *prog; 6995 long retval = 0L; 6996 int i; 6997 colnr_T col = startcol; 6998 6999 if (REG_MULTI) 7000 { 7001 prog = (nfa_regprog_T *)reg_mmatch->regprog; 7002 line = reg_getline((linenr_T)0); /* relative to the cursor */ 7003 reg_startpos = reg_mmatch->startpos; 7004 reg_endpos = reg_mmatch->endpos; 7005 } 7006 else 7007 { 7008 prog = (nfa_regprog_T *)reg_match->regprog; 7009 reg_startp = reg_match->startp; 7010 reg_endp = reg_match->endp; 7011 } 7012 7013 /* Be paranoid... */ 7014 if (prog == NULL || line == NULL) 7015 { 7016 EMSG(_(e_null)); 7017 goto theend; 7018 } 7019 7020 /* If pattern contains "\c" or "\C": overrule value of ireg_ic */ 7021 if (prog->regflags & RF_ICASE) 7022 ireg_ic = TRUE; 7023 else if (prog->regflags & RF_NOICASE) 7024 ireg_ic = FALSE; 7025 7026 #ifdef FEAT_MBYTE 7027 /* If pattern contains "\Z" overrule value of ireg_icombine */ 7028 if (prog->regflags & RF_ICOMBINE) 7029 ireg_icombine = TRUE; 7030 #endif 7031 7032 regline = line; 7033 reglnum = 0; /* relative to line */ 7034 7035 nfa_has_zend = prog->has_zend; 7036 nfa_has_backref = prog->has_backref; 7037 nfa_nsubexpr = prog->nsubexp; 7038 nfa_listid = 1; 7039 nfa_alt_listid = 2; 7040 nfa_regengine.expr = prog->pattern; 7041 7042 if (prog->reganch && col > 0) 7043 return 0L; 7044 7045 need_clear_subexpr = TRUE; 7046 #ifdef FEAT_SYN_HL 7047 /* Clear the external match subpointers if necessary. */ 7048 if (prog->reghasz == REX_SET) 7049 { 7050 nfa_has_zsubexpr = TRUE; 7051 need_clear_zsubexpr = TRUE; 7052 } 7053 else 7054 nfa_has_zsubexpr = FALSE; 7055 #endif 7056 7057 if (prog->regstart != NUL) 7058 { 7059 /* Skip ahead until a character we know the match must start with. 7060 * When there is none there is no match. */ 7061 if (skip_to_start(prog->regstart, &col) == FAIL) 7062 return 0L; 7063 7064 /* If match_text is set it contains the full text that must match. 7065 * Nothing else to try. Doesn't handle combining chars well. */ 7066 if (prog->match_text != NULL 7067 #ifdef FEAT_MBYTE 7068 && !ireg_icombine 7069 #endif 7070 ) 7071 return find_match_text(col, prog->regstart, prog->match_text); 7072 } 7073 7074 /* If the start column is past the maximum column: no need to try. */ 7075 if (ireg_maxcol > 0 && col >= ireg_maxcol) 7076 goto theend; 7077 7078 nstate = prog->nstate; 7079 for (i = 0; i < nstate; ++i) 7080 { 7081 prog->state[i].id = i; 7082 prog->state[i].lastlist[0] = 0; 7083 prog->state[i].lastlist[1] = 0; 7084 } 7085 7086 retval = nfa_regtry(prog, col, tm); 7087 7088 nfa_regengine.expr = NULL; 7089 7090 theend: 7091 return retval; 7092 } 7093 7094 /* 7095 * Compile a regular expression into internal code for the NFA matcher. 7096 * Returns the program in allocated space. Returns NULL for an error. 7097 */ 7098 static regprog_T * 7099 nfa_regcomp(expr, re_flags) 7100 char_u *expr; 7101 int re_flags; 7102 { 7103 nfa_regprog_T *prog = NULL; 7104 size_t prog_size; 7105 int *postfix; 7106 7107 if (expr == NULL) 7108 return NULL; 7109 7110 nfa_regengine.expr = expr; 7111 nfa_re_flags = re_flags; 7112 7113 init_class_tab(); 7114 7115 if (nfa_regcomp_start(expr, re_flags) == FAIL) 7116 return NULL; 7117 7118 /* Build postfix form of the regexp. Needed to build the NFA 7119 * (and count its size). */ 7120 postfix = re2post(); 7121 if (postfix == NULL) 7122 { 7123 /* TODO: only give this error for debugging? */ 7124 if (post_ptr >= post_end) 7125 EMSGN("Internal error: estimated max number of states insufficient: %ld", post_end - post_start); 7126 goto fail; /* Cascaded (syntax?) error */ 7127 } 7128 7129 /* 7130 * In order to build the NFA, we parse the input regexp twice: 7131 * 1. first pass to count size (so we can allocate space) 7132 * 2. second to emit code 7133 */ 7134 #ifdef ENABLE_LOG 7135 { 7136 FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a"); 7137 7138 if (f != NULL) 7139 { 7140 fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\" ... hold on !\n", expr); 7141 fclose(f); 7142 } 7143 } 7144 #endif 7145 7146 /* 7147 * PASS 1 7148 * Count number of NFA states in "nstate". Do not build the NFA. 7149 */ 7150 post2nfa(postfix, post_ptr, TRUE); 7151 7152 /* allocate the regprog with space for the compiled regexp */ 7153 prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * (nstate - 1); 7154 prog = (nfa_regprog_T *)lalloc(prog_size, TRUE); 7155 if (prog == NULL) 7156 goto fail; 7157 state_ptr = prog->state; 7158 7159 /* 7160 * PASS 2 7161 * Build the NFA 7162 */ 7163 prog->start = post2nfa(postfix, post_ptr, FALSE); 7164 if (prog->start == NULL) 7165 goto fail; 7166 7167 prog->regflags = regflags; 7168 prog->engine = &nfa_regengine; 7169 prog->nstate = nstate; 7170 prog->has_zend = nfa_has_zend; 7171 prog->has_backref = nfa_has_backref; 7172 prog->nsubexp = regnpar; 7173 7174 nfa_postprocess(prog); 7175 7176 prog->reganch = nfa_get_reganch(prog->start, 0); 7177 prog->regstart = nfa_get_regstart(prog->start, 0); 7178 prog->match_text = nfa_get_match_text(prog->start); 7179 7180 #ifdef ENABLE_LOG 7181 nfa_postfix_dump(expr, OK); 7182 nfa_dump(prog); 7183 #endif 7184 #ifdef FEAT_SYN_HL 7185 /* Remember whether this pattern has any \z specials in it. */ 7186 prog->reghasz = re_has_z; 7187 #endif 7188 prog->pattern = vim_strsave(expr); 7189 nfa_regengine.expr = NULL; 7190 7191 out: 7192 vim_free(post_start); 7193 post_start = post_ptr = post_end = NULL; 7194 state_ptr = NULL; 7195 return (regprog_T *)prog; 7196 7197 fail: 7198 vim_free(prog); 7199 prog = NULL; 7200 #ifdef ENABLE_LOG 7201 nfa_postfix_dump(expr, FAIL); 7202 #endif 7203 nfa_regengine.expr = NULL; 7204 goto out; 7205 } 7206 7207 /* 7208 * Free a compiled regexp program, returned by nfa_regcomp(). 7209 */ 7210 static void 7211 nfa_regfree(prog) 7212 regprog_T *prog; 7213 { 7214 if (prog != NULL) 7215 { 7216 vim_free(((nfa_regprog_T *)prog)->match_text); 7217 vim_free(((nfa_regprog_T *)prog)->pattern); 7218 vim_free(prog); 7219 } 7220 } 7221 7222 /* 7223 * Match a regexp against a string. 7224 * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp(). 7225 * Uses curbuf for line count and 'iskeyword'. 7226 * If "line_lbr" is TRUE consider a "\n" in "line" to be a line break. 7227 * 7228 * Returns <= 0 for failure, number of lines contained in the match otherwise. 7229 */ 7230 static int 7231 nfa_regexec_nl(rmp, line, col, line_lbr) 7232 regmatch_T *rmp; 7233 char_u *line; /* string to match against */ 7234 colnr_T col; /* column to start looking for match */ 7235 int line_lbr; 7236 { 7237 reg_match = rmp; 7238 reg_mmatch = NULL; 7239 reg_maxline = 0; 7240 reg_line_lbr = line_lbr; 7241 reg_buf = curbuf; 7242 reg_win = NULL; 7243 ireg_ic = rmp->rm_ic; 7244 #ifdef FEAT_MBYTE 7245 ireg_icombine = FALSE; 7246 #endif 7247 ireg_maxcol = 0; 7248 return nfa_regexec_both(line, col, NULL); 7249 } 7250 7251 7252 /* 7253 * Match a regexp against multiple lines. 7254 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). 7255 * Uses curbuf for line count and 'iskeyword'. 7256 * 7257 * Return <= 0 if there is no match. Return number of lines contained in the 7258 * match otherwise. 7259 * 7260 * Note: the body is the same as bt_regexec() except for nfa_regexec_both() 7261 * 7262 * ! Also NOTE : match may actually be in another line. e.g.: 7263 * when r.e. is \nc, cursor is at 'a' and the text buffer looks like 7264 * 7265 * +-------------------------+ 7266 * |a | 7267 * |b | 7268 * |c | 7269 * | | 7270 * +-------------------------+ 7271 * 7272 * then nfa_regexec_multi() returns 3. while the original 7273 * vim_regexec_multi() returns 0 and a second call at line 2 will return 2. 7274 * 7275 * FIXME if this behavior is not compatible. 7276 */ 7277 static long 7278 nfa_regexec_multi(rmp, win, buf, lnum, col, tm) 7279 regmmatch_T *rmp; 7280 win_T *win; /* window in which to search or NULL */ 7281 buf_T *buf; /* buffer in which to search */ 7282 linenr_T lnum; /* nr of line to start looking for match */ 7283 colnr_T col; /* column to start looking for match */ 7284 proftime_T *tm; /* timeout limit or NULL */ 7285 { 7286 reg_match = NULL; 7287 reg_mmatch = rmp; 7288 reg_buf = buf; 7289 reg_win = win; 7290 reg_firstlnum = lnum; 7291 reg_maxline = reg_buf->b_ml.ml_line_count - lnum; 7292 reg_line_lbr = FALSE; 7293 ireg_ic = rmp->rmm_ic; 7294 #ifdef FEAT_MBYTE 7295 ireg_icombine = FALSE; 7296 #endif 7297 ireg_maxcol = rmp->rmm_maxcol; 7298 7299 return nfa_regexec_both(NULL, col, tm); 7300 } 7301 7302 #ifdef DEBUG 7303 # undef ENABLE_LOG 7304 #endif 7305