1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Henry Spencer.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * @(#)engine.c 8.5 (Berkeley) 3/20/94
36 */
37
38 #include <stdbool.h>
39
40 /*
41 * The matching engine and friends. This file is #included by regexec.c
42 * after suitable #defines of a variety of macros used herein, so that
43 * different state representations can be used without duplicating masses
44 * of code.
45 */
46
47 #ifdef SNAMES
48 #define stepback sstepback
49 #define matcher smatcher
50 #define walk swalk
51 #define dissect sdissect
52 #define backref sbackref
53 #define step sstep
54 #define print sprint
55 #define at sat
56 #define match smat
57 #endif
58 #ifdef LNAMES
59 #define stepback lstepback
60 #define matcher lmatcher
61 #define walk lwalk
62 #define dissect ldissect
63 #define backref lbackref
64 #define step lstep
65 #define print lprint
66 #define at lat
67 #define match lmat
68 #endif
69 #ifdef MNAMES
70 #define stepback mstepback
71 #define matcher mmatcher
72 #define walk mwalk
73 #define dissect mdissect
74 #define backref mbackref
75 #define step mstep
76 #define print mprint
77 #define at mat
78 #define match mmat
79 #endif
80
81 /* another structure passed up and down to avoid zillions of parameters */
82 struct match {
83 struct re_guts *g;
84 int eflags;
85 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
86 const char *offp; /* offsets work from here */
87 const char *beginp; /* start of string -- virtual NUL precedes */
88 const char *endp; /* end of string -- virtual NUL here */
89 const char *coldp; /* can be no match starting before here */
90 const char **lastpos; /* [nplus+1] */
91 STATEVARS;
92 states st; /* current states */
93 states fresh; /* states for a fresh start */
94 states tmp; /* temporary */
95 states empty; /* empty set of states */
96 mbstate_t mbs; /* multibyte conversion state */
97 };
98
99 /* ========= begin header generated by ./mkh ========= */
100 #ifdef __cplusplus
101 extern "C" {
102 #endif
103
104 /* === engine.c === */
105 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
106 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
107 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int);
108 static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast);
109 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags);
110 #define MAX_RECURSION 100
111 #define BOL (OUT-1)
112 #define EOL (BOL-1)
113 #define BOLEOL (BOL-2)
114 #define NOTHING (BOL-3)
115 #define BOW (BOL-4)
116 #define EOW (BOL-5)
117 #define BADCHAR (BOL-6)
118 #define NWBND (BOL-7)
119 #define NONCHAR(c) ((c) <= OUT)
120 /* sflags */
121 #define SBOS 0x0001
122 #define SEOS 0x0002
123
124 #ifdef REDEBUG
125 static void print(struct match *m, const char *caption, states st, int ch, FILE *d);
126 #endif
127 #ifdef REDEBUG
128 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst);
129 #endif
130 #ifdef REDEBUG
131 static const char *pchar(int ch);
132 #endif
133
134 #ifdef __cplusplus
135 }
136 #endif
137 /* ========= end header generated by ./mkh ========= */
138
139 #ifdef REDEBUG
140 #define SP(t, s, c) print(m, t, s, c, stdout)
141 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
142 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); }
143 #else
144 #define SP(t, s, c) /* nothing */
145 #define AT(t, p1, p2, s1, s2) /* nothing */
146 #define NOTE(s) /* nothing */
147 #endif
148
149 /*
150 * Given a multibyte string pointed to by start, step back nchar characters
151 * from current position pointed to by cur.
152 */
153 static const char *
stepback(const char * start,const char * cur,int nchar)154 stepback(const char *start, const char *cur, int nchar)
155 {
156 const char *ret;
157 int wc, mbc;
158 mbstate_t mbs;
159 size_t clen;
160
161 if (MB_CUR_MAX == 1)
162 return ((cur - nchar) > start ? cur - nchar : NULL);
163
164 ret = cur;
165 for (wc = nchar; wc > 0; wc--) {
166 for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) {
167 if ((ret - mbc) < start)
168 return (NULL);
169 memset(&mbs, 0, sizeof(mbs));
170 clen = mbrtowc(NULL, ret - mbc, mbc, &mbs);
171 if (clen != (size_t)-1 && clen != (size_t)-2)
172 break;
173 }
174 if (mbc > MB_CUR_MAX)
175 return (NULL);
176 ret -= mbc;
177 }
178
179 return (ret);
180 }
181
182 /*
183 - matcher - the actual matching engine
184 == static int matcher(struct re_guts *g, const char *string, \
185 == size_t nmatch, regmatch_t pmatch[], int eflags);
186 */
187 static int /* 0 success, REG_NOMATCH failure */
matcher(struct re_guts * g,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)188 matcher(struct re_guts *g,
189 const char *string,
190 size_t nmatch,
191 regmatch_t pmatch[],
192 int eflags)
193 {
194 const char *endp;
195 size_t i;
196 struct match mv;
197 struct match *m = &mv;
198 const char *dp = NULL;
199 const sopno gf = g->firststate+1; /* +1 for OEND */
200 const sopno gl = g->laststate;
201 const char *start;
202 const char *stop;
203 /* Boyer-Moore algorithms variables */
204 const char *pp;
205 int cj, mj;
206 const char *mustfirst;
207 const char *mustlast;
208 int *matchjump;
209 int *charjump;
210
211 /* simplify the situation where possible */
212 if (g->cflags®_NOSUB)
213 nmatch = 0;
214 if (eflags®_STARTEND) {
215 start = string + pmatch[0].rm_so;
216 stop = string + pmatch[0].rm_eo;
217 } else {
218 start = string;
219 stop = start + strlen(start);
220 }
221 if (stop < start)
222 return(REG_INVARG);
223
224 /* prescreening; this does wonders for this rather slow code */
225 if (g->must != NULL) {
226 if (g->charjump != NULL && g->matchjump != NULL) {
227 mustfirst = g->must;
228 mustlast = g->must + g->mlen - 1;
229 charjump = g->charjump;
230 matchjump = g->matchjump;
231 pp = mustlast;
232 for (dp = start+g->mlen-1; dp < stop;) {
233 /* Fast skip non-matches */
234 while (dp < stop && charjump[(int)*dp])
235 dp += charjump[(int)*dp];
236
237 if (dp >= stop)
238 break;
239
240 /* Greedy matcher */
241 /* We depend on not being used for
242 * for strings of length 1
243 */
244 while (*--dp == *--pp && pp != mustfirst);
245
246 if (*dp == *pp)
247 break;
248
249 /* Jump to next possible match */
250 mj = matchjump[pp - mustfirst];
251 cj = charjump[(int)*dp];
252 dp += (cj < mj ? mj : cj);
253 pp = mustlast;
254 }
255 if (pp != mustfirst)
256 return(REG_NOMATCH);
257 } else {
258 for (dp = start; dp < stop; dp++)
259 if (*dp == g->must[0] &&
260 stop - dp >= g->mlen &&
261 memcmp(dp, g->must, (size_t)g->mlen) == 0)
262 break;
263 if (dp == stop) /* we didn't find g->must */
264 return(REG_NOMATCH);
265 }
266 }
267
268 /* match struct setup */
269 m->g = g;
270 m->eflags = eflags;
271 m->pmatch = NULL;
272 m->lastpos = NULL;
273 m->offp = string;
274 m->beginp = start;
275 m->endp = stop;
276 STATESETUP(m, 4);
277 SETUP(m->st);
278 SETUP(m->fresh);
279 SETUP(m->tmp);
280 SETUP(m->empty);
281 CLEAR(m->empty);
282 ZAPSTATE(&m->mbs);
283
284 /* Adjust start according to moffset, to speed things up */
285 if (dp != NULL && g->moffset > -1) {
286 const char *nstart;
287
288 nstart = stepback(start, dp, g->moffset);
289 if (nstart != NULL)
290 start = nstart;
291 }
292
293 SP("mloop", m->st, *start);
294
295 /* this loop does only one repetition except for backrefs */
296 for (;;) {
297 endp = walk(m, start, stop, gf, gl, true);
298 if (endp == NULL) { /* a miss */
299 if (m->pmatch != NULL)
300 free((char *)m->pmatch);
301 if (m->lastpos != NULL)
302 free((char *)m->lastpos);
303 STATETEARDOWN(m);
304 return(REG_NOMATCH);
305 }
306 if (nmatch == 0 && !g->backrefs)
307 break; /* no further info needed */
308
309 /* where? */
310 assert(m->coldp != NULL);
311 for (;;) {
312 NOTE("finding start");
313 endp = walk(m, m->coldp, stop, gf, gl, false);
314 if (endp != NULL)
315 break;
316 assert(m->coldp < m->endp);
317 m->coldp += XMBRTOWC(NULL, m->coldp,
318 m->endp - m->coldp, &m->mbs, 0);
319 }
320 if (nmatch == 1 && !g->backrefs)
321 break; /* no further info needed */
322
323 /* oh my, he wants the subexpressions... */
324 if (m->pmatch == NULL)
325 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
326 sizeof(regmatch_t));
327 if (m->pmatch == NULL) {
328 STATETEARDOWN(m);
329 return(REG_ESPACE);
330 }
331 for (i = 1; i <= m->g->nsub; i++)
332 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
333 if (!g->backrefs && !(m->eflags®_BACKR)) {
334 NOTE("dissecting");
335 dp = dissect(m, m->coldp, endp, gf, gl);
336 } else {
337 if (g->nplus > 0 && m->lastpos == NULL)
338 m->lastpos = malloc((g->nplus+1) *
339 sizeof(const char *));
340 if (g->nplus > 0 && m->lastpos == NULL) {
341 free(m->pmatch);
342 STATETEARDOWN(m);
343 return(REG_ESPACE);
344 }
345 NOTE("backref dissect");
346 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
347 }
348 if (dp != NULL)
349 break;
350
351 /* uh-oh... we couldn't find a subexpression-level match */
352 assert(g->backrefs); /* must be back references doing it */
353 assert(g->nplus == 0 || m->lastpos != NULL);
354 for (;;) {
355 if (dp != NULL || endp <= m->coldp)
356 break; /* defeat */
357 NOTE("backoff");
358 endp = walk(m, m->coldp, endp-1, gf, gl, false);
359 if (endp == NULL)
360 break; /* defeat */
361 /* try it on a shorter possibility */
362 #ifndef NDEBUG
363 for (i = 1; i <= m->g->nsub; i++) {
364 assert(m->pmatch[i].rm_so == -1);
365 assert(m->pmatch[i].rm_eo == -1);
366 }
367 #endif
368 NOTE("backoff dissect");
369 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
370 }
371 assert(dp == NULL || dp == endp);
372 if (dp != NULL) /* found a shorter one */
373 break;
374
375 /* despite initial appearances, there is no match here */
376 NOTE("false alarm");
377 /* recycle starting later */
378 start = m->coldp + XMBRTOWC(NULL, m->coldp,
379 stop - m->coldp, &m->mbs, 0);
380 assert(start <= stop);
381 }
382
383 /* fill in the details if requested */
384 if (nmatch > 0) {
385 pmatch[0].rm_so = m->coldp - m->offp;
386 pmatch[0].rm_eo = endp - m->offp;
387 }
388 if (nmatch > 1) {
389 assert(m->pmatch != NULL);
390 for (i = 1; i < nmatch; i++)
391 if (i <= m->g->nsub)
392 pmatch[i] = m->pmatch[i];
393 else {
394 pmatch[i].rm_so = -1;
395 pmatch[i].rm_eo = -1;
396 }
397 }
398
399 if (m->pmatch != NULL)
400 free((char *)m->pmatch);
401 if (m->lastpos != NULL)
402 free((char *)m->lastpos);
403 STATETEARDOWN(m);
404 return(0);
405 }
406
407 /*
408 - dissect - figure out what matched what, no back references
409 == static const char *dissect(struct match *m, const char *start, \
410 == const char *stop, sopno startst, sopno stopst);
411 */
412 static const char * /* == stop (success) always */
dissect(struct match * m,const char * start,const char * stop,sopno startst,sopno stopst)413 dissect(struct match *m,
414 const char *start,
415 const char *stop,
416 sopno startst,
417 sopno stopst)
418 {
419 int i;
420 sopno ss; /* start sop of current subRE */
421 sopno es; /* end sop of current subRE */
422 const char *sp; /* start of string matched by it */
423 const char *stp; /* string matched by it cannot pass here */
424 const char *rest; /* start of rest of string */
425 const char *tail; /* string unmatched by rest of RE */
426 sopno ssub; /* start sop of subsubRE */
427 sopno esub; /* end sop of subsubRE */
428 const char *ssp; /* start of string matched by subsubRE */
429 const char *sep; /* end of string matched by subsubRE */
430 const char *oldssp; /* previous ssp */
431 const char *dp __unused;
432
433 AT("diss", start, stop, startst, stopst);
434 sp = start;
435 for (ss = startst; ss < stopst; ss = es) {
436 /* identify end of subRE */
437 es = ss;
438 switch (OP(m->g->strip[es])) {
439 case OPLUS_:
440 case OQUEST_:
441 es += OPND(m->g->strip[es]);
442 break;
443 case OCH_:
444 while (OP(m->g->strip[es]) != (sop)O_CH)
445 es += OPND(m->g->strip[es]);
446 break;
447 }
448 es++;
449
450 /* figure out what it matched */
451 switch (OP(m->g->strip[ss])) {
452 case OEND:
453 assert(nope);
454 break;
455 case OCHAR:
456 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
457 break;
458 case OBOL:
459 case OEOL:
460 case OBOW:
461 case OEOW:
462 case OBOS:
463 case OEOS:
464 case OWBND:
465 case ONWBND:
466 break;
467 case OANY:
468 case OANYOF:
469 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
470 break;
471 case OBACK_:
472 case O_BACK:
473 assert(nope);
474 break;
475 /* cases where length of match is hard to find */
476 case OQUEST_:
477 stp = stop;
478 for (;;) {
479 /* how long could this one be? */
480 rest = walk(m, sp, stp, ss, es, false);
481 assert(rest != NULL); /* it did match */
482 /* could the rest match the rest? */
483 tail = walk(m, rest, stop, es, stopst, false);
484 if (tail == stop)
485 break; /* yes! */
486 /* no -- try a shorter match for this one */
487 stp = rest - 1;
488 assert(stp >= sp); /* it did work */
489 }
490 ssub = ss + 1;
491 esub = es - 1;
492 /* did innards match? */
493 if (walk(m, sp, rest, ssub, esub, false) != NULL) {
494 dp = dissect(m, sp, rest, ssub, esub);
495 assert(dp == rest);
496 } else /* no */
497 assert(sp == rest);
498 sp = rest;
499 break;
500 case OPLUS_:
501 stp = stop;
502 for (;;) {
503 /* how long could this one be? */
504 rest = walk(m, sp, stp, ss, es, false);
505 assert(rest != NULL); /* it did match */
506 /* could the rest match the rest? */
507 tail = walk(m, rest, stop, es, stopst, false);
508 if (tail == stop)
509 break; /* yes! */
510 /* no -- try a shorter match for this one */
511 stp = rest - 1;
512 assert(stp >= sp); /* it did work */
513 }
514 ssub = ss + 1;
515 esub = es - 1;
516 ssp = sp;
517 oldssp = ssp;
518 for (;;) { /* find last match of innards */
519 sep = walk(m, ssp, rest, ssub, esub, false);
520 if (sep == NULL || sep == ssp)
521 break; /* failed or matched null */
522 oldssp = ssp; /* on to next try */
523 ssp = sep;
524 }
525 if (sep == NULL) {
526 /* last successful match */
527 sep = ssp;
528 ssp = oldssp;
529 }
530 assert(sep == rest); /* must exhaust substring */
531 assert(walk(m, ssp, sep, ssub, esub, false) == rest);
532 dp = dissect(m, ssp, sep, ssub, esub);
533 assert(dp == sep);
534 sp = rest;
535 break;
536 case OCH_:
537 stp = stop;
538 for (;;) {
539 /* how long could this one be? */
540 rest = walk(m, sp, stp, ss, es, false);
541 assert(rest != NULL); /* it did match */
542 /* could the rest match the rest? */
543 tail = walk(m, rest, stop, es, stopst, false);
544 if (tail == stop)
545 break; /* yes! */
546 /* no -- try a shorter match for this one */
547 stp = rest - 1;
548 assert(stp >= sp); /* it did work */
549 }
550 ssub = ss + 1;
551 esub = ss + OPND(m->g->strip[ss]) - 1;
552 assert(OP(m->g->strip[esub]) == OOR1);
553 for (;;) { /* find first matching branch */
554 if (walk(m, sp, rest, ssub, esub, false) == rest)
555 break; /* it matched all of it */
556 /* that one missed, try next one */
557 assert(OP(m->g->strip[esub]) == OOR1);
558 esub++;
559 assert(OP(m->g->strip[esub]) == OOR2);
560 ssub = esub + 1;
561 esub += OPND(m->g->strip[esub]);
562 if (OP(m->g->strip[esub]) == (sop)OOR2)
563 esub--;
564 else
565 assert(OP(m->g->strip[esub]) == O_CH);
566 }
567 dp = dissect(m, sp, rest, ssub, esub);
568 assert(dp == rest);
569 sp = rest;
570 break;
571 case O_PLUS:
572 case O_QUEST:
573 case OOR1:
574 case OOR2:
575 case O_CH:
576 assert(nope);
577 break;
578 case OLPAREN:
579 i = OPND(m->g->strip[ss]);
580 assert(0 < i && i <= m->g->nsub);
581 m->pmatch[i].rm_so = sp - m->offp;
582 break;
583 case ORPAREN:
584 i = OPND(m->g->strip[ss]);
585 assert(0 < i && i <= m->g->nsub);
586 m->pmatch[i].rm_eo = sp - m->offp;
587 break;
588 default: /* uh oh */
589 assert(nope);
590 break;
591 }
592 }
593
594 assert(sp == stop);
595 return(sp);
596 }
597
598 #define ISBOW(m, sp) \
599 (sp < m->endp && ISWORD(*sp) && \
600 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || \
601 (sp > m->offp && !ISWORD(*(sp-1)))))
602 #define ISEOW(m, sp) \
603 (((sp == m->endp && !(m->eflags®_NOTEOL)) || \
604 (sp < m->endp && *sp == '\n' && \
605 (m->g->cflags®_NEWLINE)) || \
606 (sp < m->endp && !ISWORD(*sp)) ) && \
607 (sp > m->beginp && ISWORD(*(sp-1)))) \
608
609 /*
610 - backref - figure out what matched what, figuring in back references
611 == static const char *backref(struct match *m, const char *start, \
612 == const char *stop, sopno startst, sopno stopst, sopno lev);
613 */
614 static const char * /* == stop (success) or NULL (failure) */
backref(struct match * m,const char * start,const char * stop,sopno startst,sopno stopst,sopno lev,int rec)615 backref(struct match *m,
616 const char *start,
617 const char *stop,
618 sopno startst,
619 sopno stopst,
620 sopno lev, /* PLUS nesting level */
621 int rec)
622 {
623 int i;
624 sopno ss; /* start sop of current subRE */
625 const char *sp; /* start of string matched by it */
626 sopno ssub; /* start sop of subsubRE */
627 sopno esub; /* end sop of subsubRE */
628 const char *ssp; /* start of string matched by subsubRE */
629 const char *dp;
630 size_t len;
631 int hard;
632 sop s;
633 regoff_t offsave;
634 cset *cs;
635 wint_t wc;
636
637 AT("back", start, stop, startst, stopst);
638 sp = start;
639
640 /* get as far as we can with easy stuff */
641 hard = 0;
642 for (ss = startst; !hard && ss < stopst; ss++)
643 switch (OP(s = m->g->strip[ss])) {
644 case OCHAR:
645 if (sp == stop)
646 return(NULL);
647 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
648 if (wc != OPND(s))
649 return(NULL);
650 break;
651 case OANY:
652 if (sp == stop)
653 return(NULL);
654 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
655 if (wc == BADCHAR)
656 return (NULL);
657 break;
658 case OANYOF:
659 if (sp == stop)
660 return (NULL);
661 cs = &m->g->sets[OPND(s)];
662 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
663 if (wc == BADCHAR || !CHIN(cs, wc))
664 return(NULL);
665 break;
666 case OBOS:
667 if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0)
668 { /* yes */ }
669 else
670 return(NULL);
671 break;
672 case OEOS:
673 if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0)
674 { /* yes */ }
675 else
676 return(NULL);
677 break;
678 case OBOL:
679 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) ||
680 (sp > m->offp && sp < m->endp &&
681 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE)))
682 { /* yes */ }
683 else
684 return(NULL);
685 break;
686 case OEOL:
687 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
688 (sp < m->endp && *sp == '\n' &&
689 (m->g->cflags®_NEWLINE)) )
690 { /* yes */ }
691 else
692 return(NULL);
693 break;
694 case OWBND:
695 if (ISBOW(m, sp) || ISEOW(m, sp))
696 { /* yes */ }
697 else
698 return(NULL);
699 break;
700 case ONWBND:
701 if (((sp == m->beginp) && !ISWORD(*sp)) ||
702 (sp == m->endp && !ISWORD(*(sp - 1))))
703 { /* yes, beginning/end of subject */ }
704 else if (ISWORD(*(sp - 1)) == ISWORD(*sp))
705 { /* yes, beginning/end of subject */ }
706 else
707 return(NULL);
708 break;
709 case OBOW:
710 if (ISBOW(m, sp))
711 { /* yes */ }
712 else
713 return(NULL);
714 break;
715 case OEOW:
716 if (ISEOW(m, sp))
717 { /* yes */ }
718 else
719 return(NULL);
720 break;
721 case O_QUEST:
722 break;
723 case OOR1: /* matches null but needs to skip */
724 ss++;
725 s = m->g->strip[ss];
726 do {
727 assert(OP(s) == OOR2);
728 ss += OPND(s);
729 } while (OP(s = m->g->strip[ss]) != (sop)O_CH);
730 /* note that the ss++ gets us past the O_CH */
731 break;
732 default: /* have to make a choice */
733 hard = 1;
734 break;
735 }
736 if (!hard) { /* that was it! */
737 if (sp != stop)
738 return(NULL);
739 return(sp);
740 }
741 ss--; /* adjust for the for's final increment */
742
743 /* the hard stuff */
744 AT("hard", sp, stop, ss, stopst);
745 s = m->g->strip[ss];
746 switch (OP(s)) {
747 case OBACK_: /* the vilest depths */
748 i = OPND(s);
749 assert(0 < i && i <= m->g->nsub);
750 if (m->pmatch[i].rm_eo == -1)
751 return(NULL);
752 assert(m->pmatch[i].rm_so != -1);
753 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
754 if (len == 0 && rec++ > MAX_RECURSION)
755 return(NULL);
756 assert(stop - m->beginp >= len);
757 if (sp > stop - len)
758 return(NULL); /* not enough left to match */
759 ssp = m->offp + m->pmatch[i].rm_so;
760 if (memcmp(sp, ssp, len) != 0)
761 return(NULL);
762 while (m->g->strip[ss] != (sop)SOP(O_BACK, i))
763 ss++;
764 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
765 case OQUEST_: /* to null or not */
766 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
767 if (dp != NULL)
768 return(dp); /* not */
769 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
770 case OPLUS_:
771 assert(m->lastpos != NULL);
772 assert(lev+1 <= m->g->nplus);
773 m->lastpos[lev+1] = sp;
774 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
775 case O_PLUS:
776 if (sp == m->lastpos[lev]) /* last pass matched null */
777 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
778 /* try another pass */
779 m->lastpos[lev] = sp;
780 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
781 if (dp == NULL)
782 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
783 else
784 return(dp);
785 case OCH_: /* find the right one, if any */
786 ssub = ss + 1;
787 esub = ss + OPND(s) - 1;
788 assert(OP(m->g->strip[esub]) == OOR1);
789 for (;;) { /* find first matching branch */
790 dp = backref(m, sp, stop, ssub, esub, lev, rec);
791 if (dp != NULL)
792 return(dp);
793 /* that one missed, try next one */
794 if (OP(m->g->strip[esub]) == (sop)O_CH)
795 return(NULL); /* there is none */
796 esub++;
797 assert(OP(m->g->strip[esub]) == (sop)OOR2);
798 ssub = esub + 1;
799 esub += OPND(m->g->strip[esub]);
800 if (OP(m->g->strip[esub]) == (sop)OOR2)
801 esub--;
802 else
803 assert(OP(m->g->strip[esub]) == O_CH);
804 }
805 /* NOTREACHED */
806 break;
807 case OLPAREN: /* must undo assignment if rest fails */
808 i = OPND(s);
809 assert(0 < i && i <= m->g->nsub);
810 offsave = m->pmatch[i].rm_so;
811 m->pmatch[i].rm_so = sp - m->offp;
812 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
813 if (dp != NULL)
814 return(dp);
815 m->pmatch[i].rm_so = offsave;
816 return(NULL);
817 case ORPAREN: /* must undo assignment if rest fails */
818 i = OPND(s);
819 assert(0 < i && i <= m->g->nsub);
820 offsave = m->pmatch[i].rm_eo;
821 m->pmatch[i].rm_eo = sp - m->offp;
822 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
823 if (dp != NULL)
824 return(dp);
825 m->pmatch[i].rm_eo = offsave;
826 return(NULL);
827 default: /* uh oh */
828 assert(nope);
829 break;
830 }
831
832 /* "can't happen" */
833 assert(nope);
834 /* NOTREACHED */
835 return "shut up gcc";
836 }
837
838 /*
839 - walk - step through the string either quickly or slowly
840 == static const char *walk(struct match *m, const char *start, \
841 == const char *stop, sopno startst, sopno stopst, bool fast);
842 */
843 static const char * /* where it ended, or NULL */
walk(struct match * m,const char * start,const char * stop,sopno startst,sopno stopst,bool fast)844 walk(struct match *m, const char *start, const char *stop, sopno startst,
845 sopno stopst, bool fast)
846 {
847 states st = m->st;
848 states fresh = m->fresh;
849 states empty = m->empty;
850 states tmp = m->tmp;
851 const char *p = start;
852 wint_t c;
853 wint_t lastc; /* previous c */
854 wint_t flagch;
855 int i, sflags;
856 const char *matchp; /* last p at which a match ended */
857 size_t clen;
858
859 sflags = 0;
860 AT("slow", start, stop, startst, stopst);
861 CLEAR(st);
862 SET1(st, startst);
863 SP("sstart", st, *p);
864 st = step(m->g, startst, stopst, st, NOTHING, st, sflags);
865 if (fast)
866 ASSIGN(fresh, st);
867 matchp = NULL;
868 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL)))
869 c = OUT;
870 else {
871 /*
872 * XXX Wrong if the previous character was multi-byte.
873 * Newline never is (in encodings supported by FreeBSD),
874 * so this only breaks the ISWORD tests below.
875 */
876 c = (uch)*(start - 1);
877 }
878 for (;;) {
879 /* next character */
880 lastc = c;
881 sflags = 0;
882 if (p == m->endp) {
883 c = OUT;
884 clen = 0;
885 } else
886 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
887
888 if (fast && EQ(st, fresh))
889 matchp = p;
890
891 /* is there an EOL and/or BOL between lastc and c? */
892 flagch = '\0';
893 i = 0;
894 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
895 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
896 flagch = BOL;
897 i = m->g->nbol;
898 }
899 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
900 (c == OUT && !(m->eflags®_NOTEOL)) ) {
901 flagch = (flagch == BOL) ? BOLEOL : EOL;
902 i += m->g->neol;
903 }
904 if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) {
905 sflags |= SBOS;
906 /* Step one more for BOS. */
907 i++;
908 }
909 if (c == OUT && (m->eflags & REG_NOTEOL) == 0) {
910 sflags |= SEOS;
911 /* Step one more for EOS. */
912 i++;
913 }
914 if (i != 0) {
915 for (; i > 0; i--)
916 st = step(m->g, startst, stopst, st, flagch, st,
917 sflags);
918 SP("sboleol", st, c);
919 }
920
921 /* how about a word boundary? */
922 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
923 (c != OUT && ISWORD(c)) ) {
924 flagch = BOW;
925 }
926 if ( (lastc != OUT && ISWORD(lastc)) &&
927 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
928 flagch = EOW;
929 }
930 if (flagch == BOW || flagch == EOW) {
931 st = step(m->g, startst, stopst, st, flagch, st, sflags);
932 SP("sboweow", st, c);
933 }
934 if (lastc != OUT && c != OUT &&
935 ISWORD(lastc) == ISWORD(c)) {
936 flagch = NWBND;
937 } else if ((lastc == OUT && !ISWORD(c)) ||
938 (c == OUT && !ISWORD(lastc))) {
939 flagch = NWBND;
940 }
941 if (flagch == NWBND) {
942 st = step(m->g, startst, stopst, st, flagch, st, sflags);
943 SP("snwbnd", st, c);
944 }
945
946 /* are we done? */
947 if (ISSET(st, stopst)) {
948 if (fast)
949 break;
950 else
951 matchp = p;
952 }
953 if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p))
954 break; /* NOTE BREAK OUT */
955
956 /* no, we must deal with this character */
957 ASSIGN(tmp, st);
958 if (fast)
959 ASSIGN(st, fresh);
960 else
961 ASSIGN(st, empty);
962 assert(c != OUT);
963 st = step(m->g, startst, stopst, tmp, c, st, sflags);
964 SP("saft", st, c);
965 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags),
966 st));
967 p += clen;
968 }
969
970 if (fast) {
971 assert(matchp != NULL);
972 m->coldp = matchp;
973 if (ISSET(st, stopst))
974 return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
975 else
976 return (NULL);
977 } else
978 return (matchp);
979 }
980
981 /*
982 - step - map set of states reachable before char to set reachable after
983 == static states step(struct re_guts *g, sopno start, sopno stop, \
984 == states bef, int ch, states aft);
985 == #define BOL (OUT-1)
986 == #define EOL (BOL-1)
987 == #define BOLEOL (BOL-2)
988 == #define NOTHING (BOL-3)
989 == #define BOW (BOL-4)
990 == #define EOW (BOL-5)
991 == #define BADCHAR (BOL-6)
992 == #define NONCHAR(c) ((c) <= OUT)
993 */
994 static states
step(struct re_guts * g,sopno start,sopno stop,states bef,wint_t ch,states aft,int sflags)995 step(struct re_guts *g,
996 sopno start, /* start state within strip */
997 sopno stop, /* state after stop state within strip */
998 states bef, /* states reachable before */
999 wint_t ch, /* character or NONCHAR code */
1000 states aft, /* states already known reachable after */
1001 int sflags) /* state flags */
1002 {
1003 cset *cs;
1004 sop s;
1005 sopno pc;
1006 onestate here; /* note, macros know this name */
1007 sopno look;
1008 int i;
1009
1010 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
1011 s = g->strip[pc];
1012 switch (OP(s)) {
1013 case OEND:
1014 assert(pc == stop-1);
1015 break;
1016 case OCHAR:
1017 /* only characters can match */
1018 assert(!NONCHAR(ch) || ch != OPND(s));
1019 if (ch == OPND(s))
1020 FWD(aft, bef, 1);
1021 break;
1022 case OBOS:
1023 if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0)
1024 FWD(aft, bef, 1);
1025 break;
1026 case OEOS:
1027 if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0)
1028 FWD(aft, bef, 1);
1029 break;
1030 case OBOL:
1031 if (ch == BOL || ch == BOLEOL)
1032 FWD(aft, bef, 1);
1033 break;
1034 case OEOL:
1035 if (ch == EOL || ch == BOLEOL)
1036 FWD(aft, bef, 1);
1037 break;
1038 case OBOW:
1039 if (ch == BOW)
1040 FWD(aft, bef, 1);
1041 break;
1042 case OEOW:
1043 if (ch == EOW)
1044 FWD(aft, bef, 1);
1045 break;
1046 case OWBND:
1047 if (ch == BOW || ch == EOW)
1048 FWD(aft, bef, 1);
1049 break;
1050 case ONWBND:
1051 if (ch == NWBND)
1052 FWD(aft, aft, 1);
1053 break;
1054 case OANY:
1055 if (!NONCHAR(ch))
1056 FWD(aft, bef, 1);
1057 break;
1058 case OANYOF:
1059 cs = &g->sets[OPND(s)];
1060 if (!NONCHAR(ch) && CHIN(cs, ch))
1061 FWD(aft, bef, 1);
1062 break;
1063 case OBACK_: /* ignored here */
1064 case O_BACK:
1065 FWD(aft, aft, 1);
1066 break;
1067 case OPLUS_: /* forward, this is just an empty */
1068 FWD(aft, aft, 1);
1069 break;
1070 case O_PLUS: /* both forward and back */
1071 FWD(aft, aft, 1);
1072 i = ISSETBACK(aft, OPND(s));
1073 BACK(aft, aft, OPND(s));
1074 if (!i && ISSETBACK(aft, OPND(s))) {
1075 /* oho, must reconsider loop body */
1076 pc -= OPND(s) + 1;
1077 INIT(here, pc);
1078 }
1079 break;
1080 case OQUEST_: /* two branches, both forward */
1081 FWD(aft, aft, 1);
1082 FWD(aft, aft, OPND(s));
1083 break;
1084 case O_QUEST: /* just an empty */
1085 FWD(aft, aft, 1);
1086 break;
1087 case OLPAREN: /* not significant here */
1088 case ORPAREN:
1089 FWD(aft, aft, 1);
1090 break;
1091 case OCH_: /* mark the first two branches */
1092 FWD(aft, aft, 1);
1093 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2);
1094 FWD(aft, aft, OPND(s));
1095 break;
1096 case OOR1: /* done a branch, find the O_CH */
1097 if (ISSTATEIN(aft, here)) {
1098 for (look = 1;
1099 OP(s = g->strip[pc+look]) != (sop)O_CH;
1100 look += OPND(s))
1101 assert(OP(s) == (sop)OOR2);
1102 FWD(aft, aft, look + 1);
1103 }
1104 break;
1105 case OOR2: /* propagate OCH_'s marking */
1106 FWD(aft, aft, 1);
1107 if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) {
1108 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2);
1109 FWD(aft, aft, OPND(s));
1110 }
1111 break;
1112 case O_CH: /* just empty */
1113 FWD(aft, aft, 1);
1114 break;
1115 default: /* ooooops... */
1116 assert(nope);
1117 break;
1118 }
1119 }
1120
1121 return(aft);
1122 }
1123
1124 #ifdef REDEBUG
1125 /*
1126 - print - print a set of states
1127 == #ifdef REDEBUG
1128 == static void print(struct match *m, const char *caption, states st, \
1129 == int ch, FILE *d);
1130 == #endif
1131 */
1132 static void
print(struct match * m,const char * caption,states st,int ch,FILE * d)1133 print(struct match *m,
1134 const char *caption,
1135 states st,
1136 int ch,
1137 FILE *d)
1138 {
1139 struct re_guts *g = m->g;
1140 sopno i;
1141 int first = 1;
1142
1143 if (!(m->eflags®_TRACE))
1144 return;
1145
1146 fprintf(d, "%s", caption);
1147 if (ch != '\0')
1148 fprintf(d, " %s", pchar(ch));
1149 for (i = 0; i < g->nstates; i++)
1150 if (ISSET(st, i)) {
1151 fprintf(d, "%s%lu", (first) ? "\t" : ", ", i);
1152 first = 0;
1153 }
1154 fprintf(d, "\n");
1155 }
1156
1157 /*
1158 - at - print current situation
1159 == #ifdef REDEBUG
1160 == static void at(struct match *m, const char *title, const char *start, \
1161 == const char *stop, sopno startst, sopno stopst);
1162 == #endif
1163 */
1164 static void
at(struct match * m,const char * title,const char * start,const char * stop,sopno startst,sopno stopst)1165 at( struct match *m,
1166 const char *title,
1167 const char *start,
1168 const char *stop,
1169 sopno startst,
1170 sopno stopst)
1171 {
1172 if (!(m->eflags®_TRACE))
1173 return;
1174
1175 printf("%s %s-", title, pchar(*start));
1176 printf("%s ", pchar(*stop));
1177 printf("%ld-%ld\n", (long)startst, (long)stopst);
1178 }
1179
1180 #ifndef PCHARDONE
1181 #define PCHARDONE /* never again */
1182 /*
1183 - pchar - make a character printable
1184 == #ifdef REDEBUG
1185 == static const char *pchar(int ch);
1186 == #endif
1187 *
1188 * Is this identical to regchar() over in debug.c? Well, yes. But a
1189 * duplicate here avoids having a debugging-capable regexec.o tied to
1190 * a matching debug.o, and this is convenient. It all disappears in
1191 * the non-debug compilation anyway, so it doesn't matter much.
1192 */
1193 static const char * /* -> representation */
pchar(int ch)1194 pchar(int ch)
1195 {
1196 static char pbuf[10];
1197
1198 if (isprint((uch)ch) || ch == ' ')
1199 sprintf(pbuf, "%c", ch);
1200 else
1201 sprintf(pbuf, "\\%o", ch);
1202 return(pbuf);
1203 }
1204 #endif
1205 #endif
1206
1207 #undef stepback
1208 #undef matcher
1209 #undef walk
1210 #undef dissect
1211 #undef backref
1212 #undef step
1213 #undef print
1214 #undef at
1215 #undef match
1216