1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1992 Diomidis Spinellis.
5 * Copyright (c) 1992, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Diomidis Spinellis of Imperial College, University of London.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36 #include <sys/cdefs.h>
37 #ifndef lint
38 static const char sccsid[] = "@(#)compile.c 8.1 (Berkeley) 6/6/93";
39 #endif
40
41 #include <sys/types.h>
42 #include <sys/stat.h>
43
44 #include <ctype.h>
45 #include <err.h>
46 #include <errno.h>
47 #include <fcntl.h>
48 #include <limits.h>
49 #include <regex.h>
50 #include <stdbool.h>
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <wchar.h>
55
56 #include "defs.h"
57 #include "extern.h"
58
/*
 * Label hash table.  Every ':' label is entered into one of LHSZ buckets,
 * selected by the low bits (LHMASK) of the hash of its name; entries that
 * land in the same bucket are chained through lh_next.
 */
#define LHSZ	128
#define LHMASK	(LHSZ - 1)
static struct labhash {
	struct	labhash *lh_next;	/* Next entry in the same bucket */
	u_int	lh_hash;		/* Full (unmasked) hash of the name */
	struct	s_command *lh_cmd;	/* The ':' command; its t is the name */
	int	lh_ref;			/* Set to 1 once findlabel() hits it */
} *labels[LHSZ];
67
/* Forward declarations of the file-local helpers defined below. */
static char	 *compile_addr(char *, struct s_addr *);
static char	 *compile_ccl(char **, char *);
static char	 *compile_delimited(char *, char *, int);
static char	 *compile_flags(char *, struct s_subst *);
static regex_t	 *compile_re(char *, int);
static char	 *compile_subst(char *, struct s_subst *);
static char	 *compile_text(void);
static char	 *compile_tr(char *, struct s_tr **);
static struct s_command
		**compile_stream(struct s_command **);
static char	 *duptoeol(char *, const char *);
static void	  enterlabel(struct s_command *);
static struct s_command
		 *findlabel(char *);
static void	  fixuplabel(struct s_command *, struct s_command *);
static void	  uselabel(void);
84
/*
 * Command specification.  This is used to drive the command parser:
 * compile_stream() looks the command character up in a table of these
 * entries to learn how many addresses the command accepts and which kind
 * of argument follows it.
 */
struct s_format {
	char code;				/* Command code */
	int naddr;				/* Maximum number of address args */
	enum e_args args;			/* Argument type */
};
93
/*
 * Table of every recognised command.  compile_stream() scans it linearly
 * and stops at the entry whose code is '\0', so that entry must stay last.
 */
static struct s_format cmd_fmts[] = {
	{'{', 2, GROUP},
	{'}', 0, ENDGROUP},
	{'a', 1, TEXT},
	{'b', 2, BRANCH},
	{'c', 2, TEXT},
	{'d', 2, EMPTY},
	{'D', 2, EMPTY},
	{'g', 2, EMPTY},
	{'G', 2, EMPTY},
	{'h', 2, EMPTY},
	{'H', 2, EMPTY},
	{'i', 1, TEXT},
	{'l', 2, EMPTY},
	{'n', 2, EMPTY},
	{'N', 2, EMPTY},
	{'p', 2, EMPTY},
	{'P', 2, EMPTY},
	{'q', 1, EMPTY},
	{'r', 1, RFILE},
	{'s', 2, SUBST},
	{'t', 2, BRANCH},
	{'w', 2, WFILE},
	{'x', 2, EMPTY},
	{'y', 2, TR},
	{'!', 2, NONSEL},
	{':', 0, LABEL},
	{'#', 0, COMMENT},
	{'=', 1, EMPTY},
	{'\0', 0, COMMENT},	/* Sentinel: terminates the lookup loop */
};
125
/* The compiled program: head of the command list filled in by compile(). */
struct s_command *prog;
128
129 /*
130 * Compile the program into prog.
131 * Initialise appends.
132 */
133 void
compile(void)134 compile(void)
135 {
136 *compile_stream(&prog) = NULL;
137 fixuplabel(prog, NULL);
138 uselabel();
139 if (appendnum == 0)
140 appends = NULL;
141 else if ((appends = malloc(sizeof(struct s_appends) * appendnum)) ==
142 NULL)
143 err(1, "malloc");
144 if ((match = malloc((maxnsub + 1) * sizeof(regmatch_t))) == NULL)
145 err(1, "malloc");
146 }
147
/*
 * Advance the char pointer named `p' in the calling scope past any
 * whitespace.  Does nothing when p is NULL.  The cast keeps isspace()
 * away from negative char values.
 */
#define EATSPACE() do {							\
	if (p)								\
		while (*p && isspace((unsigned char)*p))		\
			p++;						\
} while (0)
153
154 static struct s_command **
compile_stream(struct s_command ** link)155 compile_stream(struct s_command **link)
156 {
157 char *p;
158 static char lbuf[_POSIX2_LINE_MAX + 1]; /* To save stack */
159 struct s_command *cmd, *cmd2, *stack;
160 struct s_format *fp;
161 char re[_POSIX2_LINE_MAX + 1];
162 int naddr; /* Number of addresses */
163
164 stack = NULL;
165 for (;;) {
166 if ((p = cu_fgets(lbuf, sizeof(lbuf), NULL)) == NULL) {
167 if (stack != NULL)
168 errx(1, "%lu: %s: unexpected EOF (pending }'s)",
169 linenum, fname);
170 return (link);
171 }
172
173 semicolon: EATSPACE();
174 if (p) {
175 if (*p == '#' || *p == '\0')
176 continue;
177 else if (*p == ';') {
178 p++;
179 goto semicolon;
180 }
181 }
182 if ((*link = cmd = malloc(sizeof(struct s_command))) == NULL)
183 err(1, "malloc");
184 link = &cmd->next;
185 cmd->startline = cmd->nonsel = 0;
186 /* First parse the addresses */
187 naddr = 0;
188
189 /* Valid characters to start an address */
190 #define addrchar(c) (strchr("0123456789/\\$", (c)))
191 if (addrchar(*p)) {
192 naddr++;
193 if ((cmd->a1 = malloc(sizeof(struct s_addr))) == NULL)
194 err(1, "malloc");
195 p = compile_addr(p, cmd->a1);
196 EATSPACE(); /* EXTENSION */
197 if (*p == ',') {
198 p++;
199 EATSPACE(); /* EXTENSION */
200 naddr++;
201 if ((cmd->a2 = malloc(sizeof(struct s_addr)))
202 == NULL)
203 err(1, "malloc");
204 p = compile_addr(p, cmd->a2);
205 EATSPACE();
206 } else
207 cmd->a2 = NULL;
208 } else
209 cmd->a1 = cmd->a2 = NULL;
210
211 nonsel: /* Now parse the command */
212 if (!*p)
213 errx(1, "%lu: %s: command expected", linenum, fname);
214 cmd->code = *p;
215 for (fp = cmd_fmts; fp->code; fp++)
216 if (fp->code == *p)
217 break;
218 if (!fp->code)
219 errx(1, "%lu: %s: invalid command code %c", linenum, fname, *p);
220 if (naddr > fp->naddr)
221 errx(1,
222 "%lu: %s: command %c expects up to %d address(es), found %d",
223 linenum, fname, *p, fp->naddr, naddr);
224 switch (fp->args) {
225 case NONSEL: /* ! */
226 p++;
227 EATSPACE();
228 cmd->nonsel = 1;
229 goto nonsel;
230 case GROUP: /* { */
231 p++;
232 EATSPACE();
233 cmd->next = stack;
234 stack = cmd;
235 link = &cmd->u.c;
236 if (*p)
237 goto semicolon;
238 break;
239 case ENDGROUP:
240 /*
241 * Short-circuit command processing, since end of
242 * group is really just a noop.
243 */
244 cmd->nonsel = 1;
245 if (stack == NULL)
246 errx(1, "%lu: %s: unexpected }", linenum, fname);
247 cmd2 = stack;
248 stack = cmd2->next;
249 cmd2->next = cmd;
250 /*FALLTHROUGH*/
251 case EMPTY: /* d D g G h H l n N p P q x = \0 */
252 p++;
253 EATSPACE();
254 if (*p == ';') {
255 p++;
256 link = &cmd->next;
257 goto semicolon;
258 }
259 if (*p)
260 errx(1, "%lu: %s: extra characters at the end of %c command",
261 linenum, fname, cmd->code);
262 break;
263 case TEXT: /* a c i */
264 p++;
265 EATSPACE();
266 if (*p != '\\')
267 errx(1,
268 "%lu: %s: command %c expects \\ followed by text", linenum, fname, cmd->code);
269 p++;
270 EATSPACE();
271 if (*p)
272 errx(1,
273 "%lu: %s: extra characters after \\ at the end of %c command",
274 linenum, fname, cmd->code);
275 cmd->t = compile_text();
276 break;
277 case COMMENT: /* \0 # */
278 break;
279 case WFILE: /* w */
280 p++;
281 EATSPACE();
282 if (*p == '\0')
283 errx(1, "%lu: %s: filename expected", linenum, fname);
284 cmd->t = duptoeol(p, "w command");
285 if (aflag)
286 cmd->u.fd = -1;
287 else if ((cmd->u.fd = open(p,
288 O_WRONLY|O_APPEND|O_CREAT|O_TRUNC,
289 DEFFILEMODE)) == -1)
290 err(1, "%s", p);
291 break;
292 case RFILE: /* r */
293 p++;
294 EATSPACE();
295 if (*p == '\0')
296 errx(1, "%lu: %s: filename expected", linenum, fname);
297 else
298 cmd->t = duptoeol(p, "read command");
299 break;
300 case BRANCH: /* b t */
301 p++;
302 EATSPACE();
303 if (*p == '\0')
304 cmd->t = NULL;
305 else
306 cmd->t = duptoeol(p, "branch");
307 break;
308 case LABEL: /* : */
309 p++;
310 EATSPACE();
311 cmd->t = duptoeol(p, "label");
312 if (strlen(p) == 0)
313 errx(1, "%lu: %s: empty label", linenum, fname);
314 enterlabel(cmd);
315 break;
316 case SUBST: /* s */
317 p++;
318 if (*p == '\0' || *p == '\\')
319 errx(1,
320 "%lu: %s: substitute pattern can not be delimited by newline or backslash",
321 linenum, fname);
322 if ((cmd->u.s = calloc(1, sizeof(struct s_subst))) == NULL)
323 err(1, "malloc");
324 p = compile_delimited(p, re, 0);
325 if (p == NULL)
326 errx(1,
327 "%lu: %s: unterminated substitute pattern", linenum, fname);
328
329 /* Compile RE with no case sensitivity temporarily */
330 if (*re == '\0')
331 cmd->u.s->re = NULL;
332 else
333 cmd->u.s->re = compile_re(re, 0);
334 --p;
335 p = compile_subst(p, cmd->u.s);
336 p = compile_flags(p, cmd->u.s);
337
338 /* Recompile RE with case sensitivity from "I" flag if any */
339 if (*re == '\0')
340 cmd->u.s->re = NULL;
341 else
342 cmd->u.s->re = compile_re(re, cmd->u.s->icase);
343 EATSPACE();
344 if (*p == ';') {
345 p++;
346 link = &cmd->next;
347 goto semicolon;
348 }
349 break;
350 case TR: /* y */
351 p++;
352 p = compile_tr(p, &cmd->u.y);
353 EATSPACE();
354 if (*p == ';') {
355 p++;
356 link = &cmd->next;
357 goto semicolon;
358 }
359 if (*p)
360 errx(1,
361 "%lu: %s: extra text at the end of a transform command", linenum, fname);
362 break;
363 }
364 }
365 }
366
/*
 * Convert the first `len' (1 or 2) characters of `in', read as a base-16
 * number by strtol(), into a single char stored through `out'.
 *
 * Returns 0 on success.  Returns ERANGE, leaving *out untouched, when
 * strtol() does not consume the whole copied text or reports an error.
 */
static int
hex2char(const char *in, char *out, int len)
{
	char digits[3] = { '\0', '\0', '\0' };
	char *stop;
	long value;

	/* Work on a private, NUL-terminated copy of at most two chars. */
	digits[0] = in[0];
	if (len > 1)
		digits[1] = in[1];

	errno = 0;
	value = strtol(digits, &stop, 16);
	if (errno != 0 || *stop != '\0')
		return (ERANGE);

	*out = (char)value;
	return (0);
}
384
/*
 * Return true when `c' is a hexadecimal digit (0-9, a-f, A-F).
 *
 * The argument is converted to unsigned char before it reaches <ctype.h>:
 * passing a negative char value to those functions is undefined.
 */
static bool
hexdigit(char c)
{

	return (isxdigit((unsigned char)c) != 0);
}
393
/*
 * Decode the one or two hexadecimal digits at the start of `in'.
 *
 * On success the decoded character is stored through `out', the number of
 * digits consumed (1 or 2) through `len', and true is returned.  Returns
 * false, leaving *len untouched, when `in' does not start with a hex digit
 * or the conversion fails.
 */
static bool
dohex(const char *in, char *out, int *len)
{
	int ndigits;

	if (!hexdigit(in[0]))
		return (false);

	/* in[0] is not NUL here, so looking at in[1] is safe. */
	ndigits = hexdigit(in[1]) ? 2 : 1;
	if (hex2char(in, out, ndigits) != 0)
		return (false);

	*len = ndigits;
	return (true);
}
411
412 /*
413 * Get a delimited string. P points to the delimiter of the string; d points
414 * to a buffer area. Newline and delimiter escapes are processed; other
415 * escapes are ignored.
416 *
417 * Returns a pointer to the first character after the final delimiter or NULL
418 * in the case of a non-terminated string. The character array d is filled
419 * with the processed string.
420 */
421 static char *
compile_delimited(char * p,char * d,int is_tr)422 compile_delimited(char *p, char *d, int is_tr)
423 {
424 int hexlen;
425 char c;
426
427 c = *p++;
428 if (c == '\0')
429 return (NULL);
430 else if (c == '\\')
431 errx(1, "%lu: %s: \\ can not be used as a string delimiter",
432 linenum, fname);
433 else if (c == '\n')
434 errx(1, "%lu: %s: newline can not be used as a string delimiter",
435 linenum, fname);
436 while (*p) {
437 if (*p == '[' && *p != c) {
438 if (!is_tr) {
439 if ((d = compile_ccl(&p, d)) == NULL) {
440 errx(1,
441 "%lu: %s: unbalanced brackets ([])",
442 linenum, fname);
443 }
444 continue;
445 }
446 } else if (*p == '\\' && p[1] == '[') {
447 if (is_tr)
448 p++;
449 else
450 *d++ = *p++;
451 } else if (*p == '\\' && p[1] == c) {
452 p++;
453 } else if (*p == '\\' &&
454 (p[1] == 'n' || p[1] == 'r' || p[1] == 't')) {
455 switch (p[1]) {
456 case 'n':
457 *d++ = '\n';
458 break;
459 case 'r':
460 *d++ = '\r';
461 break;
462 case 't':
463 *d++ = '\t';
464 break;
465 }
466 p += 2;
467 continue;
468 } else if (*p == '\\' && p[1] == 'x') {
469 if (dohex(&p[2], d, &hexlen)) {
470 ++d;
471 p += hexlen + 2;
472 continue;
473 }
474 } else if (*p == '\\' && p[1] == '\\') {
475 if (is_tr)
476 p++;
477 else
478 *d++ = *p++;
479 } else if (*p == c) {
480 *d = '\0';
481 return (p + 1);
482 }
483 *d++ = *p++;
484 }
485 return (NULL);
486 }
487
488
/*
 * compile_ccl: copy one bracket expression from *sp into t.
 *
 * *sp must point at the opening '['.  A leading '^' and a ']' directly
 * after it are copied literally.  Inside the expression, "[." "[:" and
 * "[=" constructs are copied through to their matching ".]" ":]" "=]",
 * and the escapes \n, \r, \t and \xH[H] are replaced by the characters
 * they denote.
 *
 * On success *sp is advanced past the closing ']' and the position in t
 * just after the copied ']' is returned.  Returns NULL, leaving *sp
 * unchanged, when the expression is not terminated.
 */
static char *
compile_ccl(char **sp, char *t)
{
	char *src = *sp;
	int prev, kind, hexlen;

	*t++ = *src++;			/* The opening '[' */
	if (*src == '^')
		*t++ = *src++;
	if (*src == ']')
		*t++ = *src++;

	while (*src != '\0') {
		*t = *src;
		if (*src == ']')
			break;

		if (*src == '[' &&
		    (src[1] == '.' || src[1] == ':' || src[1] == '=')) {
			/* Copy the "[x" opener, then scan for "x]". */
			kind = src[1];
			t[1] = src[1];
			t += 2;
			src += 2;
			prev = *src;
			for (;;) {
				*t = *src;
				if (*src == ']' && prev == kind)
					break;
				prev = *src;
				if (prev == '\0')
					return (NULL);
				src++;
				t++;
			}
		} else if (*src == '\\') {
			/* *t already holds the backslash; overwrite it. */
			if (src[1] == 'n') {
				*t = '\n';
				src++;
			} else if (src[1] == 'r') {
				*t = '\r';
				src++;
			} else if (src[1] == 't') {
				*t = '\t';
				src++;
			} else if (src[1] == 'x') {
				if (dohex(&src[2], t, &hexlen))
					src += hexlen + 1;
			}
		}
		src++;
		t++;
	}

	if (*src != ']')
		return (NULL);
	*sp = src + 1;
	return (t + 1);
}
530
531 /*
532 * Compiles the regular expression in RE and returns a pointer to the compiled
533 * regular expression.
534 * Cflags are passed to regcomp.
535 */
536 static regex_t *
compile_re(char * re,int case_insensitive)537 compile_re(char *re, int case_insensitive)
538 {
539 regex_t *rep;
540 int eval, flags;
541
542
543 flags = rflags;
544 if (case_insensitive)
545 flags |= REG_ICASE;
546 if ((rep = malloc(sizeof(regex_t))) == NULL)
547 err(1, "malloc");
548 if ((eval = regcomp(rep, re, flags)) != 0)
549 errx(1, "%lu: %s: RE error: %s",
550 linenum, fname, strregerror(eval, rep));
551 if (maxnsub < rep->re_nsub)
552 maxnsub = rep->re_nsub;
553 return (rep);
554 }
555
556 /*
557 * Compile the substitution string of a regular expression and set res to
558 * point to a saved copy of it. Nsub is the number of parenthesized regular
559 * expressions.
560 */
561 static char *
compile_subst(char * p,struct s_subst * s)562 compile_subst(char *p, struct s_subst *s)
563 {
564 static char lbuf[_POSIX2_LINE_MAX + 1];
565 int asize, hexlen, size;
566 u_char ref;
567 char c, *text, *op, *sp;
568 int more = 1, sawesc = 0;
569
570 c = *p++; /* Terminator character */
571 if (c == '\0')
572 return (NULL);
573
574 s->maxbref = 0;
575 s->linenum = linenum;
576 asize = 2 * _POSIX2_LINE_MAX + 1;
577 if ((text = malloc(asize)) == NULL)
578 err(1, "malloc");
579 size = 0;
580 do {
581 op = sp = text + size;
582 for (; *p; p++) {
583 if (*p == '\\' || sawesc) {
584 /*
585 * If this is a continuation from the last
586 * buffer, we won't have a character to
587 * skip over.
588 */
589 if (sawesc)
590 sawesc = 0;
591 else
592 p++;
593
594 if (*p == '\0') {
595 /*
596 * This escaped character is continued
597 * in the next part of the line. Note
598 * this fact, then cause the loop to
599 * exit w/ normal EOL case and reenter
600 * above with the new buffer.
601 */
602 sawesc = 1;
603 p--;
604 continue;
605 } else if (strchr("123456789", *p) != NULL) {
606 *sp++ = '\\';
607 ref = *p - '0';
608 if (s->re != NULL &&
609 ref > s->re->re_nsub)
610 errx(1, "%lu: %s: \\%c not defined in the RE",
611 linenum, fname, *p);
612 if (s->maxbref < ref)
613 s->maxbref = ref;
614 } else {
615 switch (*p) {
616 case '&':
617 case '\\':
618 *sp++ = '\\';
619 break;
620 case 'n':
621 *p = '\n';
622 break;
623 case 'r':
624 *p = '\r';
625 break;
626 case 't':
627 *p = '\t';
628 break;
629 case 'x':
630 #define ADVANCE_N(s, n) \
631 do { \
632 char *adv = (s); \
633 while (*(adv + (n) - 1) != '\0') { \
634 *adv = *(adv + (n)); \
635 ++adv; \
636 } \
637 *adv = '\0'; \
638 } while (0);
639 if (dohex(&p[1], p, &hexlen)) {
640 ADVANCE_N(p + 1,
641 hexlen);
642 }
643 break;
644 }
645 }
646 } else if (*p == c) {
647 if (*++p == '\0' && more) {
648 if (cu_fgets(lbuf, sizeof(lbuf), &more))
649 p = lbuf;
650 }
651 *sp++ = '\0';
652 size += sp - op;
653 if ((s->new = realloc(text, size)) == NULL)
654 err(1, "realloc");
655 return (p);
656 } else if (*p == '\n') {
657 errx(1,
658 "%lu: %s: unescaped newline inside substitute pattern", linenum, fname);
659 /* NOTREACHED */
660 }
661 *sp++ = *p;
662 }
663 size += sp - op;
664 if (asize - size < _POSIX2_LINE_MAX + 1) {
665 asize *= 2;
666 if ((text = realloc(text, asize)) == NULL)
667 err(1, "realloc");
668 }
669 } while (cu_fgets(p = lbuf, sizeof(lbuf), &more) != NULL);
670 errx(1, "%lu: %s: unterminated substitute in regular expression",
671 linenum, fname);
672 /* NOTREACHED */
673 }
674
675 /*
676 * Compile the flags of the s command
677 */
678 static char *
compile_flags(char * p,struct s_subst * s)679 compile_flags(char *p, struct s_subst *s)
680 {
681 int gn; /* True if we have seen g or n */
682 unsigned long nval;
683 char wfile[_POSIX2_LINE_MAX + 1], *q, *eq;
684
685 s->n = 1; /* Default */
686 s->p = 0;
687 s->wfile = NULL;
688 s->wfd = -1;
689 s->icase = 0;
690 for (gn = 0;;) {
691 EATSPACE(); /* EXTENSION */
692 switch (*p) {
693 case 'g':
694 if (gn)
695 errx(1,
696 "%lu: %s: more than one number or 'g' in substitute flags", linenum, fname);
697 gn = 1;
698 s->n = 0;
699 break;
700 case '\0':
701 case '\n':
702 case ';':
703 return (p);
704 case 'p':
705 s->p = 1;
706 break;
707 case 'i':
708 case 'I':
709 s->icase = 1;
710 break;
711 case '1': case '2': case '3':
712 case '4': case '5': case '6':
713 case '7': case '8': case '9':
714 if (gn)
715 errx(1,
716 "%lu: %s: more than one number or 'g' in substitute flags", linenum, fname);
717 gn = 1;
718 errno = 0;
719 nval = strtol(p, &p, 10);
720 if (errno == ERANGE || nval > INT_MAX)
721 errx(1,
722 "%lu: %s: overflow in the 'N' substitute flag", linenum, fname);
723 s->n = nval;
724 p--;
725 break;
726 case 'w':
727 p++;
728 #ifdef HISTORIC_PRACTICE
729 if (*p != ' ') {
730 warnx("%lu: %s: space missing before w wfile", linenum, fname);
731 return (p);
732 }
733 #endif
734 EATSPACE();
735 q = wfile;
736 eq = wfile + sizeof(wfile) - 1;
737 while (*p) {
738 if (*p == '\n')
739 break;
740 if (q >= eq)
741 err(1, "wfile too long");
742 *q++ = *p++;
743 }
744 *q = '\0';
745 if (q == wfile)
746 errx(1, "%lu: %s: no wfile specified", linenum, fname);
747 s->wfile = strdup(wfile);
748 if (!aflag && (s->wfd = open(wfile,
749 O_WRONLY|O_APPEND|O_CREAT|O_TRUNC,
750 DEFFILEMODE)) == -1)
751 err(1, "%s", wfile);
752 return (p);
753 default:
754 errx(1, "%lu: %s: bad flag in substitute command: '%c'",
755 linenum, fname, *p);
756 break;
757 }
758 p++;
759 }
760 }
761
762 /*
763 * Compile a translation set of strings into a lookup table.
764 */
765 static char *
compile_tr(char * p,struct s_tr ** py)766 compile_tr(char *p, struct s_tr **py)
767 {
768 struct s_tr *y;
769 int i;
770 const char *op, *np;
771 char old[_POSIX2_LINE_MAX + 1];
772 char new[_POSIX2_LINE_MAX + 1];
773 size_t oclen, oldlen, nclen, newlen;
774 mbstate_t mbs1, mbs2;
775
776 if ((*py = y = malloc(sizeof(*y))) == NULL)
777 err(1, NULL);
778 y->multis = NULL;
779 y->nmultis = 0;
780
781 if (*p == '\0' || *p == '\\')
782 errx(1,
783 "%lu: %s: transform pattern can not be delimited by newline or backslash",
784 linenum, fname);
785 p = compile_delimited(p, old, 1);
786 if (p == NULL)
787 errx(1, "%lu: %s: unterminated transform source string",
788 linenum, fname);
789 p = compile_delimited(p - 1, new, 1);
790 if (p == NULL)
791 errx(1, "%lu: %s: unterminated transform target string",
792 linenum, fname);
793 EATSPACE();
794 op = old;
795 oldlen = mbsrtowcs(NULL, &op, 0, NULL);
796 if (oldlen == (size_t)-1)
797 err(1, NULL);
798 np = new;
799 newlen = mbsrtowcs(NULL, &np, 0, NULL);
800 if (newlen == (size_t)-1)
801 err(1, NULL);
802 if (newlen != oldlen)
803 errx(1, "%lu: %s: transform strings are not the same length",
804 linenum, fname);
805 if (MB_CUR_MAX == 1) {
806 /*
807 * The single-byte encoding case is easy: generate a
808 * lookup table.
809 */
810 for (i = 0; i <= UCHAR_MAX; i++)
811 y->bytetab[i] = (char)i;
812 for (; *op; op++, np++)
813 y->bytetab[(u_char)*op] = *np;
814 } else {
815 /*
816 * Multi-byte encoding case: generate a lookup table as
817 * above, but only for single-byte characters. The first
818 * bytes of multi-byte characters have their lookup table
819 * entries set to 0, which causes do_tr() to search through
820 * an auxiliary vector of multi-byte mappings.
821 */
822 memset(&mbs1, 0, sizeof(mbs1));
823 memset(&mbs2, 0, sizeof(mbs2));
824 for (i = 0; i <= UCHAR_MAX; i++)
825 y->bytetab[i] = (btowc(i) != WEOF) ? i : 0;
826 while (*op != '\0') {
827 oclen = mbrlen(op, MB_LEN_MAX, &mbs1);
828 if (oclen == (size_t)-1 || oclen == (size_t)-2)
829 errc(1, EILSEQ, NULL);
830 nclen = mbrlen(np, MB_LEN_MAX, &mbs2);
831 if (nclen == (size_t)-1 || nclen == (size_t)-2)
832 errc(1, EILSEQ, NULL);
833 if (oclen == 1 && nclen == 1)
834 y->bytetab[(u_char)*op] = *np;
835 else {
836 y->bytetab[(u_char)*op] = 0;
837 y->multis = realloc(y->multis,
838 (y->nmultis + 1) * sizeof(*y->multis));
839 if (y->multis == NULL)
840 err(1, NULL);
841 i = y->nmultis++;
842 y->multis[i].fromlen = oclen;
843 memcpy(y->multis[i].from, op, oclen);
844 y->multis[i].tolen = nclen;
845 memcpy(y->multis[i].to, np, nclen);
846 }
847 op += oclen;
848 np += nclen;
849 }
850 }
851 return (p);
852 }
853
854 /*
855 * Compile the text following an a, c, or i command.
856 */
857 static char *
compile_text(void)858 compile_text(void)
859 {
860 int asize, esc_nl, size;
861 char *text, *p, *op, *s;
862 char lbuf[_POSIX2_LINE_MAX + 1];
863
864 asize = 2 * _POSIX2_LINE_MAX + 1;
865 if ((text = malloc(asize)) == NULL)
866 err(1, "malloc");
867 size = 0;
868 while (cu_fgets(lbuf, sizeof(lbuf), NULL) != NULL) {
869 op = s = text + size;
870 p = lbuf;
871 #ifdef LEGACY_BSDSED_COMPAT
872 EATSPACE();
873 #endif
874 for (esc_nl = 0; *p != '\0'; p++) {
875 if (*p == '\\' && p[1] != '\0' && *++p == '\n')
876 esc_nl = 1;
877 *s++ = *p;
878 }
879 size += s - op;
880 if (!esc_nl) {
881 *s = '\0';
882 break;
883 }
884 if (asize - size < _POSIX2_LINE_MAX + 1) {
885 asize *= 2;
886 if ((text = realloc(text, asize)) == NULL)
887 err(1, "realloc");
888 }
889 }
890 text[size] = '\0';
891 if ((p = realloc(text, size + 1)) == NULL)
892 err(1, "realloc");
893 return (p);
894 }
895
896 /*
897 * Get an address and return a pointer to the first character after
898 * it. Fill the structure pointed to according to the address.
899 */
900 static char *
compile_addr(char * p,struct s_addr * a)901 compile_addr(char *p, struct s_addr *a)
902 {
903 char *end, re[_POSIX2_LINE_MAX + 1];
904 int icase;
905
906 icase = 0;
907
908 a->type = 0;
909 switch (*p) {
910 case '\\': /* Context address */
911 ++p;
912 /* FALLTHROUGH */
913 case '/': /* Context address */
914 p = compile_delimited(p, re, 0);
915 if (p == NULL)
916 errx(1, "%lu: %s: unterminated regular expression", linenum, fname);
917 /* Check for case insensitive regexp flag */
918 if (*p == 'I') {
919 icase = 1;
920 p++;
921 }
922 if (*re == '\0')
923 a->u.r = NULL;
924 else
925 a->u.r = compile_re(re, icase);
926 a->type = AT_RE;
927 return (p);
928
929 case '$': /* Last line */
930 a->type = AT_LAST;
931 return (p + 1);
932
933 case '+': /* Relative line number */
934 a->type = AT_RELLINE;
935 p++;
936 /* FALLTHROUGH */
937 /* Line number */
938 case '0': case '1': case '2': case '3': case '4':
939 case '5': case '6': case '7': case '8': case '9':
940 if (a->type == 0)
941 a->type = AT_LINE;
942 a->u.l = strtol(p, &end, 10);
943 return (end);
944 default:
945 errx(1, "%lu: %s: expected context address", linenum, fname);
946 return (NULL);
947 }
948 }
949
950 /*
951 * duptoeol --
952 * Return a copy of all the characters up to \n or \0.
953 */
954 static char *
duptoeol(char * s,const char * ctype)955 duptoeol(char *s, const char *ctype)
956 {
957 size_t len;
958 int ws;
959 char *p, *start;
960
961 ws = 0;
962 for (start = s; *s != '\0' && *s != '\n'; ++s)
963 ws = isspace((unsigned char)*s);
964 *s = '\0';
965 if (ws)
966 warnx("%lu: %s: whitespace after %s", linenum, fname, ctype);
967 len = s - start + 1;
968 if ((p = malloc(len)) == NULL)
969 err(1, "malloc");
970 return (memmove(p, start, len));
971 }
972
973 /*
974 * Convert goto label names to addresses, and count a and r commands, in
975 * the given subset of the script. Free the memory used by labels in b
976 * and t commands (but not by :).
977 *
978 * TODO: Remove } nodes
979 */
980 static void
fixuplabel(struct s_command * cp,struct s_command * end)981 fixuplabel(struct s_command *cp, struct s_command *end)
982 {
983
984 for (; cp != end; cp = cp->next)
985 switch (cp->code) {
986 case 'a':
987 case 'r':
988 appendnum++;
989 break;
990 case 'b':
991 case 't':
992 /* Resolve branch target. */
993 if (cp->t == NULL) {
994 cp->u.c = NULL;
995 break;
996 }
997 if ((cp->u.c = findlabel(cp->t)) == NULL)
998 errx(1, "%lu: %s: undefined label '%s'", linenum, fname, cp->t);
999 free(cp->t);
1000 break;
1001 case '{':
1002 /* Do interior commands. */
1003 fixuplabel(cp->u.c, cp->next);
1004 break;
1005 }
1006 }
1007
1008 /*
1009 * Associate the given command label for later lookup.
1010 */
1011 static void
enterlabel(struct s_command * cp)1012 enterlabel(struct s_command *cp)
1013 {
1014 struct labhash **lhp, *lh;
1015 u_char *p;
1016 u_int h, c;
1017
1018 for (h = 0, p = (u_char *)cp->t; (c = *p) != 0; p++)
1019 h = (h << 5) + h + c;
1020 lhp = &labels[h & LHMASK];
1021 for (lh = *lhp; lh != NULL; lh = lh->lh_next)
1022 if (lh->lh_hash == h && strcmp(cp->t, lh->lh_cmd->t) == 0)
1023 errx(1, "%lu: %s: duplicate label '%s'", linenum, fname, cp->t);
1024 if ((lh = malloc(sizeof *lh)) == NULL)
1025 err(1, "malloc");
1026 lh->lh_next = *lhp;
1027 lh->lh_hash = h;
1028 lh->lh_cmd = cp;
1029 lh->lh_ref = 0;
1030 *lhp = lh;
1031 }
1032
1033 /*
1034 * Find the label contained in the command l in the command linked
1035 * list cp. L is excluded from the search. Return NULL if not found.
1036 */
1037 static struct s_command *
findlabel(char * name)1038 findlabel(char *name)
1039 {
1040 struct labhash *lh;
1041 u_char *p;
1042 u_int h, c;
1043
1044 for (h = 0, p = (u_char *)name; (c = *p) != 0; p++)
1045 h = (h << 5) + h + c;
1046 for (lh = labels[h & LHMASK]; lh != NULL; lh = lh->lh_next) {
1047 if (lh->lh_hash == h && strcmp(name, lh->lh_cmd->t) == 0) {
1048 lh->lh_ref = 1;
1049 return (lh->lh_cmd);
1050 }
1051 }
1052 return (NULL);
1053 }
1054
1055 /*
1056 * Warn about any unused labels. As a side effect, release the label hash
1057 * table space.
1058 */
1059 static void
uselabel(void)1060 uselabel(void)
1061 {
1062 struct labhash *lh, *next;
1063 int i;
1064
1065 for (i = 0; i < LHSZ; i++) {
1066 for (lh = labels[i]; lh != NULL; lh = next) {
1067 next = lh->lh_next;
1068 if (!lh->lh_ref)
1069 warnx("%lu: %s: unused label '%s'",
1070 linenum, fname, lh->lh_cmd->t);
1071 free(lh);
1072 }
1073 }
1074 }
1075