1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <[email protected]>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 					  size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 				     const re_dfastate_t *init_state,
24 				     char *fastmap);
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 #ifdef RE_ENABLE_I18N
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 #ifdef RE_ENABLE_I18N
32 static void optimize_utf8 (re_dfa_t *dfa);
33 #endif
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 			       reg_errcode_t (fn (void *, bin_tree_t *)),
37 			       void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 				reg_errcode_t (fn (void *, bin_tree_t *)),
40 				void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 				 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
50 				   unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 					 int node, int root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static int fetch_number (re_string_t *input, re_token_t *token,
56 			 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 			reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 			  reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 				  re_token_t *token, reg_syntax_t syntax,
63 				  int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 				 re_token_t *token, reg_syntax_t syntax,
66 				 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 				     re_token_t *token, reg_syntax_t syntax,
69 				     int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 				  re_token_t *token, reg_syntax_t syntax,
72 				  int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 				 re_dfa_t *dfa, re_token_t *token,
75 				 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 				      re_token_t *token, reg_syntax_t syntax,
78 				      reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 					    re_string_t *regexp,
81 					    re_token_t *token, int token_len,
82 					    re_dfa_t *dfa,
83 					    reg_syntax_t syntax,
84 					    int accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 					  re_string_t *regexp,
87 					  re_token_t *token);
88 #ifdef RE_ENABLE_I18N
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 					re_charset_t *mbcset,
91 					int *equiv_class_alloc,
92 					const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94 				      bitset_t sbcset,
95 				      re_charset_t *mbcset,
96 				      int *char_class_alloc,
97 				      const unsigned char *class_name,
98 				      reg_syntax_t syntax);
99 #else  /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 					const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 				      bitset_t sbcset,
104 				      const unsigned char *class_name,
105 				      reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 				       RE_TRANSLATE_TYPE trans,
109 				       const unsigned char *class_name,
110 				       const unsigned char *extra,
111 				       int non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 				bin_tree_t *left, bin_tree_t *right,
114 				re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 				      bin_tree_t *left, bin_tree_t *right,
117 				      const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122 
123 /* This table gives an error message for each of the error codes listed
124    in regex.h.  Obviously the order here has to be same as there.
125    POSIX doesn't require that we do anything for REG_NOERROR,
126    but why not be nice?  */
127 
128 const char __re_error_msgid[] attribute_hidden =
129   {
130 #define REG_NOERROR_IDX	0
131     gettext_noop ("Success")	/* REG_NOERROR */
132     "\0"
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134     gettext_noop ("No match")	/* REG_NOMATCH */
135     "\0"
136 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138     "\0"
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141     "\0"
142 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144     "\0"
145 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147     "\0"
148 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150     "\0"
151 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
152     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
153     "\0"
154 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156     "\0"
157 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159     "\0"
160 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162     "\0"
163 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164     gettext_noop ("Invalid range end")	/* REG_ERANGE */
165     "\0"
166 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
168     "\0"
169 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171     "\0"
172 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
174     "\0"
175 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
177     "\0"
178 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180   };
181 
182 const size_t __re_error_msgid_idx[] attribute_hidden =
183   {
184     REG_NOERROR_IDX,
185     REG_NOMATCH_IDX,
186     REG_BADPAT_IDX,
187     REG_ECOLLATE_IDX,
188     REG_ECTYPE_IDX,
189     REG_EESCAPE_IDX,
190     REG_ESUBREG_IDX,
191     REG_EBRACK_IDX,
192     REG_EPAREN_IDX,
193     REG_EBRACE_IDX,
194     REG_BADBR_IDX,
195     REG_ERANGE_IDX,
196     REG_ESPACE_IDX,
197     REG_BADRPT_IDX,
198     REG_EEND_IDX,
199     REG_ESIZE_IDX,
200     REG_ERPAREN_IDX
201   };
202 
203 /* Entry points for GNU code.  */
204 
205 /* re_compile_pattern is the GNU regular expression compiler: it
206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207    Returns 0 if the pattern was valid, otherwise an error string.
208 
209    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210    are set in BUFP on entry.  */
211 
212 const char *
re_compile_pattern(pattern,length,bufp)213 re_compile_pattern (pattern, length, bufp)
214     const char *pattern;
215     size_t length;
216     struct re_pattern_buffer *bufp;
217 {
218   reg_errcode_t ret;
219 
220   /* And GNU code determines whether or not to get register information
221      by passing null for the REGS argument to re_match, etc., not by
222      setting no_sub, unless RE_NO_SUB is set.  */
223   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
224 
225   /* Match anchors at newline.  */
226   bufp->newline_anchor = 1;
227 
228   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
229 
230   if (!ret)
231     return NULL;
232   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
233 }
234 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)235 weak_alias (__re_compile_pattern, re_compile_pattern)
236 #endif
237 
238 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
239    also be assigned to arbitrarily: each pattern buffer stores its own
240    syntax, so it can be changed between regex compilations.  */
241 /* This has no initializer because initialized variables in Emacs
242    become read-only after dumping.  */
243 reg_syntax_t re_syntax_options;
244 
245 
246 /* Specify the precise syntax of regexps for compilation.  This provides
247    for compatibility for various utilities which historically have
248    different, incompatible syntaxes.
249 
250    The argument SYNTAX is a bit mask comprised of the various bits
251    defined in regex.h.  We return the old syntax.  */
252 
253 reg_syntax_t
254 re_set_syntax (syntax)
255     reg_syntax_t syntax;
256 {
257   reg_syntax_t ret = re_syntax_options;
258 
259   re_syntax_options = syntax;
260   return ret;
261 }
262 #ifdef _LIBC
263 weak_alias (__re_set_syntax, re_set_syntax)
264 #endif
265 
266 int
267 re_compile_fastmap (bufp)
268     struct re_pattern_buffer *bufp;
269 {
270   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
271   char *fastmap = bufp->fastmap;
272 
273   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275   if (dfa->init_state != dfa->init_state_word)
276     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277   if (dfa->init_state != dfa->init_state_nl)
278     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279   if (dfa->init_state != dfa->init_state_begbuf)
280     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281   bufp->fastmap_accurate = 1;
282   return 0;
283 }
284 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
286 #endif
287 
288 static inline void
289 __attribute ((always_inline))
290 re_set_fastmap (char *fastmap, int icase, int ch)
291 {
292   fastmap[ch] = 1;
293   if (icase)
294     fastmap[tolower (ch)] = 1;
295 }
296 
297 /* Helper function for re_compile_fastmap.
298    Compile fastmap for the initial_state INIT_STATE.  */
299 
300 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 			 char *fastmap)
303 {
304   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
305   int node_cnt;
306   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
308     {
309       int node = init_state->nodes.elems[node_cnt];
310       re_token_type_t type = dfa->nodes[node].type;
311 
312       if (type == CHARACTER)
313 	{
314 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315 #ifdef RE_ENABLE_I18N
316 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
317 	    {
318 	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
319 	      wchar_t wc;
320 	      mbstate_t state;
321 
322 	      p = buf;
323 	      *p++ = dfa->nodes[node].opr.c;
324 	      while (++node < dfa->nodes_len
325 		     &&	dfa->nodes[node].type == CHARACTER
326 		     && dfa->nodes[node].mb_partial)
327 		*p++ = dfa->nodes[node].opr.c;
328 	      memset (&state, '\0', sizeof (state));
329 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
330 			     &state) == p - buf
331 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
332 		      != (size_t) -1))
333 		re_set_fastmap (fastmap, 0, buf[0]);
334 	    }
335 #endif
336 	}
337       else if (type == SIMPLE_BRACKET)
338 	{
339 	  int i, ch;
340 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
341 	    {
342 	      int j;
343 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
344 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
345 		if (w & ((bitset_word_t) 1 << j))
346 		  re_set_fastmap (fastmap, icase, ch);
347 	    }
348 	}
349 #ifdef RE_ENABLE_I18N
350       else if (type == COMPLEX_BRACKET)
351 	{
352 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
353 	  int i;
354 
355 # ifdef _LIBC
356 	  /* See if we have to try all bytes which start multiple collation
357 	     elements.
358 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
359 		  collation element, and don't catch 'b' since 'b' is
360 		  the only collation element which starts from 'b' (and
361 		  it is caught by SIMPLE_BRACKET).  */
362 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
363 		  && (cset->ncoll_syms || cset->nranges))
364 		{
365 		  const int32_t *table = (const int32_t *)
366 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
367 		  for (i = 0; i < SBC_MAX; ++i)
368 		    if (table[i] < 0)
369 		      re_set_fastmap (fastmap, icase, i);
370 		}
371 # endif /* _LIBC */
372 
373 	  /* See if we have to start the match at all multibyte characters,
374 	     i.e. where we would not find an invalid sequence.  This only
375 	     applies to multibyte character sets; for single byte character
376 	     sets, the SIMPLE_BRACKET again suffices.  */
377 	  if (dfa->mb_cur_max > 1
378 	      && (cset->nchar_classes || cset->non_match || cset->nranges
379 # ifdef _LIBC
380 		  || cset->nequiv_classes
381 # endif /* _LIBC */
382 		 ))
383 	    {
384 	      unsigned char c = 0;
385 	      do
386 		{
387 		  mbstate_t mbs;
388 		  memset (&mbs, 0, sizeof (mbs));
389 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
390 		    re_set_fastmap (fastmap, false, (int) c);
391 		}
392 	      while (++c != 0);
393 	    }
394 
395 	  else
396 	    {
397 	      /* ... Else catch all bytes which can start the mbchars.  */
398 	      for (i = 0; i < cset->nmbchars; ++i)
399 		{
400 		  char buf[256];
401 		  mbstate_t state;
402 		  memset (&state, '\0', sizeof (state));
403 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
404 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
405 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
406 		    {
407 		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
408 			  != (size_t) -1)
409 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
410 		    }
411 		}
412 	    }
413 	}
414 #endif /* RE_ENABLE_I18N */
415       else if (type == OP_PERIOD
416 #ifdef RE_ENABLE_I18N
417 	       || type == OP_UTF8_PERIOD
418 #endif /* RE_ENABLE_I18N */
419 	       || type == END_OF_RE)
420 	{
421 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
422 	  if (type == END_OF_RE)
423 	    bufp->can_be_null = 1;
424 	  return;
425 	}
426     }
427 }
428 
429 /* Entry point for POSIX code.  */
430 /* regcomp takes a regular expression as a string and compiles it.
431 
432    PREG is a regex_t *.  We do not expect any fields to be initialized,
433    since POSIX says we shouldn't.  Thus, we set
434 
435      `buffer' to the compiled pattern;
436      `used' to the length of the compiled pattern;
437      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
438        REG_EXTENDED bit in CFLAGS is set; otherwise, to
439        RE_SYNTAX_POSIX_BASIC;
440      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
441      `fastmap' to an allocated space for the fastmap;
442      `fastmap_accurate' to zero;
443      `re_nsub' to the number of subexpressions in PATTERN.
444 
445    PATTERN is the address of the pattern string.
446 
447    CFLAGS is a series of bits which affect compilation.
448 
449      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
450      use POSIX basic syntax.
451 
452      If REG_NEWLINE is set, then . and [^...] don't match newline.
453      Also, regexec will try a match beginning after every newline.
454 
455      If REG_ICASE is set, then we considers upper- and lowercase
456      versions of letters to be equivalent when matching.
457 
458      If REG_NOSUB is set, then when PREG is passed to regexec, that
459      routine will report only success or failure, and nothing about the
460      registers.
461 
462    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
463    the return codes and their meanings.)  */
464 
465 int
regcomp(preg,pattern,cflags)466 regcomp (preg, pattern, cflags)
467     regex_t *__restrict preg;
468     const char *__restrict pattern;
469     int cflags;
470 {
471   reg_errcode_t ret;
472   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
473 			 : RE_SYNTAX_POSIX_BASIC);
474 
475   preg->buffer = NULL;
476   preg->allocated = 0;
477   preg->used = 0;
478 
479   /* Try to allocate space for the fastmap.  */
480   preg->fastmap = re_malloc (char, SBC_MAX);
481   if (BE (preg->fastmap == NULL, 0))
482     return REG_ESPACE;
483 
484   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
485 
486   /* If REG_NEWLINE is set, newlines are treated differently.  */
487   if (cflags & REG_NEWLINE)
488     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
489       syntax &= ~RE_DOT_NEWLINE;
490       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
491       /* It also changes the matching behavior.  */
492       preg->newline_anchor = 1;
493     }
494   else
495     preg->newline_anchor = 0;
496   preg->no_sub = !!(cflags & REG_NOSUB);
497   preg->translate = NULL;
498 
499   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
500 
501   /* POSIX doesn't distinguish between an unmatched open-group and an
502      unmatched close-group: both are REG_EPAREN.  */
503   if (ret == REG_ERPAREN)
504     ret = REG_EPAREN;
505 
506   /* We have already checked preg->fastmap != NULL.  */
507   if (BE (ret == REG_NOERROR, 1))
508     /* Compute the fastmap now, since regexec cannot modify the pattern
509        buffer.  This function never fails in this implementation.  */
510     (void) re_compile_fastmap (preg);
511   else
512     {
513       /* Some error occurred while compiling the expression.  */
514       re_free (preg->fastmap);
515       preg->fastmap = NULL;
516     }
517 
518   return (int) ret;
519 }
520 #ifdef _LIBC
521 weak_alias (__regcomp, regcomp)
522 #endif
523 
524 /* Returns a message corresponding to an error code, ERRCODE, returned
525    from either regcomp or regexec.   We don't use PREG here.  */
526 
527 size_t
528 regerror (errcode, preg, errbuf, errbuf_size)
529     int errcode;
530     const regex_t *__restrict preg;
531     char *__restrict errbuf;
532     size_t errbuf_size;
533 {
534   const char *msg;
535   size_t msg_size;
536 
537   if (BE (errcode < 0
538 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
539 			       / sizeof (__re_error_msgid_idx[0])), 0))
540     /* Only error codes returned by the rest of the code should be passed
541        to this routine.  If we are given anything else, or if other regex
542        code generates an invalid error code, then the program has a bug.
543        Dump core so we can fix it.  */
544     abort ();
545 
546   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
547 
548   msg_size = strlen (msg) + 1; /* Includes the null.  */
549 
550   if (BE (errbuf_size != 0, 1))
551     {
552       if (BE (msg_size > errbuf_size, 0))
553 	{
554 #if defined HAVE_MEMPCPY || defined _LIBC
555 	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
556 #else
557 	  memcpy (errbuf, msg, errbuf_size - 1);
558 	  errbuf[errbuf_size - 1] = 0;
559 #endif
560 	}
561       else
562 	memcpy (errbuf, msg, msg_size);
563     }
564 
565   return msg_size;
566 }
567 #ifdef _LIBC
568 weak_alias (__regerror, regerror)
569 #endif
570 
571 
572 #ifdef RE_ENABLE_I18N
573 /* This static array is used for the map to single-byte characters when
574    UTF-8 is used.  Otherwise we would allocate memory just to initialize
575    it the same all the time.  UTF-8 is the preferred encoding so this is
576    a worthwhile optimization.  */
577 static const bitset_t utf8_sb_map =
578 {
579   /* Set the first 128 bits.  */
580   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
581 };
582 #endif
583 
584 
585 static void
free_dfa_content(re_dfa_t * dfa)586 free_dfa_content (re_dfa_t *dfa)
587 {
588   int i, j;
589 
590   if (dfa->nodes)
591     for (i = 0; i < dfa->nodes_len; ++i)
592       free_token (dfa->nodes + i);
593   re_free (dfa->nexts);
594   for (i = 0; i < dfa->nodes_len; ++i)
595     {
596       if (dfa->eclosures != NULL)
597 	re_node_set_free (dfa->eclosures + i);
598       if (dfa->inveclosures != NULL)
599 	re_node_set_free (dfa->inveclosures + i);
600       if (dfa->edests != NULL)
601 	re_node_set_free (dfa->edests + i);
602     }
603   re_free (dfa->edests);
604   re_free (dfa->eclosures);
605   re_free (dfa->inveclosures);
606   re_free (dfa->nodes);
607 
608   if (dfa->state_table)
609     for (i = 0; i <= dfa->state_hash_mask; ++i)
610       {
611 	struct re_state_table_entry *entry = dfa->state_table + i;
612 	for (j = 0; j < entry->num; ++j)
613 	  {
614 	    re_dfastate_t *state = entry->array[j];
615 	    free_state (state);
616 	  }
617 	re_free (entry->array);
618       }
619   re_free (dfa->state_table);
620 #ifdef RE_ENABLE_I18N
621   if (dfa->sb_char != utf8_sb_map)
622     re_free (dfa->sb_char);
623 #endif
624   re_free (dfa->subexp_map);
625 #ifdef DEBUG
626   re_free (dfa->re_str);
627 #endif
628 
629   re_free (dfa);
630 }
631 
632 
633 /* Free dynamically allocated space used by PREG.  */
634 
635 void
regfree(preg)636 regfree (preg)
637     regex_t *preg;
638 {
639   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
640   if (BE (dfa != NULL, 1))
641     free_dfa_content (dfa);
642   preg->buffer = NULL;
643   preg->allocated = 0;
644 
645   re_free (preg->fastmap);
646   preg->fastmap = NULL;
647 
648   re_free (preg->translate);
649   preg->translate = NULL;
650 }
651 #ifdef _LIBC
652 weak_alias (__regfree, regfree)
653 #endif
654 
655 /* Entry points compatible with 4.2 BSD regex library.  We don't define
656    them unless specifically requested.  */
657 
658 #if defined _REGEX_RE_COMP || defined _LIBC
659 
660 /* BSD has one and only one pattern buffer.  */
661 static struct re_pattern_buffer re_comp_buf;
662 
663 char *
664 # ifdef _LIBC
665 /* Make these definitions weak in libc, so POSIX programs can redefine
666    these names if they don't use our functions, and still use
667    regcomp/regexec above without link errors.  */
668 weak_function
669 # endif
re_comp(s)670 re_comp (s)
671      const char *s;
672 {
673   reg_errcode_t ret;
674   char *fastmap;
675 
676   if (!s)
677     {
678       if (!re_comp_buf.buffer)
679 	return gettext ("No previous regular expression");
680       return 0;
681     }
682 
683   if (re_comp_buf.buffer)
684     {
685       fastmap = re_comp_buf.fastmap;
686       re_comp_buf.fastmap = NULL;
687       __regfree (&re_comp_buf);
688       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
689       re_comp_buf.fastmap = fastmap;
690     }
691 
692   if (re_comp_buf.fastmap == NULL)
693     {
694       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
695       if (re_comp_buf.fastmap == NULL)
696 	return (char *) gettext (__re_error_msgid
697 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
698     }
699 
700   /* Since `re_exec' always passes NULL for the `regs' argument, we
701      don't need to initialize the pattern buffer fields which affect it.  */
702 
703   /* Match anchors at newlines.  */
704   re_comp_buf.newline_anchor = 1;
705 
706   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
707 
708   if (!ret)
709     return NULL;
710 
711   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
712   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
713 }
714 
715 #ifdef _LIBC
libc_freeres_fn(free_mem)716 libc_freeres_fn (free_mem)
717 {
718   __regfree (&re_comp_buf);
719 }
720 #endif
721 
722 #endif /* _REGEX_RE_COMP */
723 
724 /* Internal entry point.
725    Compile the regular expression PATTERN, whose length is LENGTH.
726    SYNTAX indicate regular expression's syntax.  */
727 
728 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)729 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
730 		     reg_syntax_t syntax)
731 {
732   reg_errcode_t err = REG_NOERROR;
733   re_dfa_t *dfa;
734   re_string_t regexp;
735 
736   /* Initialize the pattern buffer.  */
737   preg->fastmap_accurate = 0;
738   preg->syntax = syntax;
739   preg->not_bol = preg->not_eol = 0;
740   preg->used = 0;
741   preg->re_nsub = 0;
742   preg->can_be_null = 0;
743   preg->regs_allocated = REGS_UNALLOCATED;
744 
745   /* Initialize the dfa.  */
746   dfa = (re_dfa_t *) preg->buffer;
747   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
748     {
749       /* If zero allocated, but buffer is non-null, try to realloc
750 	 enough space.  This loses if buffer's address is bogus, but
751 	 that is the user's responsibility.  If ->buffer is NULL this
752 	 is a simple allocation.  */
753       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
754       if (dfa == NULL)
755 	return REG_ESPACE;
756       preg->allocated = sizeof (re_dfa_t);
757       preg->buffer = (unsigned char *) dfa;
758     }
759   preg->used = sizeof (re_dfa_t);
760 
761   err = init_dfa (dfa, length);
762   if (BE (err != REG_NOERROR, 0))
763     {
764       free_dfa_content (dfa);
765       preg->buffer = NULL;
766       preg->allocated = 0;
767       return err;
768     }
769 #ifdef DEBUG
770   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
771   dfa->re_str = re_malloc (char, length + 1);
772   strncpy (dfa->re_str, pattern, length + 1);
773 #endif
774 
775   __libc_lock_init (dfa->lock);
776 
777   err = re_string_construct (&regexp, pattern, length, preg->translate,
778 			     syntax & RE_ICASE, dfa);
779   if (BE (err != REG_NOERROR, 0))
780     {
781     re_compile_internal_free_return:
782       free_workarea_compile (preg);
783       re_string_destruct (&regexp);
784       free_dfa_content (dfa);
785       preg->buffer = NULL;
786       preg->allocated = 0;
787       return err;
788     }
789 
790   /* Parse the regular expression, and build a structure tree.  */
791   preg->re_nsub = 0;
792   dfa->str_tree = parse (&regexp, preg, syntax, &err);
793   if (BE (dfa->str_tree == NULL, 0))
794     goto re_compile_internal_free_return;
795 
796   /* Analyze the tree and create the nfa.  */
797   err = analyze (preg);
798   if (BE (err != REG_NOERROR, 0))
799     goto re_compile_internal_free_return;
800 
801 #ifdef RE_ENABLE_I18N
802   /* If possible, do searching in single byte encoding to speed things up.  */
803   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
804     optimize_utf8 (dfa);
805 #endif
806 
807   /* Then create the initial state of the dfa.  */
808   err = create_initial_state (dfa);
809 
810   /* Release work areas.  */
811   free_workarea_compile (preg);
812   re_string_destruct (&regexp);
813 
814   if (BE (err != REG_NOERROR, 0))
815     {
816       free_dfa_content (dfa);
817       preg->buffer = NULL;
818       preg->allocated = 0;
819     }
820 
821   return err;
822 }
823 
824 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
825    as the initial length of some arrays.  */
826 
827 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)828 init_dfa (re_dfa_t *dfa, size_t pat_len)
829 {
830   unsigned int table_size;
831 #ifndef _LIBC
832   char *codeset_name;
833 #endif
834 
835   memset (dfa, '\0', sizeof (re_dfa_t));
836 
837   /* Force allocation of str_tree_storage the first time.  */
838   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
839 
840   /* Avoid overflows.  */
841   if (pat_len == SIZE_MAX)
842     return REG_ESPACE;
843 
844   dfa->nodes_alloc = pat_len + 1;
845   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
846 
847   /*  table_size = 2 ^ ceil(log pat_len) */
848   for (table_size = 1; ; table_size <<= 1)
849     if (table_size > pat_len)
850       break;
851 
852   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
853   dfa->state_hash_mask = table_size - 1;
854 
855   dfa->mb_cur_max = MB_CUR_MAX;
856 #ifdef _LIBC
857   if (dfa->mb_cur_max == 6
858       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
859     dfa->is_utf8 = 1;
860   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
861 		       != 0);
862 #else
863 # ifdef HAVE_LANGINFO_CODESET
864   codeset_name = nl_langinfo (CODESET);
865 # else
866   codeset_name = getenv ("LC_ALL");
867   if (codeset_name == NULL || codeset_name[0] == '\0')
868     codeset_name = getenv ("LC_CTYPE");
869   if (codeset_name == NULL || codeset_name[0] == '\0')
870     codeset_name = getenv ("LANG");
871   if (codeset_name == NULL)
872     codeset_name = "";
873   else if (strchr (codeset_name, '.') !=  NULL)
874     codeset_name = strchr (codeset_name, '.') + 1;
875 # endif
876 
877   if (strcasecmp (codeset_name, "UTF-8") == 0
878       || strcasecmp (codeset_name, "UTF8") == 0)
879     dfa->is_utf8 = 1;
880 
881   /* We check exhaustively in the loop below if this charset is a
882      superset of ASCII.  */
883   dfa->map_notascii = 0;
884 #endif
885 
886 #ifdef RE_ENABLE_I18N
887   if (dfa->mb_cur_max > 1)
888     {
889       if (dfa->is_utf8)
890 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
891       else
892 	{
893 	  int i, j, ch;
894 
895 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
896 	  if (BE (dfa->sb_char == NULL, 0))
897 	    return REG_ESPACE;
898 
899 	  /* Set the bits corresponding to single byte chars.  */
900 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
901 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
902 	      {
903 		wint_t wch = __btowc (ch);
904 		if (wch != WEOF)
905 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
906 # ifndef _LIBC
907 		if (isascii (ch) && wch != ch)
908 		  dfa->map_notascii = 1;
909 # endif
910 	      }
911 	}
912     }
913 #endif
914 
915   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
916     return REG_ESPACE;
917   return REG_NOERROR;
918 }
919 
920 /* Initialize WORD_CHAR table, which indicate which character is
921    "word".  In this case "word" means that it is the word construction
922    character used by some operators like "\<", "\>", etc.  */
923 
924 static void
925 internal_function
init_word_char(re_dfa_t * dfa)926 init_word_char (re_dfa_t *dfa)
927 {
928   dfa->word_ops_used = 1;
929   int i = 0;
930   int ch = 0;
931   if (BE (dfa->map_notascii == 0, 1))
932     {
933       if (sizeof (dfa->word_char[0]) == 8)
934 	{
935           /* The extra temporaries here avoid "implicitly truncated"
936              warnings in the case when this is dead code, i.e. 32-bit.  */
937           const uint64_t wc0 = UINT64_C (0x03ff000000000000);
938           const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
939 	  dfa->word_char[0] = wc0;
940 	  dfa->word_char[1] = wc1;
941 	  i = 2;
942 	}
943       else if (sizeof (dfa->word_char[0]) == 4)
944 	{
945 	  dfa->word_char[0] = UINT32_C (0x00000000);
946 	  dfa->word_char[1] = UINT32_C (0x03ff0000);
947 	  dfa->word_char[2] = UINT32_C (0x87fffffe);
948 	  dfa->word_char[3] = UINT32_C (0x07fffffe);
949 	  i = 4;
950 	}
951       else
952 	abort ();
953       ch = 128;
954 
955       if (BE (dfa->is_utf8, 1))
956 	{
957 	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
958 	  return;
959 	}
960     }
961 
962   for (; i < BITSET_WORDS; ++i)
963     for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
964       if (isalnum (ch) || ch == '_')
965 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
966 }
967 
968 /* Free the work area which are only used while compiling.  */
969 
970 static void
free_workarea_compile(regex_t * preg)971 free_workarea_compile (regex_t *preg)
972 {
973   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
974   bin_tree_storage_t *storage, *next;
975   for (storage = dfa->str_tree_storage; storage; storage = next)
976     {
977       next = storage->next;
978       re_free (storage);
979     }
980   dfa->str_tree_storage = NULL;
981   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
982   dfa->str_tree = NULL;
983   re_free (dfa->org_indices);
984   dfa->org_indices = NULL;
985 }
986 
987 /* Create initial states for all contexts.  */
988 
989 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)990 create_initial_state (re_dfa_t *dfa)
991 {
992   int first, i;
993   reg_errcode_t err;
994   re_node_set init_nodes;
995 
996   /* Initial states have the epsilon closure of the node which is
997      the first node of the regular expression.  */
998   first = dfa->str_tree->first->node_idx;
999   dfa->init_node = first;
1000   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1001   if (BE (err != REG_NOERROR, 0))
1002     return err;
1003 
1004   /* The back-references which are in initial states can epsilon transit,
1005      since in this case all of the subexpressions can be null.
1006      Then we add epsilon closures of the nodes which are the next nodes of
1007      the back-references.  */
1008   if (dfa->nbackref > 0)
1009     for (i = 0; i < init_nodes.nelem; ++i)
1010       {
1011 	int node_idx = init_nodes.elems[i];
1012 	re_token_type_t type = dfa->nodes[node_idx].type;
1013 
1014 	int clexp_idx;
1015 	if (type != OP_BACK_REF)
1016 	  continue;
1017 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1018 	  {
1019 	    re_token_t *clexp_node;
1020 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1021 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1022 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1023 	      break;
1024 	  }
1025 	if (clexp_idx == init_nodes.nelem)
1026 	  continue;
1027 
1028 	if (type == OP_BACK_REF)
1029 	  {
1030 	    int dest_idx = dfa->edests[node_idx].elems[0];
1031 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1032 	      {
1033 		reg_errcode_t err = re_node_set_merge (&init_nodes,
1034 						       dfa->eclosures
1035 						       + dest_idx);
1036 		if (err != REG_NOERROR)
1037 		  return err;
1038 		i = 0;
1039 	      }
1040 	  }
1041       }
1042 
1043   /* It must be the first time to invoke acquire_state.  */
1044   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1045   /* We don't check ERR here, since the initial state must not be NULL.  */
1046   if (BE (dfa->init_state == NULL, 0))
1047     return err;
1048   if (dfa->init_state->has_constraint)
1049     {
1050       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1051 						       CONTEXT_WORD);
1052       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1053 						     CONTEXT_NEWLINE);
1054       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1055 							 &init_nodes,
1056 							 CONTEXT_NEWLINE
1057 							 | CONTEXT_BEGBUF);
1058       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1059 	      || dfa->init_state_begbuf == NULL, 0))
1060 	return err;
1061     }
1062   else
1063     dfa->init_state_word = dfa->init_state_nl
1064       = dfa->init_state_begbuf = dfa->init_state;
1065 
1066   re_node_set_free (&init_nodes);
1067   return REG_NOERROR;
1068 }
1069 
1070 #ifdef RE_ENABLE_I18N
1071 /* If it is possible to do searching in single byte encoding instead of UTF-8
1072    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1073    DFA nodes where needed.  */
1074 
1075 static void
optimize_utf8(re_dfa_t * dfa)1076 optimize_utf8 (re_dfa_t *dfa)
1077 {
1078   int node, i, mb_chars = 0, has_period = 0;
1079 
1080   for (node = 0; node < dfa->nodes_len; ++node)
1081     switch (dfa->nodes[node].type)
1082       {
1083       case CHARACTER:
1084 	if (dfa->nodes[node].opr.c >= 0x80)
1085 	  mb_chars = 1;
1086 	break;
1087       case ANCHOR:
1088 	switch (dfa->nodes[node].opr.ctx_type)
1089 	  {
1090 	  case LINE_FIRST:
1091 	  case LINE_LAST:
1092 	  case BUF_FIRST:
1093 	  case BUF_LAST:
1094 	    break;
1095 	  default:
1096 	    /* Word anchors etc. cannot be handled.  It's okay to test
1097 	       opr.ctx_type since constraints (for all DFA nodes) are
1098 	       created by ORing one or more opr.ctx_type values.  */
1099 	    return;
1100 	  }
1101 	break;
1102       case OP_PERIOD:
1103 	has_period = 1;
1104 	break;
1105       case OP_BACK_REF:
1106       case OP_ALT:
1107       case END_OF_RE:
1108       case OP_DUP_ASTERISK:
1109       case OP_OPEN_SUBEXP:
1110       case OP_CLOSE_SUBEXP:
1111 	break;
1112       case COMPLEX_BRACKET:
1113 	return;
1114       case SIMPLE_BRACKET:
1115 	/* Just double check.  The non-ASCII range starts at 0x80.  */
1116 	assert (0x80 % BITSET_WORD_BITS == 0);
1117 	for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1118 	  if (dfa->nodes[node].opr.sbcset[i])
1119 	    return;
1120 	break;
1121       default:
1122 	abort ();
1123       }
1124 
1125   if (mb_chars || has_period)
1126     for (node = 0; node < dfa->nodes_len; ++node)
1127       {
1128 	if (dfa->nodes[node].type == CHARACTER
1129 	    && dfa->nodes[node].opr.c >= 0x80)
1130 	  dfa->nodes[node].mb_partial = 0;
1131 	else if (dfa->nodes[node].type == OP_PERIOD)
1132 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1133       }
1134 
1135   /* The search can be in single byte locale.  */
1136   dfa->mb_cur_max = 1;
1137   dfa->is_utf8 = 0;
1138   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1139 }
1140 #endif
1141 
1142 /* Analyze the structure tree, and calculate "first", "next", "edest",
1143    "eclosure", and "inveclosure".  */
1144 
1145 static reg_errcode_t
analyze(regex_t * preg)1146 analyze (regex_t *preg)
1147 {
1148   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1149   reg_errcode_t ret;
1150 
1151   /* Allocate arrays.  */
1152   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1153   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1154   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1155   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1156   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1157 	  || dfa->eclosures == NULL, 0))
1158     return REG_ESPACE;
1159 
1160   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1161   if (dfa->subexp_map != NULL)
1162     {
1163       int i;
1164       for (i = 0; i < preg->re_nsub; i++)
1165 	dfa->subexp_map[i] = i;
1166       preorder (dfa->str_tree, optimize_subexps, dfa);
1167       for (i = 0; i < preg->re_nsub; i++)
1168 	if (dfa->subexp_map[i] != i)
1169 	  break;
1170       if (i == preg->re_nsub)
1171 	{
1172 	  free (dfa->subexp_map);
1173 	  dfa->subexp_map = NULL;
1174 	}
1175     }
1176 
1177   ret = postorder (dfa->str_tree, lower_subexps, preg);
1178   if (BE (ret != REG_NOERROR, 0))
1179     return ret;
1180   ret = postorder (dfa->str_tree, calc_first, dfa);
1181   if (BE (ret != REG_NOERROR, 0))
1182     return ret;
1183   preorder (dfa->str_tree, calc_next, dfa);
1184   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1185   if (BE (ret != REG_NOERROR, 0))
1186     return ret;
1187   ret = calc_eclosure (dfa);
1188   if (BE (ret != REG_NOERROR, 0))
1189     return ret;
1190 
1191   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1192      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1193   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1194       || dfa->nbackref)
1195     {
1196       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1197       if (BE (dfa->inveclosures == NULL, 0))
1198 	return REG_ESPACE;
1199       ret = calc_inveclosure (dfa);
1200     }
1201 
1202   return ret;
1203 }
1204 
1205 /* Our parse trees are very unbalanced, so we cannot use a stack to
1206    implement parse tree visits.  Instead, we use parent pointers and
1207    some hairy code in these two functions.  */
1208 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1209 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1210 	   void *extra)
1211 {
1212   bin_tree_t *node, *prev;
1213 
1214   for (node = root; ; )
1215     {
1216       /* Descend down the tree, preferably to the left (or to the right
1217 	 if that's the only child).  */
1218       while (node->left || node->right)
1219 	if (node->left)
1220 	  node = node->left;
1221 	else
1222 	  node = node->right;
1223 
1224       do
1225 	{
1226 	  reg_errcode_t err = fn (extra, node);
1227 	  if (BE (err != REG_NOERROR, 0))
1228 	    return err;
1229 	  if (node->parent == NULL)
1230 	    return REG_NOERROR;
1231 	  prev = node;
1232 	  node = node->parent;
1233 	}
1234       /* Go up while we have a node that is reached from the right.  */
1235       while (node->right == prev || node->right == NULL);
1236       node = node->right;
1237     }
1238 }
1239 
1240 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1241 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1242 	  void *extra)
1243 {
1244   bin_tree_t *node;
1245 
1246   for (node = root; ; )
1247     {
1248       reg_errcode_t err = fn (extra, node);
1249       if (BE (err != REG_NOERROR, 0))
1250 	return err;
1251 
1252       /* Go to the left node, or up and to the right.  */
1253       if (node->left)
1254 	node = node->left;
1255       else
1256 	{
1257 	  bin_tree_t *prev = NULL;
1258 	  while (node->right == prev || node->right == NULL)
1259 	    {
1260 	      prev = node;
1261 	      node = node->parent;
1262 	      if (!node)
1263 		return REG_NOERROR;
1264 	    }
1265 	  node = node->right;
1266 	}
1267     }
1268 }
1269 
1270 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1271    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1272    backreferences as well.  Requires a preorder visit.  */
1273 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1274 optimize_subexps (void *extra, bin_tree_t *node)
1275 {
1276   re_dfa_t *dfa = (re_dfa_t *) extra;
1277 
1278   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1279     {
1280       int idx = node->token.opr.idx;
1281       node->token.opr.idx = dfa->subexp_map[idx];
1282       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1283     }
1284 
1285   else if (node->token.type == SUBEXP
1286 	   && node->left && node->left->token.type == SUBEXP)
1287     {
1288       int other_idx = node->left->token.opr.idx;
1289 
1290       node->left = node->left->left;
1291       if (node->left)
1292 	node->left->parent = node;
1293 
1294       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1295       if (other_idx < BITSET_WORD_BITS)
1296 	  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1297     }
1298 
1299   return REG_NOERROR;
1300 }
1301 
1302 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1304 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1305 lower_subexps (void *extra, bin_tree_t *node)
1306 {
1307   regex_t *preg = (regex_t *) extra;
1308   reg_errcode_t err = REG_NOERROR;
1309 
1310   if (node->left && node->left->token.type == SUBEXP)
1311     {
1312       node->left = lower_subexp (&err, preg, node->left);
1313       if (node->left)
1314 	node->left->parent = node;
1315     }
1316   if (node->right && node->right->token.type == SUBEXP)
1317     {
1318       node->right = lower_subexp (&err, preg, node->right);
1319       if (node->right)
1320 	node->right->parent = node;
1321     }
1322 
1323   return err;
1324 }
1325 
1326 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1327 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1328 {
1329   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1330   bin_tree_t *body = node->left;
1331   bin_tree_t *op, *cls, *tree1, *tree;
1332 
1333   if (preg->no_sub
1334       /* We do not optimize empty subexpressions, because otherwise we may
1335 	 have bad CONCAT nodes with NULL children.  This is obviously not
1336 	 very common, so we do not lose much.  An example that triggers
1337 	 this case is the sed "script" /\(\)/x.  */
1338       && node->left != NULL
1339       && (node->token.opr.idx >= BITSET_WORD_BITS
1340 	  || !(dfa->used_bkref_map
1341 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1342     return node->left;
1343 
1344   /* Convert the SUBEXP node to the concatenation of an
1345      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1346   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1347   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1348   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1349   tree = create_tree (dfa, op, tree1, CONCAT);
1350   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1351     {
1352       *err = REG_ESPACE;
1353       return NULL;
1354     }
1355 
1356   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1357   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1358   return tree;
1359 }
1360 
1361 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362    nodes.  Requires a postorder visit.  */
1363 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1364 calc_first (void *extra, bin_tree_t *node)
1365 {
1366   re_dfa_t *dfa = (re_dfa_t *) extra;
1367   if (node->token.type == CONCAT)
1368     {
1369       node->first = node->left->first;
1370       node->node_idx = node->left->node_idx;
1371     }
1372   else
1373     {
1374       node->first = node;
1375       node->node_idx = re_dfa_add_node (dfa, node->token);
1376       if (BE (node->node_idx == -1, 0))
1377 	return REG_ESPACE;
1378       if (node->token.type == ANCHOR)
1379 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1380     }
1381   return REG_NOERROR;
1382 }
1383 
1384 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1385 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1386 calc_next (void *extra, bin_tree_t *node)
1387 {
1388   switch (node->token.type)
1389     {
1390     case OP_DUP_ASTERISK:
1391       node->left->next = node;
1392       break;
1393     case CONCAT:
1394       node->left->next = node->right->first;
1395       node->right->next = node->next;
1396       break;
1397     default:
1398       if (node->left)
1399 	node->left->next = node->next;
1400       if (node->right)
1401 	node->right->next = node->next;
1402       break;
1403     }
1404   return REG_NOERROR;
1405 }
1406 
1407 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1408 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1409 link_nfa_nodes (void *extra, bin_tree_t *node)
1410 {
1411   re_dfa_t *dfa = (re_dfa_t *) extra;
1412   int idx = node->node_idx;
1413   reg_errcode_t err = REG_NOERROR;
1414 
1415   switch (node->token.type)
1416     {
1417     case CONCAT:
1418       break;
1419 
1420     case END_OF_RE:
1421       assert (node->next == NULL);
1422       break;
1423 
1424     case OP_DUP_ASTERISK:
1425     case OP_ALT:
1426       {
1427 	int left, right;
1428 	dfa->has_plural_match = 1;
1429 	if (node->left != NULL)
1430 	  left = node->left->first->node_idx;
1431 	else
1432 	  left = node->next->node_idx;
1433 	if (node->right != NULL)
1434 	  right = node->right->first->node_idx;
1435 	else
1436 	  right = node->next->node_idx;
1437 	assert (left > -1);
1438 	assert (right > -1);
1439 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1440       }
1441       break;
1442 
1443     case ANCHOR:
1444     case OP_OPEN_SUBEXP:
1445     case OP_CLOSE_SUBEXP:
1446       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1447       break;
1448 
1449     case OP_BACK_REF:
1450       dfa->nexts[idx] = node->next->node_idx;
1451       if (node->token.type == OP_BACK_REF)
1452 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1453       break;
1454 
1455     default:
1456       assert (!IS_EPSILON_NODE (node->token.type));
1457       dfa->nexts[idx] = node->next->node_idx;
1458       break;
1459     }
1460 
1461   return err;
1462 }
1463 
1464 /* Duplicate the epsilon closure of the node ROOT_NODE.
1465    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1466    to their own constraint.  */
1467 
1468 static reg_errcode_t
1469 internal_function
duplicate_node_closure(re_dfa_t * dfa,int top_org_node,int top_clone_node,int root_node,unsigned int init_constraint)1470 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1471 			int root_node, unsigned int init_constraint)
1472 {
1473   int org_node, clone_node, ret;
1474   unsigned int constraint = init_constraint;
1475   for (org_node = top_org_node, clone_node = top_clone_node;;)
1476     {
1477       int org_dest, clone_dest;
1478       if (dfa->nodes[org_node].type == OP_BACK_REF)
1479 	{
1480 	  /* If the back reference epsilon-transit, its destination must
1481 	     also have the constraint.  Then duplicate the epsilon closure
1482 	     of the destination of the back reference, and store it in
1483 	     edests of the back reference.  */
1484 	  org_dest = dfa->nexts[org_node];
1485 	  re_node_set_empty (dfa->edests + clone_node);
1486 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1487 	  if (BE (clone_dest == -1, 0))
1488 	    return REG_ESPACE;
1489 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1490 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1491 	  if (BE (ret < 0, 0))
1492 	    return REG_ESPACE;
1493 	}
1494       else if (dfa->edests[org_node].nelem == 0)
1495 	{
1496 	  /* In case of the node can't epsilon-transit, don't duplicate the
1497 	     destination and store the original destination as the
1498 	     destination of the node.  */
1499 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1500 	  break;
1501 	}
1502       else if (dfa->edests[org_node].nelem == 1)
1503 	{
1504 	  /* In case of the node can epsilon-transit, and it has only one
1505 	     destination.  */
1506 	  org_dest = dfa->edests[org_node].elems[0];
1507 	  re_node_set_empty (dfa->edests + clone_node);
1508 	  /* If the node is root_node itself, it means the epsilon clsoure
1509 	     has a loop.   Then tie it to the destination of the root_node.  */
1510 	  if (org_node == root_node && clone_node != org_node)
1511 	    {
1512 	      ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1513 	      if (BE (ret < 0, 0))
1514 		return REG_ESPACE;
1515 	      break;
1516 	    }
1517 	  /* In case of the node has another constraint, add it.  */
1518 	  constraint |= dfa->nodes[org_node].constraint;
1519 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1520 	  if (BE (clone_dest == -1, 0))
1521 	    return REG_ESPACE;
1522 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1523 	  if (BE (ret < 0, 0))
1524 	    return REG_ESPACE;
1525 	}
1526       else /* dfa->edests[org_node].nelem == 2 */
1527 	{
1528 	  /* In case of the node can epsilon-transit, and it has two
1529 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1530 	  org_dest = dfa->edests[org_node].elems[0];
1531 	  re_node_set_empty (dfa->edests + clone_node);
1532 	  /* Search for a duplicated node which satisfies the constraint.  */
1533 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1534 	  if (clone_dest == -1)
1535 	    {
1536 	      /* There is no such duplicated node, create a new one.  */
1537 	      reg_errcode_t err;
1538 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1539 	      if (BE (clone_dest == -1, 0))
1540 		return REG_ESPACE;
1541 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542 	      if (BE (ret < 0, 0))
1543 		return REG_ESPACE;
1544 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1545 					    root_node, constraint);
1546 	      if (BE (err != REG_NOERROR, 0))
1547 		return err;
1548 	    }
1549 	  else
1550 	    {
1551 	      /* There is a duplicated node which satisfies the constraint,
1552 		 use it to avoid infinite loop.  */
1553 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554 	      if (BE (ret < 0, 0))
1555 		return REG_ESPACE;
1556 	    }
1557 
1558 	  org_dest = dfa->edests[org_node].elems[1];
1559 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1560 	  if (BE (clone_dest == -1, 0))
1561 	    return REG_ESPACE;
1562 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563 	  if (BE (ret < 0, 0))
1564 	    return REG_ESPACE;
1565 	}
1566       org_node = org_dest;
1567       clone_node = clone_dest;
1568     }
1569   return REG_NOERROR;
1570 }
1571 
1572 /* Search for a node which is duplicated from the node ORG_NODE, and
1573    satisfies the constraint CONSTRAINT.  */
1574 
1575 static int
search_duplicated_node(const re_dfa_t * dfa,int org_node,unsigned int constraint)1576 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1577 			unsigned int constraint)
1578 {
1579   int idx;
1580   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1581     {
1582       if (org_node == dfa->org_indices[idx]
1583 	  && constraint == dfa->nodes[idx].constraint)
1584 	return idx; /* Found.  */
1585     }
1586   return -1; /* Not found.  */
1587 }
1588 
1589 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1590    Return the index of the new node, or -1 if insufficient storage is
1591    available.  */
1592 
1593 static int
duplicate_node(re_dfa_t * dfa,int org_idx,unsigned int constraint)1594 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1595 {
1596   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1597   if (BE (dup_idx != -1, 1))
1598     {
1599       dfa->nodes[dup_idx].constraint = constraint;
1600       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1601       dfa->nodes[dup_idx].duplicated = 1;
1602 
1603       /* Store the index of the original node.  */
1604       dfa->org_indices[dup_idx] = org_idx;
1605     }
1606   return dup_idx;
1607 }
1608 
1609 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1610 calc_inveclosure (re_dfa_t *dfa)
1611 {
1612   int src, idx, ret;
1613   for (idx = 0; idx < dfa->nodes_len; ++idx)
1614     re_node_set_init_empty (dfa->inveclosures + idx);
1615 
1616   for (src = 0; src < dfa->nodes_len; ++src)
1617     {
1618       int *elems = dfa->eclosures[src].elems;
1619       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1620 	{
1621 	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1622 	  if (BE (ret == -1, 0))
1623 	    return REG_ESPACE;
1624 	}
1625     }
1626 
1627   return REG_NOERROR;
1628 }
1629 
1630 /* Calculate "eclosure" for all the node in DFA.  */
1631 
1632 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1633 calc_eclosure (re_dfa_t *dfa)
1634 {
1635   int node_idx, incomplete;
1636 #ifdef DEBUG
1637   assert (dfa->nodes_len > 0);
1638 #endif
1639   incomplete = 0;
1640   /* For each nodes, calculate epsilon closure.  */
1641   for (node_idx = 0; ; ++node_idx)
1642     {
1643       reg_errcode_t err;
1644       re_node_set eclosure_elem;
1645       if (node_idx == dfa->nodes_len)
1646 	{
1647 	  if (!incomplete)
1648 	    break;
1649 	  incomplete = 0;
1650 	  node_idx = 0;
1651 	}
1652 
1653 #ifdef DEBUG
1654       assert (dfa->eclosures[node_idx].nelem != -1);
1655 #endif
1656 
1657       /* If we have already calculated, skip it.  */
1658       if (dfa->eclosures[node_idx].nelem != 0)
1659 	continue;
1660       /* Calculate epsilon closure of `node_idx'.  */
1661       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1662       if (BE (err != REG_NOERROR, 0))
1663 	return err;
1664 
1665       if (dfa->eclosures[node_idx].nelem == 0)
1666 	{
1667 	  incomplete = 1;
1668 	  re_node_set_free (&eclosure_elem);
1669 	}
1670     }
1671   return REG_NOERROR;
1672 }
1673 
1674 /* Calculate epsilon closure of NODE.  */
1675 
1676 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,int node,int root)1677 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1678 {
1679   reg_errcode_t err;
1680   int i;
1681   re_node_set eclosure;
1682   int ret;
1683   int incomplete = 0;
1684   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1685   if (BE (err != REG_NOERROR, 0))
1686     return err;
1687 
1688   /* This indicates that we are calculating this node now.
1689      We reference this value to avoid infinite loop.  */
1690   dfa->eclosures[node].nelem = -1;
1691 
1692   /* If the current node has constraints, duplicate all nodes
1693      since they must inherit the constraints.  */
1694   if (dfa->nodes[node].constraint
1695       && dfa->edests[node].nelem
1696       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1697     {
1698       err = duplicate_node_closure (dfa, node, node, node,
1699 				    dfa->nodes[node].constraint);
1700       if (BE (err != REG_NOERROR, 0))
1701 	return err;
1702     }
1703 
1704   /* Expand each epsilon destination nodes.  */
1705   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1706     for (i = 0; i < dfa->edests[node].nelem; ++i)
1707       {
1708 	re_node_set eclosure_elem;
1709 	int edest = dfa->edests[node].elems[i];
1710 	/* If calculating the epsilon closure of `edest' is in progress,
1711 	   return intermediate result.  */
1712 	if (dfa->eclosures[edest].nelem == -1)
1713 	  {
1714 	    incomplete = 1;
1715 	    continue;
1716 	  }
1717 	/* If we haven't calculated the epsilon closure of `edest' yet,
1718 	   calculate now. Otherwise use calculated epsilon closure.  */
1719 	if (dfa->eclosures[edest].nelem == 0)
1720 	  {
1721 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1722 	    if (BE (err != REG_NOERROR, 0))
1723 	      return err;
1724 	  }
1725 	else
1726 	  eclosure_elem = dfa->eclosures[edest];
1727 	/* Merge the epsilon closure of `edest'.  */
1728 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1729 	if (BE (err != REG_NOERROR, 0))
1730 	  return err;
1731 	/* If the epsilon closure of `edest' is incomplete,
1732 	   the epsilon closure of this node is also incomplete.  */
1733 	if (dfa->eclosures[edest].nelem == 0)
1734 	  {
1735 	    incomplete = 1;
1736 	    re_node_set_free (&eclosure_elem);
1737 	  }
1738       }
1739 
1740   /* An epsilon closure includes itself.  */
1741   ret = re_node_set_insert (&eclosure, node);
1742   if (BE (ret < 0, 0))
1743     return REG_ESPACE;
1744   if (incomplete && !root)
1745     dfa->eclosures[node].nelem = 0;
1746   else
1747     dfa->eclosures[node] = eclosure;
1748   *new_set = eclosure;
1749   return REG_NOERROR;
1750 }
1751 
1752 /* Functions for token which are used in the parser.  */
1753 
1754 /* Fetch a token from INPUT.
1755    We must not use this function inside bracket expressions.  */
1756 
1757 static void
1758 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1759 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1760 {
1761   re_string_skip_bytes (input, peek_token (result, input, syntax));
1762 }
1763 
1764 /* Peek a token from INPUT, and return the length of the token.
1765    We must not use this function inside bracket expressions.  */
1766 
1767 static int
1768 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1769 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1770 {
1771   unsigned char c;
1772 
1773   if (re_string_eoi (input))
1774     {
1775       token->type = END_OF_RE;
1776       return 0;
1777     }
1778 
1779   c = re_string_peek_byte (input, 0);
1780   token->opr.c = c;
1781 
1782   token->word_char = 0;
1783 #ifdef RE_ENABLE_I18N
1784   token->mb_partial = 0;
1785   if (input->mb_cur_max > 1 &&
1786       !re_string_first_byte (input, re_string_cur_idx (input)))
1787     {
1788       token->type = CHARACTER;
1789       token->mb_partial = 1;
1790       return 1;
1791     }
1792 #endif
1793   if (c == '\\')
1794     {
1795       unsigned char c2;
1796       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1797 	{
1798 	  token->type = BACK_SLASH;
1799 	  return 1;
1800 	}
1801 
1802       c2 = re_string_peek_byte_case (input, 1);
1803       token->opr.c = c2;
1804       token->type = CHARACTER;
1805 #ifdef RE_ENABLE_I18N
1806       if (input->mb_cur_max > 1)
1807 	{
1808 	  wint_t wc = re_string_wchar_at (input,
1809 					  re_string_cur_idx (input) + 1);
1810 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1811 	}
1812       else
1813 #endif
1814 	token->word_char = IS_WORD_CHAR (c2) != 0;
1815 
1816       switch (c2)
1817 	{
1818 	case '|':
1819 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1820 	    token->type = OP_ALT;
1821 	  break;
1822 	case '1': case '2': case '3': case '4': case '5':
1823 	case '6': case '7': case '8': case '9':
1824 	  if (!(syntax & RE_NO_BK_REFS))
1825 	    {
1826 	      token->type = OP_BACK_REF;
1827 	      token->opr.idx = c2 - '1';
1828 	    }
1829 	  break;
1830 	case '<':
1831 	  if (!(syntax & RE_NO_GNU_OPS))
1832 	    {
1833 	      token->type = ANCHOR;
1834 	      token->opr.ctx_type = WORD_FIRST;
1835 	    }
1836 	  break;
1837 	case '>':
1838 	  if (!(syntax & RE_NO_GNU_OPS))
1839 	    {
1840 	      token->type = ANCHOR;
1841 	      token->opr.ctx_type = WORD_LAST;
1842 	    }
1843 	  break;
1844 	case 'b':
1845 	  if (!(syntax & RE_NO_GNU_OPS))
1846 	    {
1847 	      token->type = ANCHOR;
1848 	      token->opr.ctx_type = WORD_DELIM;
1849 	    }
1850 	  break;
1851 	case 'B':
1852 	  if (!(syntax & RE_NO_GNU_OPS))
1853 	    {
1854 	      token->type = ANCHOR;
1855 	      token->opr.ctx_type = NOT_WORD_DELIM;
1856 	    }
1857 	  break;
1858 	case 'w':
1859 	  if (!(syntax & RE_NO_GNU_OPS))
1860 	    token->type = OP_WORD;
1861 	  break;
1862 	case 'W':
1863 	  if (!(syntax & RE_NO_GNU_OPS))
1864 	    token->type = OP_NOTWORD;
1865 	  break;
1866 	case 's':
1867 	  if (!(syntax & RE_NO_GNU_OPS))
1868 	    token->type = OP_SPACE;
1869 	  break;
1870 	case 'S':
1871 	  if (!(syntax & RE_NO_GNU_OPS))
1872 	    token->type = OP_NOTSPACE;
1873 	  break;
1874 	case '`':
1875 	  if (!(syntax & RE_NO_GNU_OPS))
1876 	    {
1877 	      token->type = ANCHOR;
1878 	      token->opr.ctx_type = BUF_FIRST;
1879 	    }
1880 	  break;
1881 	case '\'':
1882 	  if (!(syntax & RE_NO_GNU_OPS))
1883 	    {
1884 	      token->type = ANCHOR;
1885 	      token->opr.ctx_type = BUF_LAST;
1886 	    }
1887 	  break;
1888 	case '(':
1889 	  if (!(syntax & RE_NO_BK_PARENS))
1890 	    token->type = OP_OPEN_SUBEXP;
1891 	  break;
1892 	case ')':
1893 	  if (!(syntax & RE_NO_BK_PARENS))
1894 	    token->type = OP_CLOSE_SUBEXP;
1895 	  break;
1896 	case '+':
1897 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1898 	    token->type = OP_DUP_PLUS;
1899 	  break;
1900 	case '?':
1901 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902 	    token->type = OP_DUP_QUESTION;
1903 	  break;
1904 	case '{':
1905 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1906 	    token->type = OP_OPEN_DUP_NUM;
1907 	  break;
1908 	case '}':
1909 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910 	    token->type = OP_CLOSE_DUP_NUM;
1911 	  break;
1912 	default:
1913 	  break;
1914 	}
1915       return 2;
1916     }
1917 
1918   token->type = CHARACTER;
1919 #ifdef RE_ENABLE_I18N
1920   if (input->mb_cur_max > 1)
1921     {
1922       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1923       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1924     }
1925   else
1926 #endif
1927     token->word_char = IS_WORD_CHAR (token->opr.c);
1928 
1929   switch (c)
1930     {
1931     case '\n':
1932       if (syntax & RE_NEWLINE_ALT)
1933 	token->type = OP_ALT;
1934       break;
1935     case '|':
1936       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1937 	token->type = OP_ALT;
1938       break;
1939     case '*':
1940       token->type = OP_DUP_ASTERISK;
1941       break;
1942     case '+':
1943       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1944 	token->type = OP_DUP_PLUS;
1945       break;
1946     case '?':
1947       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948 	token->type = OP_DUP_QUESTION;
1949       break;
1950     case '{':
1951       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1952 	token->type = OP_OPEN_DUP_NUM;
1953       break;
1954     case '}':
1955       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956 	token->type = OP_CLOSE_DUP_NUM;
1957       break;
1958     case '(':
1959       if (syntax & RE_NO_BK_PARENS)
1960 	token->type = OP_OPEN_SUBEXP;
1961       break;
1962     case ')':
1963       if (syntax & RE_NO_BK_PARENS)
1964 	token->type = OP_CLOSE_SUBEXP;
1965       break;
1966     case '[':
1967       token->type = OP_OPEN_BRACKET;
1968       break;
1969     case '.':
1970       token->type = OP_PERIOD;
1971       break;
1972     case '^':
1973       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1974 	  re_string_cur_idx (input) != 0)
1975 	{
1976 	  char prev = re_string_peek_byte (input, -1);
1977 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1978 	    break;
1979 	}
1980       token->type = ANCHOR;
1981       token->opr.ctx_type = LINE_FIRST;
1982       break;
1983     case '$':
1984       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1985 	  re_string_cur_idx (input) + 1 != re_string_length (input))
1986 	{
1987 	  re_token_t next;
1988 	  re_string_skip_bytes (input, 1);
1989 	  peek_token (&next, input, syntax);
1990 	  re_string_skip_bytes (input, -1);
1991 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1992 	    break;
1993 	}
1994       token->type = ANCHOR;
1995       token->opr.ctx_type = LINE_LAST;
1996       break;
1997     default:
1998       break;
1999     }
2000   return 1;
2001 }
2002 
2003 /* Peek a token from INPUT, and return the length of the token.
2004    We must not use this function out of bracket expressions.  */
2005 
2006 static int
2007 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)2008 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2009 {
2010   unsigned char c;
2011   if (re_string_eoi (input))
2012     {
2013       token->type = END_OF_RE;
2014       return 0;
2015     }
2016   c = re_string_peek_byte (input, 0);
2017   token->opr.c = c;
2018 
2019 #ifdef RE_ENABLE_I18N
2020   if (input->mb_cur_max > 1 &&
2021       !re_string_first_byte (input, re_string_cur_idx (input)))
2022     {
2023       token->type = CHARACTER;
2024       return 1;
2025     }
2026 #endif /* RE_ENABLE_I18N */
2027 
2028   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2029       && re_string_cur_idx (input) + 1 < re_string_length (input))
2030     {
2031       /* In this case, '\' escape a character.  */
2032       unsigned char c2;
2033       re_string_skip_bytes (input, 1);
2034       c2 = re_string_peek_byte (input, 0);
2035       token->opr.c = c2;
2036       token->type = CHARACTER;
2037       return 1;
2038     }
2039   if (c == '[') /* '[' is a special char in a bracket exps.  */
2040     {
2041       unsigned char c2;
2042       int token_len;
2043       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2044 	c2 = re_string_peek_byte (input, 1);
2045       else
2046 	c2 = 0;
2047       token->opr.c = c2;
2048       token_len = 2;
2049       switch (c2)
2050 	{
2051 	case '.':
2052 	  token->type = OP_OPEN_COLL_ELEM;
2053 	  break;
2054 	case '=':
2055 	  token->type = OP_OPEN_EQUIV_CLASS;
2056 	  break;
2057 	case ':':
2058 	  if (syntax & RE_CHAR_CLASSES)
2059 	    {
2060 	      token->type = OP_OPEN_CHAR_CLASS;
2061 	      break;
2062 	    }
2063 	  /* else fall through.  */
2064 	default:
2065 	  token->type = CHARACTER;
2066 	  token->opr.c = c;
2067 	  token_len = 1;
2068 	  break;
2069 	}
2070       return token_len;
2071     }
2072   switch (c)
2073     {
2074     case '-':
2075       token->type = OP_CHARSET_RANGE;
2076       break;
2077     case ']':
2078       token->type = OP_CLOSE_BRACKET;
2079       break;
2080     case '^':
2081       token->type = OP_NON_MATCH_LIST;
2082       break;
2083     default:
2084       token->type = CHARACTER;
2085     }
2086   return 1;
2087 }
2088 
2089 /* Functions for parser.  */
2090 
2091 /* Entry point of the parser.
2092    Parse the regular expression REGEXP and return the structure tree.
2093    If an error is occured, ERR is set by error code, and return NULL.
2094    This function build the following tree, from regular expression <reg_exp>:
2095 	   CAT
2096 	   / \
2097 	  /   \
2098    <reg_exp>  EOR
2099 
2100    CAT means concatenation.
2101    EOR means end of regular expression.  */
2102 
2103 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2104 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2105        reg_errcode_t *err)
2106 {
2107   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2108   bin_tree_t *tree, *eor, *root;
2109   re_token_t current_token;
2110   dfa->syntax = syntax;
2111   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2112   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2113   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2114     return NULL;
2115   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2116   if (tree != NULL)
2117     root = create_tree (dfa, tree, eor, CONCAT);
2118   else
2119     root = eor;
2120   if (BE (eor == NULL || root == NULL, 0))
2121     {
2122       *err = REG_ESPACE;
2123       return NULL;
2124     }
2125   return root;
2126 }
2127 
2128 /* This function build the following tree, from regular expression
2129    <branch1>|<branch2>:
2130 	   ALT
2131 	   / \
2132 	  /   \
2133    <branch1> <branch2>
2134 
2135    ALT means alternative, which represents the operator `|'.  */
2136 
2137 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2138 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2139 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2140 {
2141   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2142   bin_tree_t *tree, *branch = NULL;
2143   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2144   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2145     return NULL;
2146 
2147   while (token->type == OP_ALT)
2148     {
2149       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2150       if (token->type != OP_ALT && token->type != END_OF_RE
2151 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2152 	{
2153 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2154 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2155 	    return NULL;
2156 	}
2157       else
2158 	branch = NULL;
2159       tree = create_tree (dfa, tree, branch, OP_ALT);
2160       if (BE (tree == NULL, 0))
2161 	{
2162 	  *err = REG_ESPACE;
2163 	  return NULL;
2164 	}
2165     }
2166   return tree;
2167 }
2168 
2169 /* This function build the following tree, from regular expression
2170    <exp1><exp2>:
2171 	CAT
2172 	/ \
2173        /   \
2174    <exp1> <exp2>
2175 
2176    CAT means concatenation.  */
2177 
2178 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2179 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2180 	      reg_syntax_t syntax, int nest, reg_errcode_t *err)
2181 {
2182   bin_tree_t *tree, *exp;
2183   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2184   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2185   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2186     return NULL;
2187 
2188   while (token->type != OP_ALT && token->type != END_OF_RE
2189 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2190     {
2191       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2192       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2193 	{
2194 	  if (tree != NULL)
2195 	    postorder (tree, free_tree, NULL);
2196 	  return NULL;
2197 	}
2198       if (tree != NULL && exp != NULL)
2199 	{
2200 	  bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2201 	  if (newtree == NULL)
2202 	    {
2203 	      postorder (exp, free_tree, NULL);
2204 	      postorder (tree, free_tree, NULL);
2205 	      *err = REG_ESPACE;
2206 	      return NULL;
2207 	    }
2208 	  tree = newtree;
2209 	}
2210       else if (tree == NULL)
2211 	tree = exp;
2212       /* Otherwise exp == NULL, we don't need to create new tree.  */
2213     }
2214   return tree;
2215 }
2216 
2217 /* This function build the following tree, from regular expression a*:
2218 	 *
2219 	 |
2220 	 a
2221 */
2222 
2223 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2224 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225 		  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2226 {
2227   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228   bin_tree_t *tree;
2229   switch (token->type)
2230     {
2231     case CHARACTER:
2232       tree = create_token_tree (dfa, NULL, NULL, token);
2233       if (BE (tree == NULL, 0))
2234 	{
2235 	  *err = REG_ESPACE;
2236 	  return NULL;
2237 	}
2238 #ifdef RE_ENABLE_I18N
2239       if (dfa->mb_cur_max > 1)
2240 	{
2241 	  while (!re_string_eoi (regexp)
2242 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2243 	    {
2244 	      bin_tree_t *mbc_remain;
2245 	      fetch_token (token, regexp, syntax);
2246 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2249 		{
2250 		  *err = REG_ESPACE;
2251 		  return NULL;
2252 		}
2253 	    }
2254 	}
2255 #endif
2256       break;
2257     case OP_OPEN_SUBEXP:
2258       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260 	return NULL;
2261       break;
2262     case OP_OPEN_BRACKET:
2263       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265 	return NULL;
2266       break;
2267     case OP_BACK_REF:
2268       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2269 	{
2270 	  *err = REG_ESUBREG;
2271 	  return NULL;
2272 	}
2273       dfa->used_bkref_map |= 1 << token->opr.idx;
2274       tree = create_token_tree (dfa, NULL, NULL, token);
2275       if (BE (tree == NULL, 0))
2276 	{
2277 	  *err = REG_ESPACE;
2278 	  return NULL;
2279 	}
2280       ++dfa->nbackref;
2281       dfa->has_mb_node = 1;
2282       break;
2283     case OP_OPEN_DUP_NUM:
2284       if (syntax & RE_CONTEXT_INVALID_DUP)
2285 	{
2286 	  *err = REG_BADRPT;
2287 	  return NULL;
2288 	}
2289       /* FALLTHROUGH */
2290     case OP_DUP_ASTERISK:
2291     case OP_DUP_PLUS:
2292     case OP_DUP_QUESTION:
2293       if (syntax & RE_CONTEXT_INVALID_OPS)
2294 	{
2295 	  *err = REG_BADRPT;
2296 	  return NULL;
2297 	}
2298       else if (syntax & RE_CONTEXT_INDEP_OPS)
2299 	{
2300 	  fetch_token (token, regexp, syntax);
2301 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2302 	}
2303       /* else fall through  */
2304     case OP_CLOSE_SUBEXP:
2305       if ((token->type == OP_CLOSE_SUBEXP) &&
2306 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2307 	{
2308 	  *err = REG_ERPAREN;
2309 	  return NULL;
2310 	}
2311       /* else fall through  */
2312     case OP_CLOSE_DUP_NUM:
2313       /* We treat it as a normal character.  */
2314 
2315       /* Then we can these characters as normal characters.  */
2316       token->type = CHARACTER;
2317       /* mb_partial and word_char bits should be initialized already
2318 	 by peek_token.  */
2319       tree = create_token_tree (dfa, NULL, NULL, token);
2320       if (BE (tree == NULL, 0))
2321 	{
2322 	  *err = REG_ESPACE;
2323 	  return NULL;
2324 	}
2325       break;
2326     case ANCHOR:
2327       if ((token->opr.ctx_type
2328 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329 	  && dfa->word_ops_used == 0)
2330 	init_word_char (dfa);
2331       if (token->opr.ctx_type == WORD_DELIM
2332 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2333 	{
2334 	  bin_tree_t *tree_first, *tree_last;
2335 	  if (token->opr.ctx_type == WORD_DELIM)
2336 	    {
2337 	      token->opr.ctx_type = WORD_FIRST;
2338 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2339 	      token->opr.ctx_type = WORD_LAST;
2340 	    }
2341 	  else
2342 	    {
2343 	      token->opr.ctx_type = INSIDE_WORD;
2344 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2345 	      token->opr.ctx_type = INSIDE_NOTWORD;
2346 	    }
2347 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2348 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2350 	    {
2351 	      *err = REG_ESPACE;
2352 	      return NULL;
2353 	    }
2354 	}
2355       else
2356 	{
2357 	  tree = create_token_tree (dfa, NULL, NULL, token);
2358 	  if (BE (tree == NULL, 0))
2359 	    {
2360 	      *err = REG_ESPACE;
2361 	      return NULL;
2362 	    }
2363 	}
2364       /* We must return here, since ANCHORs can't be followed
2365 	 by repetition operators.
2366 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2368       fetch_token (token, regexp, syntax);
2369       return tree;
2370     case OP_PERIOD:
2371       tree = create_token_tree (dfa, NULL, NULL, token);
2372       if (BE (tree == NULL, 0))
2373 	{
2374 	  *err = REG_ESPACE;
2375 	  return NULL;
2376 	}
2377       if (dfa->mb_cur_max > 1)
2378 	dfa->has_mb_node = 1;
2379       break;
2380     case OP_WORD:
2381     case OP_NOTWORD:
2382       tree = build_charclass_op (dfa, regexp->trans,
2383 				 (const unsigned char *) "alnum",
2384 				 (const unsigned char *) "_",
2385 				 token->type == OP_NOTWORD, err);
2386       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387 	return NULL;
2388       break;
2389     case OP_SPACE:
2390     case OP_NOTSPACE:
2391       tree = build_charclass_op (dfa, regexp->trans,
2392 				 (const unsigned char *) "space",
2393 				 (const unsigned char *) "",
2394 				 token->type == OP_NOTSPACE, err);
2395       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396 	return NULL;
2397       break;
2398     case OP_ALT:
2399     case END_OF_RE:
2400       return NULL;
2401     case BACK_SLASH:
2402       *err = REG_EESCAPE;
2403       return NULL;
2404     default:
2405       /* Must not happen?  */
2406 #ifdef DEBUG
2407       assert (0);
2408 #endif
2409       return NULL;
2410     }
2411   fetch_token (token, regexp, syntax);
2412 
2413   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2415     {
2416       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418 	return NULL;
2419       /* In BRE consecutive duplications are not allowed.  */
2420       if ((syntax & RE_CONTEXT_INVALID_DUP)
2421 	  && (token->type == OP_DUP_ASTERISK
2422 	      || token->type == OP_OPEN_DUP_NUM))
2423 	{
2424 	  *err = REG_BADRPT;
2425 	  return NULL;
2426 	}
2427     }
2428 
2429   return tree;
2430 }
2431 
2432 /* This function build the following tree, from regular expression
2433    (<reg_exp>):
2434 	 SUBEXP
2435 	    |
2436 	<reg_exp>
2437 */
2438 
2439 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2442 {
2443   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444   bin_tree_t *tree;
2445   size_t cur_nsub;
2446   cur_nsub = preg->re_nsub++;
2447 
2448   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2449 
2450   /* The subexpression may be a null string.  */
2451   if (token->type == OP_CLOSE_SUBEXP)
2452     tree = NULL;
2453   else
2454     {
2455       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457 	{
2458 	  if (tree != NULL)
2459 	    postorder (tree, free_tree, NULL);
2460 	  *err = REG_EPAREN;
2461 	}
2462       if (BE (*err != REG_NOERROR, 0))
2463 	return NULL;
2464     }
2465 
2466   if (cur_nsub <= '9' - '1')
2467     dfa->completed_bkref_map |= 1 << cur_nsub;
2468 
2469   tree = create_tree (dfa, tree, NULL, SUBEXP);
2470   if (BE (tree == NULL, 0))
2471     {
2472       *err = REG_ESPACE;
2473       return NULL;
2474     }
2475   tree->token.opr.idx = cur_nsub;
2476   return tree;
2477 }
2478 
2479 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2480 
2481 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2482 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2483 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2484 {
2485   bin_tree_t *tree = NULL, *old_tree = NULL;
2486   int i, start, end, start_idx = re_string_cur_idx (regexp);
2487   re_token_t start_token = *token;
2488 
2489   if (token->type == OP_OPEN_DUP_NUM)
2490     {
2491       end = 0;
2492       start = fetch_number (regexp, token, syntax);
2493       if (start == -1)
2494 	{
2495 	  if (token->type == CHARACTER && token->opr.c == ',')
2496 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2497 	  else
2498 	    {
2499 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2500 	      return NULL;
2501 	    }
2502 	}
2503       if (BE (start != -2, 1))
2504 	{
2505 	  /* We treat "{n}" as "{n,n}".  */
2506 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2507 		 : ((token->type == CHARACTER && token->opr.c == ',')
2508 		    ? fetch_number (regexp, token, syntax) : -2));
2509 	}
2510       if (BE (start == -2 || end == -2, 0))
2511 	{
2512 	  /* Invalid sequence.  */
2513 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2514 	    {
2515 	      if (token->type == END_OF_RE)
2516 		*err = REG_EBRACE;
2517 	      else
2518 		*err = REG_BADBR;
2519 
2520 	      return NULL;
2521 	    }
2522 
2523 	  /* If the syntax bit is set, rollback.  */
2524 	  re_string_set_index (regexp, start_idx);
2525 	  *token = start_token;
2526 	  token->type = CHARACTER;
2527 	  /* mb_partial and word_char bits should be already initialized by
2528 	     peek_token.  */
2529 	  return elem;
2530 	}
2531 
2532       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2533 	{
2534 	  /* First number greater than second.  */
2535 	  *err = REG_BADBR;
2536 	  return NULL;
2537 	}
2538     }
2539   else
2540     {
2541       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2542       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2543     }
2544 
2545   fetch_token (token, regexp, syntax);
2546 
2547   if (BE (elem == NULL, 0))
2548     return NULL;
2549   if (BE (start == 0 && end == 0, 0))
2550     {
2551       postorder (elem, free_tree, NULL);
2552       return NULL;
2553     }
2554 
2555   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2556   if (BE (start > 0, 0))
2557     {
2558       tree = elem;
2559       for (i = 2; i <= start; ++i)
2560 	{
2561 	  elem = duplicate_tree (elem, dfa);
2562 	  tree = create_tree (dfa, tree, elem, CONCAT);
2563 	  if (BE (elem == NULL || tree == NULL, 0))
2564 	    goto parse_dup_op_espace;
2565 	}
2566 
2567       if (start == end)
2568 	return tree;
2569 
2570       /* Duplicate ELEM before it is marked optional.  */
2571       elem = duplicate_tree (elem, dfa);
2572       old_tree = tree;
2573     }
2574   else
2575     old_tree = NULL;
2576 
2577   if (elem->token.type == SUBEXP)
2578     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2579 
2580   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2581   if (BE (tree == NULL, 0))
2582     goto parse_dup_op_espace;
2583 
2584   /* This loop is actually executed only when end != -1,
2585      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2586      already created the start+1-th copy.  */
2587   for (i = start + 2; i <= end; ++i)
2588     {
2589       elem = duplicate_tree (elem, dfa);
2590       tree = create_tree (dfa, tree, elem, CONCAT);
2591       if (BE (elem == NULL || tree == NULL, 0))
2592 	goto parse_dup_op_espace;
2593 
2594       tree = create_tree (dfa, tree, NULL, OP_ALT);
2595       if (BE (tree == NULL, 0))
2596 	goto parse_dup_op_espace;
2597     }
2598 
2599   if (old_tree)
2600     tree = create_tree (dfa, old_tree, tree, CONCAT);
2601 
2602   return tree;
2603 
2604  parse_dup_op_espace:
2605   *err = REG_ESPACE;
2606   return NULL;
2607 }
2608 
2609 /* Size of the names for collating symbol/equivalence_class/character_class.
2610    I'm not sure, but maybe enough.  */
2611 #define BRACKET_NAME_BUF_SIZE 32
2612 
2613 #ifndef _LIBC
2614   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2615      Build the range expression which starts from START_ELEM, and ends
2616      at END_ELEM.  The result are written to MBCSET and SBCSET.
2617      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2618      mbcset->range_ends, is a pointer argument sinse we may
2619      update it.  */
2620 
2621 static reg_errcode_t
2622 internal_function
2623 # ifdef RE_ENABLE_I18N
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,int * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2624 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2625 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2626 # else /* not RE_ENABLE_I18N */
2627 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2628 		 bracket_elem_t *end_elem)
2629 # endif /* not RE_ENABLE_I18N */
2630 {
2631   unsigned int start_ch, end_ch;
2632   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2633   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2634 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2635 	  0))
2636     return REG_ERANGE;
2637 
2638   /* We can handle no multi character collating elements without libc
2639      support.  */
2640   if (BE ((start_elem->type == COLL_SYM
2641 	   && strlen ((char *) start_elem->opr.name) > 1)
2642 	  || (end_elem->type == COLL_SYM
2643 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2644     return REG_ECOLLATE;
2645 
2646 # ifdef RE_ENABLE_I18N
2647   {
2648     wchar_t wc;
2649     wint_t start_wc;
2650     wint_t end_wc;
2651     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2652 
2653     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2654 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2655 		   : 0));
2656     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2657 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2658 		 : 0));
2659     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2660 		? __btowc (start_ch) : start_elem->opr.wch);
2661     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2662 	      ? __btowc (end_ch) : end_elem->opr.wch);
2663     if (start_wc == WEOF || end_wc == WEOF)
2664       return REG_ECOLLATE;
2665     cmp_buf[0] = start_wc;
2666     cmp_buf[4] = end_wc;
2667     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2668       return REG_ERANGE;
2669 
2670     /* Got valid collation sequence values, add them as a new entry.
2671        However, for !_LIBC we have no collation elements: if the
2672        character set is single byte, the single byte character set
2673        that we build below suffices.  parse_bracket_exp passes
2674        no MBCSET if dfa->mb_cur_max == 1.  */
2675     if (mbcset)
2676       {
2677 	/* Check the space of the arrays.  */
2678 	if (BE (*range_alloc == mbcset->nranges, 0))
2679 	  {
2680 	    /* There is not enough space, need realloc.  */
2681 	    wchar_t *new_array_start, *new_array_end;
2682 	    int new_nranges;
2683 
2684 	    /* +1 in case of mbcset->nranges is 0.  */
2685 	    new_nranges = 2 * mbcset->nranges + 1;
2686 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2687 	       are NULL if *range_alloc == 0.  */
2688 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2689 					  new_nranges);
2690 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2691 					new_nranges);
2692 
2693 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2694 	      return REG_ESPACE;
2695 
2696 	    mbcset->range_starts = new_array_start;
2697 	    mbcset->range_ends = new_array_end;
2698 	    *range_alloc = new_nranges;
2699 	  }
2700 
2701 	mbcset->range_starts[mbcset->nranges] = start_wc;
2702 	mbcset->range_ends[mbcset->nranges++] = end_wc;
2703       }
2704 
2705     /* Build the table for single byte characters.  */
2706     for (wc = 0; wc < SBC_MAX; ++wc)
2707       {
2708 	cmp_buf[2] = wc;
2709 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2710 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2711 	  bitset_set (sbcset, wc);
2712       }
2713   }
2714 # else /* not RE_ENABLE_I18N */
2715   {
2716     unsigned int ch;
2717     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2718 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2719 		   : 0));
2720     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2721 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2722 		 : 0));
2723     if (start_ch > end_ch)
2724       return REG_ERANGE;
2725     /* Build the table for single byte characters.  */
2726     for (ch = 0; ch < SBC_MAX; ++ch)
2727       if (start_ch <= ch  && ch <= end_ch)
2728 	bitset_set (sbcset, ch);
2729   }
2730 # endif /* not RE_ENABLE_I18N */
2731   return REG_NOERROR;
2732 }
2733 #endif /* not _LIBC */
2734 
2735 #ifndef _LIBC
2736 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2737    Build the collating element which is represented by NAME.
2738    The result are written to MBCSET and SBCSET.
2739    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2740    pointer argument since we may update it.  */
2741 
2742 static reg_errcode_t
2743 internal_function
2744 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,int * coll_sym_alloc,const unsigned char * name)2745 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2746 			int *coll_sym_alloc, const unsigned char *name)
2747 # else /* not RE_ENABLE_I18N */
2748 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2749 # endif /* not RE_ENABLE_I18N */
2750 {
2751   size_t name_len = strlen ((const char *) name);
2752   if (BE (name_len != 1, 0))
2753     return REG_ECOLLATE;
2754   else
2755     {
2756       bitset_set (sbcset, name[0]);
2757       return REG_NOERROR;
2758     }
2759 }
2760 #endif /* not _LIBC */
2761 
2762 /* This function parse bracket expression like "[abc]", "[a-c]",
2763    "[[.a-a.]]" etc.  */
2764 
2765 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2766 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2767 		   reg_syntax_t syntax, reg_errcode_t *err)
2768 {
2769 #ifdef _LIBC
2770   const unsigned char *collseqmb;
2771   const char *collseqwc;
2772   uint32_t nrules;
2773   int32_t table_size;
2774   const int32_t *symb_table;
2775   const unsigned char *extra;
2776 
2777   /* Local function for parse_bracket_exp used in _LIBC environement.
2778      Seek the collating symbol entry correspondings to NAME.
2779      Return the index of the symbol in the SYMB_TABLE,
2780      or -1 if not found.  */
2781 
2782   auto inline int32_t
2783   __attribute ((always_inline))
2784   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2785     {
2786       int32_t elem;
2787 
2788       for (elem = 0; elem < table_size; elem++)
2789 	if (symb_table[2 * elem] != 0)
2790 	  {
2791 	    int32_t idx = symb_table[2 * elem + 1];
2792 	    /* Skip the name of collating element name.  */
2793 	    idx += 1 + extra[idx];
2794 	    if (/* Compare the length of the name.  */
2795 		name_len == extra[idx]
2796 		/* Compare the name.  */
2797 		&& memcmp (name, &extra[idx + 1], name_len) == 0)
2798 	      /* Yep, this is the entry.  */
2799 	      return elem;
2800 	  }
2801       return -1;
2802     }
2803 
2804   /* Local function for parse_bracket_exp used in _LIBC environment.
2805      Look up the collation sequence value of BR_ELEM.
2806      Return the value if succeeded, UINT_MAX otherwise.  */
2807 
2808   auto inline unsigned int
2809   __attribute ((always_inline))
2810   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2811     {
2812       if (br_elem->type == SB_CHAR)
2813 	{
2814 	  /*
2815 	  if (MB_CUR_MAX == 1)
2816 	  */
2817 	  if (nrules == 0)
2818 	    return collseqmb[br_elem->opr.ch];
2819 	  else
2820 	    {
2821 	      wint_t wc = __btowc (br_elem->opr.ch);
2822 	      return __collseq_table_lookup (collseqwc, wc);
2823 	    }
2824 	}
2825       else if (br_elem->type == MB_CHAR)
2826 	{
2827 	  if (nrules != 0)
2828 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2829 	}
2830       else if (br_elem->type == COLL_SYM)
2831 	{
2832 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2833 	  if (nrules != 0)
2834 	    {
2835 	      int32_t elem, idx;
2836 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2837 						  sym_name_len);
2838 	      if (elem != -1)
2839 		{
2840 		  /* We found the entry.  */
2841 		  idx = symb_table[2 * elem + 1];
2842 		  /* Skip the name of collating element name.  */
2843 		  idx += 1 + extra[idx];
2844 		  /* Skip the byte sequence of the collating element.  */
2845 		  idx += 1 + extra[idx];
2846 		  /* Adjust for the alignment.  */
2847 		  idx = (idx + 3) & ~3;
2848 		  /* Skip the multibyte collation sequence value.  */
2849 		  idx += sizeof (unsigned int);
2850 		  /* Skip the wide char sequence of the collating element.  */
2851 		  idx += sizeof (unsigned int) *
2852 		    (1 + *(unsigned int *) (extra + idx));
2853 		  /* Return the collation sequence value.  */
2854 		  return *(unsigned int *) (extra + idx);
2855 		}
2856 	      else if (sym_name_len == 1)
2857 		{
2858 		  /* No valid character.  Match it as a single byte
2859 		     character.  */
2860 		  return collseqmb[br_elem->opr.name[0]];
2861 		}
2862 	    }
2863 	  else if (sym_name_len == 1)
2864 	    return collseqmb[br_elem->opr.name[0]];
2865 	}
2866       return UINT_MAX;
2867     }
2868 
2869   /* Local function for parse_bracket_exp used in _LIBC environement.
2870      Build the range expression which starts from START_ELEM, and ends
2871      at END_ELEM.  The result are written to MBCSET and SBCSET.
2872      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2873      mbcset->range_ends, is a pointer argument sinse we may
2874      update it.  */
2875 
2876   auto inline reg_errcode_t
2877   __attribute ((always_inline))
2878   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2879 		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2880     {
2881       unsigned int ch;
2882       uint32_t start_collseq;
2883       uint32_t end_collseq;
2884 
2885       /* Equivalence Classes and Character Classes can't be a range
2886 	 start/end.  */
2887       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2888 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2889 	      0))
2890 	return REG_ERANGE;
2891 
2892       start_collseq = lookup_collation_sequence_value (start_elem);
2893       end_collseq = lookup_collation_sequence_value (end_elem);
2894       /* Check start/end collation sequence values.  */
2895       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2896 	return REG_ECOLLATE;
2897       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2898 	return REG_ERANGE;
2899 
2900       /* Got valid collation sequence values, add them as a new entry.
2901 	 However, if we have no collation elements, and the character set
2902 	 is single byte, the single byte character set that we
2903 	 build below suffices. */
2904       if (nrules > 0 || dfa->mb_cur_max > 1)
2905 	{
2906 	  /* Check the space of the arrays.  */
2907 	  if (BE (*range_alloc == mbcset->nranges, 0))
2908 	    {
2909 	      /* There is not enough space, need realloc.  */
2910 	      uint32_t *new_array_start;
2911 	      uint32_t *new_array_end;
2912 	      int new_nranges;
2913 
2914 	      /* +1 in case of mbcset->nranges is 0.  */
2915 	      new_nranges = 2 * mbcset->nranges + 1;
2916 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2917 					    new_nranges);
2918 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2919 					  new_nranges);
2920 
2921 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2922 		return REG_ESPACE;
2923 
2924 	      mbcset->range_starts = new_array_start;
2925 	      mbcset->range_ends = new_array_end;
2926 	      *range_alloc = new_nranges;
2927 	    }
2928 
2929 	  mbcset->range_starts[mbcset->nranges] = start_collseq;
2930 	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
2931 	}
2932 
2933       /* Build the table for single byte characters.  */
2934       for (ch = 0; ch < SBC_MAX; ch++)
2935 	{
2936 	  uint32_t ch_collseq;
2937 	  /*
2938 	  if (MB_CUR_MAX == 1)
2939 	  */
2940 	  if (nrules == 0)
2941 	    ch_collseq = collseqmb[ch];
2942 	  else
2943 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2944 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2945 	    bitset_set (sbcset, ch);
2946 	}
2947       return REG_NOERROR;
2948     }
2949 
2950   /* Local function for parse_bracket_exp used in _LIBC environement.
2951      Build the collating element which is represented by NAME.
2952      The result are written to MBCSET and SBCSET.
2953      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2954      pointer argument sinse we may update it.  */
2955 
2956   auto inline reg_errcode_t
2957   __attribute ((always_inline))
2958   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2959 			  int *coll_sym_alloc, const unsigned char *name)
2960     {
2961       int32_t elem, idx;
2962       size_t name_len = strlen ((const char *) name);
2963       if (nrules != 0)
2964 	{
2965 	  elem = seek_collating_symbol_entry (name, name_len);
2966 	  if (elem != -1)
2967 	    {
2968 	      /* We found the entry.  */
2969 	      idx = symb_table[2 * elem + 1];
2970 	      /* Skip the name of collating element name.  */
2971 	      idx += 1 + extra[idx];
2972 	    }
2973 	  else if (name_len == 1)
2974 	    {
2975 	      /* No valid character, treat it as a normal
2976 		 character.  */
2977 	      bitset_set (sbcset, name[0]);
2978 	      return REG_NOERROR;
2979 	    }
2980 	  else
2981 	    return REG_ECOLLATE;
2982 
2983 	  /* Got valid collation sequence, add it as a new entry.  */
2984 	  /* Check the space of the arrays.  */
2985 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2986 	    {
2987 	      /* Not enough, realloc it.  */
2988 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
2989 	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2990 	      /* Use realloc since mbcset->coll_syms is NULL
2991 		 if *alloc == 0.  */
2992 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2993 						   new_coll_sym_alloc);
2994 	      if (BE (new_coll_syms == NULL, 0))
2995 		return REG_ESPACE;
2996 	      mbcset->coll_syms = new_coll_syms;
2997 	      *coll_sym_alloc = new_coll_sym_alloc;
2998 	    }
2999 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3000 	  return REG_NOERROR;
3001 	}
3002       else
3003 	{
3004 	  if (BE (name_len != 1, 0))
3005 	    return REG_ECOLLATE;
3006 	  else
3007 	    {
3008 	      bitset_set (sbcset, name[0]);
3009 	      return REG_NOERROR;
3010 	    }
3011 	}
3012     }
3013 #endif
3014 
3015   re_token_t br_token;
3016   re_bitset_ptr_t sbcset;
3017 #ifdef RE_ENABLE_I18N
3018   re_charset_t *mbcset;
3019   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3020   int equiv_class_alloc = 0, char_class_alloc = 0;
3021 #endif /* not RE_ENABLE_I18N */
3022   int non_match = 0;
3023   bin_tree_t *work_tree;
3024   int token_len;
3025   int first_round = 1;
3026 #ifdef _LIBC
3027   collseqmb = (const unsigned char *)
3028     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3029   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3030   if (nrules)
3031     {
3032       /*
3033       if (MB_CUR_MAX > 1)
3034       */
3035       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3036       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3037       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3038 						  _NL_COLLATE_SYMB_TABLEMB);
3039       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3040 						   _NL_COLLATE_SYMB_EXTRAMB);
3041     }
3042 #endif
3043   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3044 #ifdef RE_ENABLE_I18N
3045   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3046 #endif /* RE_ENABLE_I18N */
3047 #ifdef RE_ENABLE_I18N
3048   if (BE (sbcset == NULL || mbcset == NULL, 0))
3049 #else
3050   if (BE (sbcset == NULL, 0))
3051 #endif /* RE_ENABLE_I18N */
3052     {
3053       re_free (sbcset);
3054 #ifdef RE_ENABLE_I18N
3055       re_free (mbcset);
3056 #endif
3057       *err = REG_ESPACE;
3058       return NULL;
3059     }
3060 
3061   token_len = peek_token_bracket (token, regexp, syntax);
3062   if (BE (token->type == END_OF_RE, 0))
3063     {
3064       *err = REG_BADPAT;
3065       goto parse_bracket_exp_free_return;
3066     }
3067   if (token->type == OP_NON_MATCH_LIST)
3068     {
3069 #ifdef RE_ENABLE_I18N
3070       mbcset->non_match = 1;
3071 #endif /* not RE_ENABLE_I18N */
3072       non_match = 1;
3073       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3074 	bitset_set (sbcset, '\n');
3075       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3076       token_len = peek_token_bracket (token, regexp, syntax);
3077       if (BE (token->type == END_OF_RE, 0))
3078 	{
3079 	  *err = REG_BADPAT;
3080 	  goto parse_bracket_exp_free_return;
3081 	}
3082     }
3083 
3084   /* We treat the first ']' as a normal character.  */
3085   if (token->type == OP_CLOSE_BRACKET)
3086     token->type = CHARACTER;
3087 
3088   while (1)
3089     {
3090       bracket_elem_t start_elem, end_elem;
3091       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3092       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3093       reg_errcode_t ret;
3094       int token_len2 = 0, is_range_exp = 0;
3095       re_token_t token2;
3096 
3097       start_elem.opr.name = start_name_buf;
3098       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3099 				   syntax, first_round);
3100       if (BE (ret != REG_NOERROR, 0))
3101 	{
3102 	  *err = ret;
3103 	  goto parse_bracket_exp_free_return;
3104 	}
3105       first_round = 0;
3106 
3107       /* Get information about the next token.  We need it in any case.  */
3108       token_len = peek_token_bracket (token, regexp, syntax);
3109 
3110       /* Do not check for ranges if we know they are not allowed.  */
3111       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3112 	{
3113 	  if (BE (token->type == END_OF_RE, 0))
3114 	    {
3115 	      *err = REG_EBRACK;
3116 	      goto parse_bracket_exp_free_return;
3117 	    }
3118 	  if (token->type == OP_CHARSET_RANGE)
3119 	    {
3120 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3121 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3122 	      if (BE (token2.type == END_OF_RE, 0))
3123 		{
3124 		  *err = REG_EBRACK;
3125 		  goto parse_bracket_exp_free_return;
3126 		}
3127 	      if (token2.type == OP_CLOSE_BRACKET)
3128 		{
3129 		  /* We treat the last '-' as a normal character.  */
3130 		  re_string_skip_bytes (regexp, -token_len);
3131 		  token->type = CHARACTER;
3132 		}
3133 	      else
3134 		is_range_exp = 1;
3135 	    }
3136 	}
3137 
3138       if (is_range_exp == 1)
3139 	{
3140 	  end_elem.opr.name = end_name_buf;
3141 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3142 				       dfa, syntax, 1);
3143 	  if (BE (ret != REG_NOERROR, 0))
3144 	    {
3145 	      *err = ret;
3146 	      goto parse_bracket_exp_free_return;
3147 	    }
3148 
3149 	  token_len = peek_token_bracket (token, regexp, syntax);
3150 
3151 #ifdef _LIBC
3152 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3153 				  &start_elem, &end_elem);
3154 #else
3155 # ifdef RE_ENABLE_I18N
3156 	  *err = build_range_exp (sbcset,
3157 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3158 				  &range_alloc, &start_elem, &end_elem);
3159 # else
3160 	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3161 # endif
3162 #endif /* RE_ENABLE_I18N */
3163 	  if (BE (*err != REG_NOERROR, 0))
3164 	    goto parse_bracket_exp_free_return;
3165 	}
3166       else
3167 	{
3168 	  switch (start_elem.type)
3169 	    {
3170 	    case SB_CHAR:
3171 	      bitset_set (sbcset, start_elem.opr.ch);
3172 	      break;
3173 #ifdef RE_ENABLE_I18N
3174 	    case MB_CHAR:
3175 	      /* Check whether the array has enough space.  */
3176 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3177 		{
3178 		  wchar_t *new_mbchars;
3179 		  /* Not enough, realloc it.  */
3180 		  /* +1 in case of mbcset->nmbchars is 0.  */
3181 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3182 		  /* Use realloc since array is NULL if *alloc == 0.  */
3183 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3184 					    mbchar_alloc);
3185 		  if (BE (new_mbchars == NULL, 0))
3186 		    goto parse_bracket_exp_espace;
3187 		  mbcset->mbchars = new_mbchars;
3188 		}
3189 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3190 	      break;
3191 #endif /* RE_ENABLE_I18N */
3192 	    case EQUIV_CLASS:
3193 	      *err = build_equiv_class (sbcset,
3194 #ifdef RE_ENABLE_I18N
3195 					mbcset, &equiv_class_alloc,
3196 #endif /* RE_ENABLE_I18N */
3197 					start_elem.opr.name);
3198 	      if (BE (*err != REG_NOERROR, 0))
3199 		goto parse_bracket_exp_free_return;
3200 	      break;
3201 	    case COLL_SYM:
3202 	      *err = build_collating_symbol (sbcset,
3203 #ifdef RE_ENABLE_I18N
3204 					     mbcset, &coll_sym_alloc,
3205 #endif /* RE_ENABLE_I18N */
3206 					     start_elem.opr.name);
3207 	      if (BE (*err != REG_NOERROR, 0))
3208 		goto parse_bracket_exp_free_return;
3209 	      break;
3210 	    case CHAR_CLASS:
3211 	      *err = build_charclass (regexp->trans, sbcset,
3212 #ifdef RE_ENABLE_I18N
3213 				      mbcset, &char_class_alloc,
3214 #endif /* RE_ENABLE_I18N */
3215 				      start_elem.opr.name, syntax);
3216 	      if (BE (*err != REG_NOERROR, 0))
3217 	       goto parse_bracket_exp_free_return;
3218 	      break;
3219 	    default:
3220 	      assert (0);
3221 	      break;
3222 	    }
3223 	}
3224       if (BE (token->type == END_OF_RE, 0))
3225 	{
3226 	  *err = REG_EBRACK;
3227 	  goto parse_bracket_exp_free_return;
3228 	}
3229       if (token->type == OP_CLOSE_BRACKET)
3230 	break;
3231     }
3232 
3233   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3234 
3235   /* If it is non-matching list.  */
3236   if (non_match)
3237     bitset_not (sbcset);
3238 
3239 #ifdef RE_ENABLE_I18N
3240   /* Ensure only single byte characters are set.  */
3241   if (dfa->mb_cur_max > 1)
3242     bitset_mask (sbcset, dfa->sb_char);
3243 
3244   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3245       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3246 						     || mbcset->non_match)))
3247     {
3248       bin_tree_t *mbc_tree;
3249       int sbc_idx;
3250       /* Build a tree for complex bracket.  */
3251       dfa->has_mb_node = 1;
3252       br_token.type = COMPLEX_BRACKET;
3253       br_token.opr.mbcset = mbcset;
3254       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3255       if (BE (mbc_tree == NULL, 0))
3256 	goto parse_bracket_exp_espace;
3257       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3258 	if (sbcset[sbc_idx])
3259 	  break;
3260       /* If there are no bits set in sbcset, there is no point
3261 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3262       if (sbc_idx < BITSET_WORDS)
3263 	{
3264 	  /* Build a tree for simple bracket.  */
3265 	  br_token.type = SIMPLE_BRACKET;
3266 	  br_token.opr.sbcset = sbcset;
3267 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3268 	  if (BE (work_tree == NULL, 0))
3269 	    goto parse_bracket_exp_espace;
3270 
3271 	  /* Then join them by ALT node.  */
3272 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3273 	  if (BE (work_tree == NULL, 0))
3274 	    goto parse_bracket_exp_espace;
3275 	}
3276       else
3277 	{
3278 	  re_free (sbcset);
3279 	  work_tree = mbc_tree;
3280 	}
3281     }
3282   else
3283 #endif /* not RE_ENABLE_I18N */
3284     {
3285 #ifdef RE_ENABLE_I18N
3286       free_charset (mbcset);
3287 #endif
3288       /* Build a tree for simple bracket.  */
3289       br_token.type = SIMPLE_BRACKET;
3290       br_token.opr.sbcset = sbcset;
3291       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3292       if (BE (work_tree == NULL, 0))
3293 	goto parse_bracket_exp_espace;
3294     }
3295   return work_tree;
3296 
3297  parse_bracket_exp_espace:
3298   *err = REG_ESPACE;
3299  parse_bracket_exp_free_return:
3300   re_free (sbcset);
3301 #ifdef RE_ENABLE_I18N
3302   free_charset (mbcset);
3303 #endif /* RE_ENABLE_I18N */
3304   return NULL;
3305 }
3306 
3307 /* Parse an element in the bracket expression.  */
3308 
3309 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,int accept_hyphen)3310 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3311 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3312 		       reg_syntax_t syntax, int accept_hyphen)
3313 {
3314 #ifdef RE_ENABLE_I18N
3315   int cur_char_size;
3316   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3317   if (cur_char_size > 1)
3318     {
3319       elem->type = MB_CHAR;
3320       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3321       re_string_skip_bytes (regexp, cur_char_size);
3322       return REG_NOERROR;
3323     }
3324 #endif /* RE_ENABLE_I18N */
3325   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3326   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3327       || token->type == OP_OPEN_EQUIV_CLASS)
3328     return parse_bracket_symbol (elem, regexp, token);
3329   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3330     {
3331       /* A '-' must only appear as anything but a range indicator before
3332 	 the closing bracket.  Everything else is an error.  */
3333       re_token_t token2;
3334       (void) peek_token_bracket (&token2, regexp, syntax);
3335       if (token2.type != OP_CLOSE_BRACKET)
3336 	/* The actual error value is not standardized since this whole
3337 	   case is undefined.  But ERANGE makes good sense.  */
3338 	return REG_ERANGE;
3339     }
3340   elem->type = SB_CHAR;
3341   elem->opr.ch = token->opr.c;
3342   return REG_NOERROR;
3343 }
3344 
3345 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3346    such as [:<character_class>:], [.<collating_element>.], and
3347    [=<equivalent_class>=].  */
3348 
3349 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3350 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3351 		      re_token_t *token)
3352 {
3353   unsigned char ch, delim = token->opr.c;
3354   int i = 0;
3355   if (re_string_eoi(regexp))
3356     return REG_EBRACK;
3357   for (;; ++i)
3358     {
3359       if (i >= BRACKET_NAME_BUF_SIZE)
3360 	return REG_EBRACK;
3361       if (token->type == OP_OPEN_CHAR_CLASS)
3362 	ch = re_string_fetch_byte_case (regexp);
3363       else
3364 	ch = re_string_fetch_byte (regexp);
3365       if (re_string_eoi(regexp))
3366 	return REG_EBRACK;
3367       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3368 	break;
3369       elem->opr.name[i] = ch;
3370     }
3371   re_string_skip_bytes (regexp, 1);
3372   elem->opr.name[i] = '\0';
3373   switch (token->type)
3374     {
3375     case OP_OPEN_COLL_ELEM:
3376       elem->type = COLL_SYM;
3377       break;
3378     case OP_OPEN_EQUIV_CLASS:
3379       elem->type = EQUIV_CLASS;
3380       break;
3381     case OP_OPEN_CHAR_CLASS:
3382       elem->type = CHAR_CLASS;
3383       break;
3384     default:
3385       break;
3386     }
3387   return REG_NOERROR;
3388 }
3389 
3390   /* Helper function for parse_bracket_exp.
3391      Build the equivalence class which is represented by NAME.
3392      The result are written to MBCSET and SBCSET.
3393      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3394      is a pointer argument sinse we may update it.  */
3395 
3396 static reg_errcode_t
3397 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,int * equiv_class_alloc,const unsigned char * name)3398 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3399 		   int *equiv_class_alloc, const unsigned char *name)
3400 #else /* not RE_ENABLE_I18N */
3401 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3402 #endif /* not RE_ENABLE_I18N */
3403 {
3404 #ifdef _LIBC
3405   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3406   if (nrules != 0)
3407     {
3408       const int32_t *table, *indirect;
3409       const unsigned char *weights, *extra, *cp;
3410       unsigned char char_buf[2];
3411       int32_t idx1, idx2;
3412       unsigned int ch;
3413       size_t len;
3414       /* This #include defines a local function!  */
3415 # include <locale/weight.h>
3416       /* Calculate the index for equivalence class.  */
3417       cp = name;
3418       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3419       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3420 					       _NL_COLLATE_WEIGHTMB);
3421       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3422 						   _NL_COLLATE_EXTRAMB);
3423       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3424 						_NL_COLLATE_INDIRECTMB);
3425       idx1 = findidx (&cp, -1);
3426       if (BE (idx1 == 0 || *cp != '\0', 0))
3427 	/* This isn't a valid character.  */
3428 	return REG_ECOLLATE;
3429 
3430       /* Build single byte matcing table for this equivalence class.  */
3431       len = weights[idx1 & 0xffffff];
3432       for (ch = 0; ch < SBC_MAX; ++ch)
3433 	{
3434 	  char_buf[0] = ch;
3435 	  cp = char_buf;
3436 	  idx2 = findidx (&cp, 1);
3437 /*
3438 	  idx2 = table[ch];
3439 */
3440 	  if (idx2 == 0)
3441 	    /* This isn't a valid character.  */
3442 	    continue;
3443 	  /* Compare only if the length matches and the collation rule
3444 	     index is the same.  */
3445 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3446 	    {
3447 	      int cnt = 0;
3448 
3449 	      while (cnt <= len &&
3450 		     weights[(idx1 & 0xffffff) + 1 + cnt]
3451 		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3452 		++cnt;
3453 
3454 	      if (cnt > len)
3455 		bitset_set (sbcset, ch);
3456 	    }
3457 	}
3458       /* Check whether the array has enough space.  */
3459       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3460 	{
3461 	  /* Not enough, realloc it.  */
3462 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3463 	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3464 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3465 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3466 						   int32_t,
3467 						   new_equiv_class_alloc);
3468 	  if (BE (new_equiv_classes == NULL, 0))
3469 	    return REG_ESPACE;
3470 	  mbcset->equiv_classes = new_equiv_classes;
3471 	  *equiv_class_alloc = new_equiv_class_alloc;
3472 	}
3473       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3474     }
3475   else
3476 #endif /* _LIBC */
3477     {
3478       if (BE (strlen ((const char *) name) != 1, 0))
3479 	return REG_ECOLLATE;
3480       bitset_set (sbcset, *name);
3481     }
3482   return REG_NOERROR;
3483 }
3484 
3485   /* Helper function for parse_bracket_exp.
3486      Build the character class which is represented by NAME.
3487      The result are written to MBCSET and SBCSET.
3488      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3489      is a pointer argument sinse we may update it.  */
3490 
3491 static reg_errcode_t
3492 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,int * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3493 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3494 		 re_charset_t *mbcset, int *char_class_alloc,
3495 		 const unsigned char *class_name, reg_syntax_t syntax)
3496 #else /* not RE_ENABLE_I18N */
3497 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3498 		 const unsigned char *class_name, reg_syntax_t syntax)
3499 #endif /* not RE_ENABLE_I18N */
3500 {
3501   int i;
3502   const char *name = (const char *) class_name;
3503 
3504   /* In case of REG_ICASE "upper" and "lower" match the both of
3505      upper and lower cases.  */
3506   if ((syntax & RE_ICASE)
3507       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3508     name = "alpha";
3509 
3510 #ifdef RE_ENABLE_I18N
3511   /* Check the space of the arrays.  */
3512   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3513     {
3514       /* Not enough, realloc it.  */
3515       /* +1 in case of mbcset->nchar_classes is 0.  */
3516       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3517       /* Use realloc since array is NULL if *alloc == 0.  */
3518       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3519 					       new_char_class_alloc);
3520       if (BE (new_char_classes == NULL, 0))
3521 	return REG_ESPACE;
3522       mbcset->char_classes = new_char_classes;
3523       *char_class_alloc = new_char_class_alloc;
3524     }
3525   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3526 #endif /* RE_ENABLE_I18N */
3527 
3528 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3529   do {						\
3530     if (BE (trans != NULL, 0))			\
3531       {						\
3532 	for (i = 0; i < SBC_MAX; ++i)		\
3533   	  if (ctype_func (i))			\
3534 	    bitset_set (sbcset, trans[i]);	\
3535       }						\
3536     else					\
3537       {						\
3538 	for (i = 0; i < SBC_MAX; ++i)		\
3539   	  if (ctype_func (i))			\
3540 	    bitset_set (sbcset, i);		\
3541       }						\
3542   } while (0)
3543 
3544   if (strcmp (name, "alnum") == 0)
3545     BUILD_CHARCLASS_LOOP (isalnum);
3546   else if (strcmp (name, "cntrl") == 0)
3547     BUILD_CHARCLASS_LOOP (iscntrl);
3548   else if (strcmp (name, "lower") == 0)
3549     BUILD_CHARCLASS_LOOP (islower);
3550   else if (strcmp (name, "space") == 0)
3551     BUILD_CHARCLASS_LOOP (isspace);
3552   else if (strcmp (name, "alpha") == 0)
3553     BUILD_CHARCLASS_LOOP (isalpha);
3554   else if (strcmp (name, "digit") == 0)
3555     BUILD_CHARCLASS_LOOP (isdigit);
3556   else if (strcmp (name, "print") == 0)
3557     BUILD_CHARCLASS_LOOP (isprint);
3558   else if (strcmp (name, "upper") == 0)
3559     BUILD_CHARCLASS_LOOP (isupper);
3560   else if (strcmp (name, "blank") == 0)
3561     BUILD_CHARCLASS_LOOP (isblank);
3562   else if (strcmp (name, "graph") == 0)
3563     BUILD_CHARCLASS_LOOP (isgraph);
3564   else if (strcmp (name, "punct") == 0)
3565     BUILD_CHARCLASS_LOOP (ispunct);
3566   else if (strcmp (name, "xdigit") == 0)
3567     BUILD_CHARCLASS_LOOP (isxdigit);
3568   else
3569     return REG_ECTYPE;
3570 
3571   return REG_NOERROR;
3572 }
3573 
3574 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,int non_match,reg_errcode_t * err)3575 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3576 		    const unsigned char *class_name,
3577 		    const unsigned char *extra, int non_match,
3578 		    reg_errcode_t *err)
3579 {
3580   re_bitset_ptr_t sbcset;
3581 #ifdef RE_ENABLE_I18N
3582   re_charset_t *mbcset;
3583   int alloc = 0;
3584 #endif /* not RE_ENABLE_I18N */
3585   reg_errcode_t ret;
3586   re_token_t br_token;
3587   bin_tree_t *tree;
3588 
3589   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3590 #ifdef RE_ENABLE_I18N
3591   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3592 #endif /* RE_ENABLE_I18N */
3593 
3594 #ifdef RE_ENABLE_I18N
3595   if (BE (sbcset == NULL || mbcset == NULL, 0))
3596 #else /* not RE_ENABLE_I18N */
3597   if (BE (sbcset == NULL, 0))
3598 #endif /* not RE_ENABLE_I18N */
3599     {
3600       *err = REG_ESPACE;
3601       return NULL;
3602     }
3603 
3604   if (non_match)
3605     {
3606 #ifdef RE_ENABLE_I18N
3607       mbcset->non_match = 1;
3608 #endif /* not RE_ENABLE_I18N */
3609     }
3610 
3611   /* We don't care the syntax in this case.  */
3612   ret = build_charclass (trans, sbcset,
3613 #ifdef RE_ENABLE_I18N
3614 			 mbcset, &alloc,
3615 #endif /* RE_ENABLE_I18N */
3616 			 class_name, 0);
3617 
3618   if (BE (ret != REG_NOERROR, 0))
3619     {
3620       re_free (sbcset);
3621 #ifdef RE_ENABLE_I18N
3622       free_charset (mbcset);
3623 #endif /* RE_ENABLE_I18N */
3624       *err = ret;
3625       return NULL;
3626     }
3627   /* \w match '_' also.  */
3628   for (; *extra; extra++)
3629     bitset_set (sbcset, *extra);
3630 
3631   /* If it is non-matching list.  */
3632   if (non_match)
3633     bitset_not (sbcset);
3634 
3635 #ifdef RE_ENABLE_I18N
3636   /* Ensure only single byte characters are set.  */
3637   if (dfa->mb_cur_max > 1)
3638     bitset_mask (sbcset, dfa->sb_char);
3639 #endif
3640 
3641   /* Build a tree for simple bracket.  */
3642   br_token.type = SIMPLE_BRACKET;
3643   br_token.opr.sbcset = sbcset;
3644   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3645   if (BE (tree == NULL, 0))
3646     goto build_word_op_espace;
3647 
3648 #ifdef RE_ENABLE_I18N
3649   if (dfa->mb_cur_max > 1)
3650     {
3651       bin_tree_t *mbc_tree;
3652       /* Build a tree for complex bracket.  */
3653       br_token.type = COMPLEX_BRACKET;
3654       br_token.opr.mbcset = mbcset;
3655       dfa->has_mb_node = 1;
3656       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3657       if (BE (mbc_tree == NULL, 0))
3658 	goto build_word_op_espace;
3659       /* Then join them by ALT node.  */
3660       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3661       if (BE (mbc_tree != NULL, 1))
3662 	return tree;
3663     }
3664   else
3665     {
3666       free_charset (mbcset);
3667       return tree;
3668     }
3669 #else /* not RE_ENABLE_I18N */
3670   return tree;
3671 #endif /* not RE_ENABLE_I18N */
3672 
3673  build_word_op_espace:
3674   re_free (sbcset);
3675 #ifdef RE_ENABLE_I18N
3676   free_charset (mbcset);
3677 #endif /* RE_ENABLE_I18N */
3678   *err = REG_ESPACE;
3679   return NULL;
3680 }
3681 
3682 /* This is intended for the expressions like "a{1,3}".
3683    Fetch a number from `input', and return the number.
3684    Return -1, if the number field is empty like "{,1}".
3685    Return -2, If an error is occured.  */
3686 
3687 static int
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3688 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3689 {
3690   int num = -1;
3691   unsigned char c;
3692   while (1)
3693     {
3694       fetch_token (token, input, syntax);
3695       c = token->opr.c;
3696       if (BE (token->type == END_OF_RE, 0))
3697 	return -2;
3698       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3699 	break;
3700       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3701 	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3702       num = (num > RE_DUP_MAX) ? -2 : num;
3703     }
3704   return num;
3705 }
3706 
3707 #ifdef RE_ENABLE_I18N
3708 static void
free_charset(re_charset_t * cset)3709 free_charset (re_charset_t *cset)
3710 {
3711   re_free (cset->mbchars);
3712 # ifdef _LIBC
3713   re_free (cset->coll_syms);
3714   re_free (cset->equiv_classes);
3715   re_free (cset->range_starts);
3716   re_free (cset->range_ends);
3717 # endif
3718   re_free (cset->char_classes);
3719   re_free (cset);
3720 }
3721 #endif /* RE_ENABLE_I18N */
3722 
3723 /* Functions for binary tree operation.  */
3724 
3725 /* Create a tree node.  */
3726 
3727 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3728 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3729 	     re_token_type_t type)
3730 {
3731   re_token_t t;
3732   t.type = type;
3733   return create_token_tree (dfa, left, right, &t);
3734 }
3735 
3736 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3737 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3738 		   const re_token_t *token)
3739 {
3740   bin_tree_t *tree;
3741   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3742     {
3743       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3744 
3745       if (storage == NULL)
3746 	return NULL;
3747       storage->next = dfa->str_tree_storage;
3748       dfa->str_tree_storage = storage;
3749       dfa->str_tree_storage_idx = 0;
3750     }
3751   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3752 
3753   tree->parent = NULL;
3754   tree->left = left;
3755   tree->right = right;
3756   tree->token = *token;
3757   tree->token.duplicated = 0;
3758   tree->token.opt_subexp = 0;
3759   tree->first = NULL;
3760   tree->next = NULL;
3761   tree->node_idx = -1;
3762 
3763   if (left != NULL)
3764     left->parent = tree;
3765   if (right != NULL)
3766     right->parent = tree;
3767   return tree;
3768 }
3769 
3770 /* Mark the tree SRC as an optional subexpression.
3771    To be called from preorder or postorder.  */
3772 
3773 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3774 mark_opt_subexp (void *extra, bin_tree_t *node)
3775 {
3776   int idx = (int) (long) extra;
3777   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3778     node->token.opt_subexp = 1;
3779 
3780   return REG_NOERROR;
3781 }
3782 
3783 /* Free the allocated memory inside NODE. */
3784 
3785 static void
free_token(re_token_t * node)3786 free_token (re_token_t *node)
3787 {
3788 #ifdef RE_ENABLE_I18N
3789   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3790     free_charset (node->opr.mbcset);
3791   else
3792 #endif /* RE_ENABLE_I18N */
3793     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3794       re_free (node->opr.sbcset);
3795 }
3796 
3797 /* Worker function for tree walking.  Free the allocated memory inside NODE
3798    and its children. */
3799 
3800 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3801 free_tree (void *extra, bin_tree_t *node)
3802 {
3803   free_token (&node->token);
3804   return REG_NOERROR;
3805 }
3806 
3807 
3808 /* Duplicate the node SRC, and return new node.  This is a preorder
3809    visit similar to the one implemented by the generic visitor, but
3810    we need more infrastructure to maintain two parallel trees --- so,
3811    it's easier to duplicate.  */
3812 
3813 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3814 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3815 {
3816   const bin_tree_t *node;
3817   bin_tree_t *dup_root;
3818   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3819 
3820   for (node = root; ; )
3821     {
3822       /* Create a new tree and link it back to the current parent.  */
3823       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3824       if (*p_new == NULL)
3825 	return NULL;
3826       (*p_new)->parent = dup_node;
3827       (*p_new)->token.duplicated = 1;
3828       dup_node = *p_new;
3829 
3830       /* Go to the left node, or up and to the right.  */
3831       if (node->left)
3832 	{
3833 	  node = node->left;
3834 	  p_new = &dup_node->left;
3835 	}
3836       else
3837 	{
3838 	  const bin_tree_t *prev = NULL;
3839 	  while (node->right == prev || node->right == NULL)
3840 	    {
3841 	      prev = node;
3842 	      node = node->parent;
3843 	      dup_node = dup_node->parent;
3844 	      if (!node)
3845 		return dup_root;
3846 	    }
3847 	  node = node->right;
3848 	  p_new = &dup_node->right;
3849 	}
3850     }
3851 }
3852