This source file includes following definitions.
- re_compile_pattern
- weak_alias
- weak_alias
- weak_alias
- re_compile_fastmap_iter
- regcomp
- libc_hidden_def
- free_dfa_content
- regfree
- libc_hidden_def
- libc_freeres_fn
- re_compile_internal
- init_dfa
- init_word_char
- free_workarea_compile
- create_initial_state
- optimize_utf8
- analyze
- postorder
- preorder
- optimize_subexps
- lower_subexps
- lower_subexp
- calc_first
- calc_next
- link_nfa_nodes
- duplicate_node_closure
- search_duplicated_node
- duplicate_node
- calc_inveclosure
- calc_eclosure
- calc_eclosure_iter
- fetch_token
- peek_token
- peek_token_bracket
- parse
- parse_reg_exp
- parse_branch
- parse_expression
- parse_sub_exp
- parse_dup_op
- parse_byte
- build_range_exp
- build_collating_symbol
- seek_collating_symbol_entry
- lookup_collation_sequence_value
- build_range_exp
- build_collating_symbol
- parse_bracket_exp
- parse_bracket_element
- parse_bracket_symbol
- build_equiv_class
- build_charclass
- build_charclass_op
- fetch_number
- free_charset
- create_tree
- create_token_tree
- mark_opt_subexp
- free_token
- free_tree
- duplicate_tree
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
23
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 static void free_charset (re_charset_t *cset);
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 Idx node, bool root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax);
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 bool accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
89 re_charset_t *mbcset,
90 Idx *equiv_class_alloc,
91 const unsigned char *name);
92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
93 bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *char_class_alloc,
96 const char *class_name,
97 reg_syntax_t syntax);
98 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
99 RE_TRANSLATE_TYPE trans,
100 const char *class_name,
101 const char *extra,
102 bool non_match, reg_errcode_t *err);
103 static bin_tree_t *create_tree (re_dfa_t *dfa,
104 bin_tree_t *left, bin_tree_t *right,
105 re_token_type_t type);
106 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
107 bin_tree_t *left, bin_tree_t *right,
108 const re_token_t *token);
109 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
110 static void free_token (re_token_t *node);
111 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
112 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
113
114
115
116
117
118
119 static const char __re_error_msgid[] =
120 {
121 #define REG_NOERROR_IDX 0
122 gettext_noop ("Success")
123 "\0"
124 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
125 gettext_noop ("No match")
126 "\0"
127 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
128 gettext_noop ("Invalid regular expression")
129 "\0"
130 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
131 gettext_noop ("Invalid collation character")
132 "\0"
133 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
134 gettext_noop ("Invalid character class name")
135 "\0"
136 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
137 gettext_noop ("Trailing backslash")
138 "\0"
139 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
140 gettext_noop ("Invalid back reference")
141 "\0"
142 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
143 gettext_noop ("Unmatched [, [^, [:, [., or [=")
144 "\0"
145 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
146 gettext_noop ("Unmatched ( or \\(")
147 "\0"
148 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
149 gettext_noop ("Unmatched \\{")
150 "\0"
151 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
152 gettext_noop ("Invalid content of \\{\\}")
153 "\0"
154 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
155 gettext_noop ("Invalid range end")
156 "\0"
157 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
158 gettext_noop ("Memory exhausted")
159 "\0"
160 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
161 gettext_noop ("Invalid preceding regular expression")
162 "\0"
163 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
164 gettext_noop ("Premature end of regular expression")
165 "\0"
166 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
167 gettext_noop ("Regular expression too big")
168 "\0"
169 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
170 gettext_noop ("Unmatched ) or \\)")
171 };
172
173 static const size_t __re_error_msgid_idx[] =
174 {
175 REG_NOERROR_IDX,
176 REG_NOMATCH_IDX,
177 REG_BADPAT_IDX,
178 REG_ECOLLATE_IDX,
179 REG_ECTYPE_IDX,
180 REG_EESCAPE_IDX,
181 REG_ESUBREG_IDX,
182 REG_EBRACK_IDX,
183 REG_EPAREN_IDX,
184 REG_EBRACE_IDX,
185 REG_BADBR_IDX,
186 REG_ERANGE_IDX,
187 REG_ESPACE_IDX,
188 REG_BADRPT_IDX,
189 REG_EEND_IDX,
190 REG_ESIZE_IDX,
191 REG_ERPAREN_IDX
192 };
193
194
195
196
197
198
199
200
201
202
203 const char *
204 re_compile_pattern (const char *pattern, size_t length,
205 struct re_pattern_buffer *bufp)
206 {
207 reg_errcode_t ret;
208
209
210
211
212 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
213
214
215 bufp->newline_anchor = 1;
216
217 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
218
219 if (!ret)
220 return NULL;
221 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
222 }
223 weak_alias (__re_compile_pattern, re_compile_pattern)
224
225
226
227
228
229
230 reg_syntax_t re_syntax_options;
231
232
233
234
235
236
237
238
239
240 reg_syntax_t
241 re_set_syntax (reg_syntax_t syntax)
242 {
243 reg_syntax_t ret = re_syntax_options;
244
245 re_syntax_options = syntax;
246 return ret;
247 }
248 weak_alias (__re_set_syntax, re_set_syntax)
249
250 int
251 re_compile_fastmap (struct re_pattern_buffer *bufp)
252 {
253 re_dfa_t *dfa = bufp->buffer;
254 char *fastmap = bufp->fastmap;
255
256 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
257 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
258 if (dfa->init_state != dfa->init_state_word)
259 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
260 if (dfa->init_state != dfa->init_state_nl)
261 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
262 if (dfa->init_state != dfa->init_state_begbuf)
263 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
264 bufp->fastmap_accurate = 1;
265 return 0;
266 }
267 weak_alias (__re_compile_fastmap, re_compile_fastmap)
268
269 static __always_inline void
270 re_set_fastmap (char *fastmap, bool icase, int ch)
271 {
272 fastmap[ch] = 1;
273 if (icase)
274 fastmap[tolower (ch)] = 1;
275 }
276
277
278
279
280 static void
281 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
282 char *fastmap)
283 {
284 re_dfa_t *dfa = bufp->buffer;
285 Idx node_cnt;
286 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
287 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
288 {
289 Idx node = init_state->nodes.elems[node_cnt];
290 re_token_type_t type = dfa->nodes[node].type;
291
292 if (type == CHARACTER)
293 {
294 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
295 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
296 {
297 unsigned char buf[MB_LEN_MAX];
298 unsigned char *p;
299 wchar_t wc;
300 mbstate_t state;
301
302 p = buf;
303 *p++ = dfa->nodes[node].opr.c;
304 while (++node < dfa->nodes_len
305 && dfa->nodes[node].type == CHARACTER
306 && dfa->nodes[node].mb_partial)
307 *p++ = dfa->nodes[node].opr.c;
308 memset (&state, '\0', sizeof (state));
309 if (__mbrtowc (&wc, (const char *) buf, p - buf,
310 &state) == p - buf
311 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
312 != (size_t) -1))
313 re_set_fastmap (fastmap, false, buf[0]);
314 }
315 }
316 else if (type == SIMPLE_BRACKET)
317 {
318 int i, ch;
319 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
320 {
321 int j;
322 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
323 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
324 if (w & ((bitset_word_t) 1 << j))
325 re_set_fastmap (fastmap, icase, ch);
326 }
327 }
328 else if (type == COMPLEX_BRACKET)
329 {
330 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
331 Idx i;
332
333 #ifdef _LIBC
334
335
336
337
338
339
340 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
341 && (cset->ncoll_syms || cset->nranges))
342 {
343 const int32_t *table = (const int32_t *)
344 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
345 for (i = 0; i < SBC_MAX; ++i)
346 if (table[i] < 0)
347 re_set_fastmap (fastmap, icase, i);
348 }
349 #endif
350
351
352
353
354
355 if (dfa->mb_cur_max > 1
356 && (cset->nchar_classes || cset->non_match || cset->nranges
357 #ifdef _LIBC
358 || cset->nequiv_classes
359 #endif
360 ))
361 {
362 unsigned char c = 0;
363 do
364 {
365 mbstate_t mbs;
366 memset (&mbs, 0, sizeof (mbs));
367 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
368 re_set_fastmap (fastmap, false, (int) c);
369 }
370 while (++c != 0);
371 }
372
373 else
374 {
375
376 for (i = 0; i < cset->nmbchars; ++i)
377 {
378 char buf[256];
379 mbstate_t state;
380 memset (&state, '\0', sizeof (state));
381 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
382 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
383 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
384 {
385 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
386 != (size_t) -1)
387 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
388 }
389 }
390 }
391 }
392 else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE)
393 {
394 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
395 if (type == END_OF_RE)
396 bufp->can_be_null = 1;
397 return;
398 }
399 }
400 }
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438 int
439 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
440 {
441 reg_errcode_t ret;
442 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
443 : RE_SYNTAX_POSIX_BASIC);
444
445 preg->buffer = NULL;
446 preg->allocated = 0;
447 preg->used = 0;
448
449
450 preg->fastmap = re_malloc (char, SBC_MAX);
451 if (__glibc_unlikely (preg->fastmap == NULL))
452 return REG_ESPACE;
453
454 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
455
456
457 if (cflags & REG_NEWLINE)
458 {
459 syntax &= ~RE_DOT_NEWLINE;
460 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
461
462 preg->newline_anchor = 1;
463 }
464 else
465 preg->newline_anchor = 0;
466 preg->no_sub = !!(cflags & REG_NOSUB);
467 preg->translate = NULL;
468
469 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
470
471
472
473 if (ret == REG_ERPAREN)
474 ret = REG_EPAREN;
475
476
477 if (__glibc_likely (ret == REG_NOERROR))
478
479
480 (void) re_compile_fastmap (preg);
481 else
482 {
483
484 re_free (preg->fastmap);
485 preg->fastmap = NULL;
486 }
487
488 return (int) ret;
489 }
490 libc_hidden_def (__regcomp)
491 weak_alias (__regcomp, regcomp)
492
493
494
495
496 size_t
497 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
498 size_t errbuf_size)
499 {
500 const char *msg;
501 size_t msg_size;
502 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
503
504 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
505
506
507
508
509 abort ();
510
511 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
512
513 msg_size = strlen (msg) + 1;
514
515 if (__glibc_likely (errbuf_size != 0))
516 {
517 size_t cpy_size = msg_size;
518 if (__glibc_unlikely (msg_size > errbuf_size))
519 {
520 cpy_size = errbuf_size - 1;
521 errbuf[cpy_size] = '\0';
522 }
523 memcpy (errbuf, msg, cpy_size);
524 }
525
526 return msg_size;
527 }
528 weak_alias (__regerror, regerror)
529
530
531
532
533
534
535 static const bitset_t utf8_sb_map =
536 {
537
538 #if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
539 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
540 #else
541 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
542 # error "bitset_word_t is narrower than 32 bits"
543 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
544 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
545 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
546 BITSET_WORD_MAX, BITSET_WORD_MAX,
547 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
548 BITSET_WORD_MAX,
549 # endif
550 (BITSET_WORD_MAX
551 >> (SBC_MAX % BITSET_WORD_BITS == 0
552 ? 0
553 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
554 #endif
555 };
556
557
558 static void
559 free_dfa_content (re_dfa_t *dfa)
560 {
561 Idx i, j;
562
563 if (dfa->nodes)
564 for (i = 0; i < dfa->nodes_len; ++i)
565 free_token (dfa->nodes + i);
566 re_free (dfa->nexts);
567 for (i = 0; i < dfa->nodes_len; ++i)
568 {
569 if (dfa->eclosures != NULL)
570 re_node_set_free (dfa->eclosures + i);
571 if (dfa->inveclosures != NULL)
572 re_node_set_free (dfa->inveclosures + i);
573 if (dfa->edests != NULL)
574 re_node_set_free (dfa->edests + i);
575 }
576 re_free (dfa->edests);
577 re_free (dfa->eclosures);
578 re_free (dfa->inveclosures);
579 re_free (dfa->nodes);
580
581 if (dfa->state_table)
582 for (i = 0; i <= dfa->state_hash_mask; ++i)
583 {
584 struct re_state_table_entry *entry = dfa->state_table + i;
585 for (j = 0; j < entry->num; ++j)
586 {
587 re_dfastate_t *state = entry->array[j];
588 free_state (state);
589 }
590 re_free (entry->array);
591 }
592 re_free (dfa->state_table);
593 if (dfa->sb_char != utf8_sb_map)
594 re_free (dfa->sb_char);
595 re_free (dfa->subexp_map);
596 #ifdef DEBUG
597 re_free (dfa->re_str);
598 #endif
599
600 re_free (dfa);
601 }
602
603
604
605
606 void
607 regfree (regex_t *preg)
608 {
609 re_dfa_t *dfa = preg->buffer;
610 if (__glibc_likely (dfa != NULL))
611 {
612 lock_fini (dfa->lock);
613 free_dfa_content (dfa);
614 }
615 preg->buffer = NULL;
616 preg->allocated = 0;
617
618 re_free (preg->fastmap);
619 preg->fastmap = NULL;
620
621 re_free (preg->translate);
622 preg->translate = NULL;
623 }
624 libc_hidden_def (__regfree)
625 weak_alias (__regfree, regfree)
626
627
628
629
630 #if defined _REGEX_RE_COMP || defined _LIBC
631
632
633 static struct re_pattern_buffer re_comp_buf;
634
635 char *
636 # ifdef _LIBC
637
638
639
640 weak_function
641 # endif
642 re_comp (const char *s)
643 {
644 reg_errcode_t ret;
645 char *fastmap;
646
647 if (!s)
648 {
649 if (!re_comp_buf.buffer)
650 return gettext ("No previous regular expression");
651 return 0;
652 }
653
654 if (re_comp_buf.buffer)
655 {
656 fastmap = re_comp_buf.fastmap;
657 re_comp_buf.fastmap = NULL;
658 __regfree (&re_comp_buf);
659 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
660 re_comp_buf.fastmap = fastmap;
661 }
662
663 if (re_comp_buf.fastmap == NULL)
664 {
665 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
666 if (re_comp_buf.fastmap == NULL)
667 return (char *) gettext (__re_error_msgid
668 + __re_error_msgid_idx[(int) REG_ESPACE]);
669 }
670
671
672
673
674
675 re_comp_buf.newline_anchor = 1;
676
677 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
678
679 if (!ret)
680 return NULL;
681
682
683 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
684 }
685
686 #ifdef _LIBC
687 libc_freeres_fn (free_mem)
688 {
689 __regfree (&re_comp_buf);
690 }
691 #endif
692
693 #endif
694
695
696
697
698
699 static reg_errcode_t
700 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
701 reg_syntax_t syntax)
702 {
703 reg_errcode_t err = REG_NOERROR;
704 re_dfa_t *dfa;
705 re_string_t regexp;
706
707
708 preg->fastmap_accurate = 0;
709 preg->syntax = syntax;
710 preg->not_bol = preg->not_eol = 0;
711 preg->used = 0;
712 preg->re_nsub = 0;
713 preg->can_be_null = 0;
714 preg->regs_allocated = REGS_UNALLOCATED;
715
716
717 dfa = preg->buffer;
718 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
719 {
720
721
722
723
724 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
725 if (dfa == NULL)
726 return REG_ESPACE;
727 preg->allocated = sizeof (re_dfa_t);
728 preg->buffer = dfa;
729 }
730 preg->used = sizeof (re_dfa_t);
731
732 err = init_dfa (dfa, length);
733 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
734 err = REG_ESPACE;
735 if (__glibc_unlikely (err != REG_NOERROR))
736 {
737 free_dfa_content (dfa);
738 preg->buffer = NULL;
739 preg->allocated = 0;
740 return err;
741 }
742 #ifdef DEBUG
743
744 dfa->re_str = re_malloc (char, length + 1);
745 strncpy (dfa->re_str, pattern, length + 1);
746 #endif
747
748 err = re_string_construct (®exp, pattern, length, preg->translate,
749 (syntax & RE_ICASE) != 0, dfa);
750 if (__glibc_unlikely (err != REG_NOERROR))
751 {
752 re_compile_internal_free_return:
753 free_workarea_compile (preg);
754 re_string_destruct (®exp);
755 lock_fini (dfa->lock);
756 free_dfa_content (dfa);
757 preg->buffer = NULL;
758 preg->allocated = 0;
759 return err;
760 }
761
762
763 preg->re_nsub = 0;
764 dfa->str_tree = parse (®exp, preg, syntax, &err);
765 if (__glibc_unlikely (dfa->str_tree == NULL))
766 goto re_compile_internal_free_return;
767
768
769 err = analyze (preg);
770 if (__glibc_unlikely (err != REG_NOERROR))
771 goto re_compile_internal_free_return;
772
773
774 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
775 optimize_utf8 (dfa);
776
777
778 err = create_initial_state (dfa);
779
780
781 free_workarea_compile (preg);
782 re_string_destruct (®exp);
783
784 if (__glibc_unlikely (err != REG_NOERROR))
785 {
786 lock_fini (dfa->lock);
787 free_dfa_content (dfa);
788 preg->buffer = NULL;
789 preg->allocated = 0;
790 }
791
792 return err;
793 }
794
795
796
797
798 static reg_errcode_t
799 init_dfa (re_dfa_t *dfa, size_t pat_len)
800 {
801 __re_size_t table_size;
802 #ifndef _LIBC
803 const char *codeset_name;
804 #endif
805 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
806 size_t max_object_size =
807 MAX (sizeof (struct re_state_table_entry),
808 MAX (sizeof (re_token_t),
809 MAX (sizeof (re_node_set),
810 MAX (sizeof (regmatch_t),
811 max_i18n_object_size))));
812
813 memset (dfa, '\0', sizeof (re_dfa_t));
814
815
816 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
817
818
819
820
821
822 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
823 <= pat_len))
824 return REG_ESPACE;
825
826 dfa->nodes_alloc = pat_len + 1;
827 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
828
829
830 for (table_size = 1; ; table_size <<= 1)
831 if (table_size > pat_len)
832 break;
833
834 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
835 dfa->state_hash_mask = table_size - 1;
836
837 dfa->mb_cur_max = MB_CUR_MAX;
838 #ifdef _LIBC
839 if (dfa->mb_cur_max == 6
840 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
841 dfa->is_utf8 = 1;
842 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
843 != 0);
844 #else
845 codeset_name = nl_langinfo (CODESET);
846 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
847 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
848 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
849 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
850 dfa->is_utf8 = 1;
851
852
853
854 dfa->map_notascii = 0;
855 #endif
856
857 if (dfa->mb_cur_max > 1)
858 {
859 if (dfa->is_utf8)
860 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
861 else
862 {
863 int i, j, ch;
864
865 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
866 if (__glibc_unlikely (dfa->sb_char == NULL))
867 return REG_ESPACE;
868
869
870 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
871 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
872 {
873 wint_t wch = __btowc (ch);
874 if (wch != WEOF)
875 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
876 #ifndef _LIBC
877 if (isascii (ch) && wch != ch)
878 dfa->map_notascii = 1;
879 #endif
880 }
881 }
882 }
883
884 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
885 return REG_ESPACE;
886 return REG_NOERROR;
887 }
888
889
890
891
892
893 static void
894 init_word_char (re_dfa_t *dfa)
895 {
896 int i = 0;
897 int j;
898 int ch = 0;
899 dfa->word_ops_used = 1;
900 if (__glibc_likely (dfa->map_notascii == 0))
901 {
902 bitset_word_t bits0 = 0x00000000;
903 bitset_word_t bits1 = 0x03ff0000;
904 bitset_word_t bits2 = 0x87fffffe;
905 bitset_word_t bits3 = 0x07fffffe;
906 if (BITSET_WORD_BITS == 64)
907 {
908
909 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
910 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
911 i = 2;
912 }
913 else if (BITSET_WORD_BITS == 32)
914 {
915 dfa->word_char[0] = bits0;
916 dfa->word_char[1] = bits1;
917 dfa->word_char[2] = bits2;
918 dfa->word_char[3] = bits3;
919 i = 4;
920 }
921 else
922 goto general_case;
923 ch = 128;
924
925 if (__glibc_likely (dfa->is_utf8))
926 {
927 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
928 return;
929 }
930 }
931
932 general_case:
933 for (; i < BITSET_WORDS; ++i)
934 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
935 if (isalnum (ch) || ch == '_')
936 dfa->word_char[i] |= (bitset_word_t) 1 << j;
937 }
938
939
940
941 static void
942 free_workarea_compile (regex_t *preg)
943 {
944 re_dfa_t *dfa = preg->buffer;
945 bin_tree_storage_t *storage, *next;
946 for (storage = dfa->str_tree_storage; storage; storage = next)
947 {
948 next = storage->next;
949 re_free (storage);
950 }
951 dfa->str_tree_storage = NULL;
952 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
953 dfa->str_tree = NULL;
954 re_free (dfa->org_indices);
955 dfa->org_indices = NULL;
956 }
957
958
959
960 static reg_errcode_t
961 create_initial_state (re_dfa_t *dfa)
962 {
963 Idx first, i;
964 reg_errcode_t err;
965 re_node_set init_nodes;
966
967
968
969 first = dfa->str_tree->first->node_idx;
970 dfa->init_node = first;
971 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
972 if (__glibc_unlikely (err != REG_NOERROR))
973 return err;
974
975
976
977
978
979 if (dfa->nbackref > 0)
980 for (i = 0; i < init_nodes.nelem; ++i)
981 {
982 Idx node_idx = init_nodes.elems[i];
983 re_token_type_t type = dfa->nodes[node_idx].type;
984
985 Idx clexp_idx;
986 if (type != OP_BACK_REF)
987 continue;
988 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
989 {
990 re_token_t *clexp_node;
991 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
992 if (clexp_node->type == OP_CLOSE_SUBEXP
993 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
994 break;
995 }
996 if (clexp_idx == init_nodes.nelem)
997 continue;
998
999 if (type == OP_BACK_REF)
1000 {
1001 Idx dest_idx = dfa->edests[node_idx].elems[0];
1002 if (!re_node_set_contains (&init_nodes, dest_idx))
1003 {
1004 reg_errcode_t merge_err
1005 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1006 if (merge_err != REG_NOERROR)
1007 return merge_err;
1008 i = 0;
1009 }
1010 }
1011 }
1012
1013
1014 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1015
1016 if (__glibc_unlikely (dfa->init_state == NULL))
1017 return err;
1018 if (dfa->init_state->has_constraint)
1019 {
1020 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1021 CONTEXT_WORD);
1022 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1023 CONTEXT_NEWLINE);
1024 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1025 &init_nodes,
1026 CONTEXT_NEWLINE
1027 | CONTEXT_BEGBUF);
1028 if (__glibc_unlikely (dfa->init_state_word == NULL
1029 || dfa->init_state_nl == NULL
1030 || dfa->init_state_begbuf == NULL))
1031 return err;
1032 }
1033 else
1034 dfa->init_state_word = dfa->init_state_nl
1035 = dfa->init_state_begbuf = dfa->init_state;
1036
1037 re_node_set_free (&init_nodes);
1038 return REG_NOERROR;
1039 }
1040
1041
1042
1043
1044
1045 static void
1046 optimize_utf8 (re_dfa_t *dfa)
1047 {
1048 Idx node;
1049 int i;
1050 bool mb_chars = false;
1051 bool has_period = false;
1052
1053 for (node = 0; node < dfa->nodes_len; ++node)
1054 switch (dfa->nodes[node].type)
1055 {
1056 case CHARACTER:
1057 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1058 mb_chars = true;
1059 break;
1060 case ANCHOR:
1061 switch (dfa->nodes[node].opr.ctx_type)
1062 {
1063 case LINE_FIRST:
1064 case LINE_LAST:
1065 case BUF_FIRST:
1066 case BUF_LAST:
1067 break;
1068 default:
1069
1070
1071
1072 return;
1073 }
1074 break;
1075 case OP_PERIOD:
1076 has_period = true;
1077 break;
1078 case OP_BACK_REF:
1079 case OP_ALT:
1080 case END_OF_RE:
1081 case OP_DUP_ASTERISK:
1082 case OP_OPEN_SUBEXP:
1083 case OP_CLOSE_SUBEXP:
1084 break;
1085 case COMPLEX_BRACKET:
1086 return;
1087 case SIMPLE_BRACKET:
1088
1089 {
1090 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1091 ? 0
1092 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1093 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1094 {
1095 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1096 return;
1097 rshift = 0;
1098 }
1099 }
1100 break;
1101 default:
1102 abort ();
1103 }
1104
1105 if (mb_chars || has_period)
1106 for (node = 0; node < dfa->nodes_len; ++node)
1107 {
1108 if (dfa->nodes[node].type == CHARACTER
1109 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1110 dfa->nodes[node].mb_partial = 0;
1111 else if (dfa->nodes[node].type == OP_PERIOD)
1112 dfa->nodes[node].type = OP_UTF8_PERIOD;
1113 }
1114
1115
1116 dfa->mb_cur_max = 1;
1117 dfa->is_utf8 = 0;
1118 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1119 }
1120
1121
1122
1123
1124 static reg_errcode_t
1125 analyze (regex_t *preg)
1126 {
1127 re_dfa_t *dfa = preg->buffer;
1128 reg_errcode_t ret;
1129
1130
1131 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1132 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1133 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1136 || dfa->edests == NULL || dfa->eclosures == NULL))
1137 return REG_ESPACE;
1138
1139 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1140 if (dfa->subexp_map != NULL)
1141 {
1142 Idx i;
1143 for (i = 0; i < preg->re_nsub; i++)
1144 dfa->subexp_map[i] = i;
1145 preorder (dfa->str_tree, optimize_subexps, dfa);
1146 for (i = 0; i < preg->re_nsub; i++)
1147 if (dfa->subexp_map[i] != i)
1148 break;
1149 if (i == preg->re_nsub)
1150 {
1151 re_free (dfa->subexp_map);
1152 dfa->subexp_map = NULL;
1153 }
1154 }
1155
1156 ret = postorder (dfa->str_tree, lower_subexps, preg);
1157 if (__glibc_unlikely (ret != REG_NOERROR))
1158 return ret;
1159 ret = postorder (dfa->str_tree, calc_first, dfa);
1160 if (__glibc_unlikely (ret != REG_NOERROR))
1161 return ret;
1162 preorder (dfa->str_tree, calc_next, dfa);
1163 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1164 if (__glibc_unlikely (ret != REG_NOERROR))
1165 return ret;
1166 ret = calc_eclosure (dfa);
1167 if (__glibc_unlikely (ret != REG_NOERROR))
1168 return ret;
1169
1170
1171
1172 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1173 || dfa->nbackref)
1174 {
1175 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1176 if (__glibc_unlikely (dfa->inveclosures == NULL))
1177 return REG_ESPACE;
1178 ret = calc_inveclosure (dfa);
1179 }
1180
1181 return ret;
1182 }
1183
1184
1185
1186
1187 static reg_errcode_t
1188 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1189 void *extra)
1190 {
1191 bin_tree_t *node, *prev;
1192
1193 for (node = root; ; )
1194 {
1195
1196
1197 while (node->left || node->right)
1198 if (node->left)
1199 node = node->left;
1200 else
1201 node = node->right;
1202
1203 do
1204 {
1205 reg_errcode_t err = fn (extra, node);
1206 if (__glibc_unlikely (err != REG_NOERROR))
1207 return err;
1208 if (node->parent == NULL)
1209 return REG_NOERROR;
1210 prev = node;
1211 node = node->parent;
1212 }
1213
1214 while (node->right == prev || node->right == NULL);
1215 node = node->right;
1216 }
1217 }
1218
1219 static reg_errcode_t
1220 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1221 void *extra)
1222 {
1223 bin_tree_t *node;
1224
1225 for (node = root; ; )
1226 {
1227 reg_errcode_t err = fn (extra, node);
1228 if (__glibc_unlikely (err != REG_NOERROR))
1229 return err;
1230
1231
1232 if (node->left)
1233 node = node->left;
1234 else
1235 {
1236 bin_tree_t *prev = NULL;
1237 while (node->right == prev || node->right == NULL)
1238 {
1239 prev = node;
1240 node = node->parent;
1241 if (!node)
1242 return REG_NOERROR;
1243 }
1244 node = node->right;
1245 }
1246 }
1247 }
1248
1249
1250
1251
1252 static reg_errcode_t
1253 optimize_subexps (void *extra, bin_tree_t *node)
1254 {
1255 re_dfa_t *dfa = (re_dfa_t *) extra;
1256
1257 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1258 {
1259 int idx = node->token.opr.idx;
1260 node->token.opr.idx = dfa->subexp_map[idx];
1261 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1262 }
1263
1264 else if (node->token.type == SUBEXP
1265 && node->left && node->left->token.type == SUBEXP)
1266 {
1267 Idx other_idx = node->left->token.opr.idx;
1268
1269 node->left = node->left->left;
1270 if (node->left)
1271 node->left->parent = node;
1272
1273 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1274 if (other_idx < BITSET_WORD_BITS)
1275 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1276 }
1277
1278 return REG_NOERROR;
1279 }
1280
1281
1282
1283 static reg_errcode_t
1284 lower_subexps (void *extra, bin_tree_t *node)
1285 {
1286 regex_t *preg = (regex_t *) extra;
1287 reg_errcode_t err = REG_NOERROR;
1288
1289 if (node->left && node->left->token.type == SUBEXP)
1290 {
1291 node->left = lower_subexp (&err, preg, node->left);
1292 if (node->left)
1293 node->left->parent = node;
1294 }
1295 if (node->right && node->right->token.type == SUBEXP)
1296 {
1297 node->right = lower_subexp (&err, preg, node->right);
1298 if (node->right)
1299 node->right->parent = node;
1300 }
1301
1302 return err;
1303 }
1304
1305 static bin_tree_t *
1306 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1307 {
1308 re_dfa_t *dfa = preg->buffer;
1309 bin_tree_t *body = node->left;
1310 bin_tree_t *op, *cls, *tree1, *tree;
1311
1312 if (preg->no_sub
1313
1314
1315
1316
1317 && node->left != NULL
1318 && (node->token.opr.idx >= BITSET_WORD_BITS
1319 || !(dfa->used_bkref_map
1320 & ((bitset_word_t) 1 << node->token.opr.idx))))
1321 return node->left;
1322
1323
1324
1325 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1326 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1327 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1328 tree = create_tree (dfa, op, tree1, CONCAT);
1329 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1330 || op == NULL || cls == NULL))
1331 {
1332 *err = REG_ESPACE;
1333 return NULL;
1334 }
1335
1336 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1337 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1338 return tree;
1339 }
1340
1341
1342
1343 static reg_errcode_t
1344 calc_first (void *extra, bin_tree_t *node)
1345 {
1346 re_dfa_t *dfa = (re_dfa_t *) extra;
1347 if (node->token.type == CONCAT)
1348 {
1349 node->first = node->left->first;
1350 node->node_idx = node->left->node_idx;
1351 }
1352 else
1353 {
1354 node->first = node;
1355 node->node_idx = re_dfa_add_node (dfa, node->token);
1356 if (__glibc_unlikely (node->node_idx == -1))
1357 return REG_ESPACE;
1358 if (node->token.type == ANCHOR)
1359 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1360 }
1361 return REG_NOERROR;
1362 }
1363
1364
1365 static reg_errcode_t
1366 calc_next (void *extra, bin_tree_t *node)
1367 {
1368 switch (node->token.type)
1369 {
1370 case OP_DUP_ASTERISK:
1371 node->left->next = node;
1372 break;
1373 case CONCAT:
1374 node->left->next = node->right->first;
1375 node->right->next = node->next;
1376 break;
1377 default:
1378 if (node->left)
1379 node->left->next = node->next;
1380 if (node->right)
1381 node->right->next = node->next;
1382 break;
1383 }
1384 return REG_NOERROR;
1385 }
1386
1387
1388 static reg_errcode_t
1389 link_nfa_nodes (void *extra, bin_tree_t *node)
1390 {
1391 re_dfa_t *dfa = (re_dfa_t *) extra;
1392 Idx idx = node->node_idx;
1393 reg_errcode_t err = REG_NOERROR;
1394
1395 switch (node->token.type)
1396 {
1397 case CONCAT:
1398 break;
1399
1400 case END_OF_RE:
1401 DEBUG_ASSERT (node->next == NULL);
1402 break;
1403
1404 case OP_DUP_ASTERISK:
1405 case OP_ALT:
1406 {
1407 Idx left, right;
1408 dfa->has_plural_match = 1;
1409 if (node->left != NULL)
1410 left = node->left->first->node_idx;
1411 else
1412 left = node->next->node_idx;
1413 if (node->right != NULL)
1414 right = node->right->first->node_idx;
1415 else
1416 right = node->next->node_idx;
1417 DEBUG_ASSERT (left > -1);
1418 DEBUG_ASSERT (right > -1);
1419 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1420 }
1421 break;
1422
1423 case ANCHOR:
1424 case OP_OPEN_SUBEXP:
1425 case OP_CLOSE_SUBEXP:
1426 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1427 break;
1428
1429 case OP_BACK_REF:
1430 dfa->nexts[idx] = node->next->node_idx;
1431 if (node->token.type == OP_BACK_REF)
1432 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1433 break;
1434
1435 default:
1436 DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1437 dfa->nexts[idx] = node->next->node_idx;
1438 break;
1439 }
1440
1441 return err;
1442 }
1443
1444
1445
1446
1447
1448 static reg_errcode_t
1449 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1450 Idx root_node, unsigned int init_constraint)
1451 {
1452 Idx org_node, clone_node;
1453 bool ok;
1454 unsigned int constraint = init_constraint;
1455 for (org_node = top_org_node, clone_node = top_clone_node;;)
1456 {
1457 Idx org_dest, clone_dest;
1458 if (dfa->nodes[org_node].type == OP_BACK_REF)
1459 {
1460
1461
1462
1463
1464 org_dest = dfa->nexts[org_node];
1465 re_node_set_empty (dfa->edests + clone_node);
1466 clone_dest = duplicate_node (dfa, org_dest, constraint);
1467 if (__glibc_unlikely (clone_dest == -1))
1468 return REG_ESPACE;
1469 dfa->nexts[clone_node] = dfa->nexts[org_node];
1470 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1471 if (__glibc_unlikely (! ok))
1472 return REG_ESPACE;
1473 }
1474 else if (dfa->edests[org_node].nelem == 0)
1475 {
1476
1477
1478
1479 dfa->nexts[clone_node] = dfa->nexts[org_node];
1480 break;
1481 }
1482 else if (dfa->edests[org_node].nelem == 1)
1483 {
1484
1485
1486 org_dest = dfa->edests[org_node].elems[0];
1487 re_node_set_empty (dfa->edests + clone_node);
1488
1489
1490 if (org_node == root_node && clone_node != org_node)
1491 {
1492 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1493 if (__glibc_unlikely (! ok))
1494 return REG_ESPACE;
1495 break;
1496 }
1497
1498 constraint |= dfa->nodes[org_node].constraint;
1499 clone_dest = duplicate_node (dfa, org_dest, constraint);
1500 if (__glibc_unlikely (clone_dest == -1))
1501 return REG_ESPACE;
1502 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1503 if (__glibc_unlikely (! ok))
1504 return REG_ESPACE;
1505 }
1506 else
1507 {
1508
1509
1510 org_dest = dfa->edests[org_node].elems[0];
1511 re_node_set_empty (dfa->edests + clone_node);
1512
1513 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1514 if (clone_dest == -1)
1515 {
1516
1517 reg_errcode_t err;
1518 clone_dest = duplicate_node (dfa, org_dest, constraint);
1519 if (__glibc_unlikely (clone_dest == -1))
1520 return REG_ESPACE;
1521 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1522 if (__glibc_unlikely (! ok))
1523 return REG_ESPACE;
1524 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1525 root_node, constraint);
1526 if (__glibc_unlikely (err != REG_NOERROR))
1527 return err;
1528 }
1529 else
1530 {
1531
1532
1533 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534 if (__glibc_unlikely (! ok))
1535 return REG_ESPACE;
1536 }
1537
1538 org_dest = dfa->edests[org_node].elems[1];
1539 clone_dest = duplicate_node (dfa, org_dest, constraint);
1540 if (__glibc_unlikely (clone_dest == -1))
1541 return REG_ESPACE;
1542 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1543 if (__glibc_unlikely (! ok))
1544 return REG_ESPACE;
1545 }
1546 org_node = org_dest;
1547 clone_node = clone_dest;
1548 }
1549 return REG_NOERROR;
1550 }
1551
1552
1553
1554
1555 static Idx
1556 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1557 unsigned int constraint)
1558 {
1559 Idx idx;
1560 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1561 {
1562 if (org_node == dfa->org_indices[idx]
1563 && constraint == dfa->nodes[idx].constraint)
1564 return idx;
1565 }
1566 return -1;
1567 }
1568
1569
1570
1571
1572
1573 static Idx
1574 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1575 {
1576 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1577 if (__glibc_likely (dup_idx != -1))
1578 {
1579 dfa->nodes[dup_idx].constraint = constraint;
1580 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1581 dfa->nodes[dup_idx].duplicated = 1;
1582
1583
1584 dfa->org_indices[dup_idx] = org_idx;
1585 }
1586 return dup_idx;
1587 }
1588
1589 static reg_errcode_t
1590 calc_inveclosure (re_dfa_t *dfa)
1591 {
1592 Idx src, idx;
1593 bool ok;
1594 for (idx = 0; idx < dfa->nodes_len; ++idx)
1595 re_node_set_init_empty (dfa->inveclosures + idx);
1596
1597 for (src = 0; src < dfa->nodes_len; ++src)
1598 {
1599 Idx *elems = dfa->eclosures[src].elems;
1600 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1601 {
1602 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1603 if (__glibc_unlikely (! ok))
1604 return REG_ESPACE;
1605 }
1606 }
1607
1608 return REG_NOERROR;
1609 }
1610
1611
1612
1613 static reg_errcode_t
1614 calc_eclosure (re_dfa_t *dfa)
1615 {
1616 Idx node_idx;
1617 bool incomplete;
1618 DEBUG_ASSERT (dfa->nodes_len > 0);
1619 incomplete = false;
1620
1621 for (node_idx = 0; ; ++node_idx)
1622 {
1623 reg_errcode_t err;
1624 re_node_set eclosure_elem;
1625 if (node_idx == dfa->nodes_len)
1626 {
1627 if (!incomplete)
1628 break;
1629 incomplete = false;
1630 node_idx = 0;
1631 }
1632
1633 DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1634
1635
1636 if (dfa->eclosures[node_idx].nelem != 0)
1637 continue;
1638
1639 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1640 if (__glibc_unlikely (err != REG_NOERROR))
1641 return err;
1642
1643 if (dfa->eclosures[node_idx].nelem == 0)
1644 {
1645 incomplete = true;
1646 re_node_set_free (&eclosure_elem);
1647 }
1648 }
1649 return REG_NOERROR;
1650 }
1651
1652
1653
1654 static reg_errcode_t
1655 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1656 {
1657 reg_errcode_t err;
1658 Idx i;
1659 re_node_set eclosure;
1660 bool incomplete = false;
1661 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1662 if (__glibc_unlikely (err != REG_NOERROR))
1663 return err;
1664
1665
1666 eclosure.elems[eclosure.nelem++] = node;
1667
1668
1669
1670 dfa->eclosures[node].nelem = -1;
1671
1672
1673
1674 if (dfa->nodes[node].constraint
1675 && dfa->edests[node].nelem
1676 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1677 {
1678 err = duplicate_node_closure (dfa, node, node, node,
1679 dfa->nodes[node].constraint);
1680 if (__glibc_unlikely (err != REG_NOERROR))
1681 return err;
1682 }
1683
1684
1685 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1686 for (i = 0; i < dfa->edests[node].nelem; ++i)
1687 {
1688 re_node_set eclosure_elem;
1689 Idx edest = dfa->edests[node].elems[i];
1690
1691
1692 if (dfa->eclosures[edest].nelem == -1)
1693 {
1694 incomplete = true;
1695 continue;
1696 }
1697
1698
1699 if (dfa->eclosures[edest].nelem == 0)
1700 {
1701 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1702 if (__glibc_unlikely (err != REG_NOERROR))
1703 return err;
1704 }
1705 else
1706 eclosure_elem = dfa->eclosures[edest];
1707
1708 err = re_node_set_merge (&eclosure, &eclosure_elem);
1709 if (__glibc_unlikely (err != REG_NOERROR))
1710 return err;
1711
1712
1713 if (dfa->eclosures[edest].nelem == 0)
1714 {
1715 incomplete = true;
1716 re_node_set_free (&eclosure_elem);
1717 }
1718 }
1719
1720 if (incomplete && !root)
1721 dfa->eclosures[node].nelem = 0;
1722 else
1723 dfa->eclosures[node] = eclosure;
1724 *new_set = eclosure;
1725 return REG_NOERROR;
1726 }
1727
1728
1729
1730
1731
1732
1733 static void
1734 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1735 {
1736 re_string_skip_bytes (input, peek_token (result, input, syntax));
1737 }
1738
1739
1740
1741
1742 static int
1743 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1744 {
1745 unsigned char c;
1746
1747 if (re_string_eoi (input))
1748 {
1749 token->type = END_OF_RE;
1750 return 0;
1751 }
1752
1753 c = re_string_peek_byte (input, 0);
1754 token->opr.c = c;
1755
1756 token->word_char = 0;
1757 token->mb_partial = 0;
1758 if (input->mb_cur_max > 1
1759 && !re_string_first_byte (input, re_string_cur_idx (input)))
1760 {
1761 token->type = CHARACTER;
1762 token->mb_partial = 1;
1763 return 1;
1764 }
1765 if (c == '\\')
1766 {
1767 unsigned char c2;
1768 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1769 {
1770 token->type = BACK_SLASH;
1771 return 1;
1772 }
1773
1774 c2 = re_string_peek_byte_case (input, 1);
1775 token->opr.c = c2;
1776 token->type = CHARACTER;
1777 if (input->mb_cur_max > 1)
1778 {
1779 wint_t wc = re_string_wchar_at (input,
1780 re_string_cur_idx (input) + 1);
1781 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1782 }
1783 else
1784 token->word_char = IS_WORD_CHAR (c2) != 0;
1785
1786 switch (c2)
1787 {
1788 case '|':
1789 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1790 token->type = OP_ALT;
1791 break;
1792 case '1': case '2': case '3': case '4': case '5':
1793 case '6': case '7': case '8': case '9':
1794 if (!(syntax & RE_NO_BK_REFS))
1795 {
1796 token->type = OP_BACK_REF;
1797 token->opr.idx = c2 - '1';
1798 }
1799 break;
1800 case '<':
1801 if (!(syntax & RE_NO_GNU_OPS))
1802 {
1803 token->type = ANCHOR;
1804 token->opr.ctx_type = WORD_FIRST;
1805 }
1806 break;
1807 case '>':
1808 if (!(syntax & RE_NO_GNU_OPS))
1809 {
1810 token->type = ANCHOR;
1811 token->opr.ctx_type = WORD_LAST;
1812 }
1813 break;
1814 case 'b':
1815 if (!(syntax & RE_NO_GNU_OPS))
1816 {
1817 token->type = ANCHOR;
1818 token->opr.ctx_type = WORD_DELIM;
1819 }
1820 break;
1821 case 'B':
1822 if (!(syntax & RE_NO_GNU_OPS))
1823 {
1824 token->type = ANCHOR;
1825 token->opr.ctx_type = NOT_WORD_DELIM;
1826 }
1827 break;
1828 case 'w':
1829 if (!(syntax & RE_NO_GNU_OPS))
1830 token->type = OP_WORD;
1831 break;
1832 case 'W':
1833 if (!(syntax & RE_NO_GNU_OPS))
1834 token->type = OP_NOTWORD;
1835 break;
1836 case 's':
1837 if (!(syntax & RE_NO_GNU_OPS))
1838 token->type = OP_SPACE;
1839 break;
1840 case 'S':
1841 if (!(syntax & RE_NO_GNU_OPS))
1842 token->type = OP_NOTSPACE;
1843 break;
1844 case '`':
1845 if (!(syntax & RE_NO_GNU_OPS))
1846 {
1847 token->type = ANCHOR;
1848 token->opr.ctx_type = BUF_FIRST;
1849 }
1850 break;
1851 case '\'':
1852 if (!(syntax & RE_NO_GNU_OPS))
1853 {
1854 token->type = ANCHOR;
1855 token->opr.ctx_type = BUF_LAST;
1856 }
1857 break;
1858 case '(':
1859 if (!(syntax & RE_NO_BK_PARENS))
1860 token->type = OP_OPEN_SUBEXP;
1861 break;
1862 case ')':
1863 if (!(syntax & RE_NO_BK_PARENS))
1864 token->type = OP_CLOSE_SUBEXP;
1865 break;
1866 case '+':
1867 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1868 token->type = OP_DUP_PLUS;
1869 break;
1870 case '?':
1871 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1872 token->type = OP_DUP_QUESTION;
1873 break;
1874 case '{':
1875 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1876 token->type = OP_OPEN_DUP_NUM;
1877 break;
1878 case '}':
1879 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1880 token->type = OP_CLOSE_DUP_NUM;
1881 break;
1882 default:
1883 break;
1884 }
1885 return 2;
1886 }
1887
1888 token->type = CHARACTER;
1889 if (input->mb_cur_max > 1)
1890 {
1891 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1892 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1893 }
1894 else
1895 token->word_char = IS_WORD_CHAR (token->opr.c);
1896
1897 switch (c)
1898 {
1899 case '\n':
1900 if (syntax & RE_NEWLINE_ALT)
1901 token->type = OP_ALT;
1902 break;
1903 case '|':
1904 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1905 token->type = OP_ALT;
1906 break;
1907 case '*':
1908 token->type = OP_DUP_ASTERISK;
1909 break;
1910 case '+':
1911 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1912 token->type = OP_DUP_PLUS;
1913 break;
1914 case '?':
1915 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1916 token->type = OP_DUP_QUESTION;
1917 break;
1918 case '{':
1919 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1920 token->type = OP_OPEN_DUP_NUM;
1921 break;
1922 case '}':
1923 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1924 token->type = OP_CLOSE_DUP_NUM;
1925 break;
1926 case '(':
1927 if (syntax & RE_NO_BK_PARENS)
1928 token->type = OP_OPEN_SUBEXP;
1929 break;
1930 case ')':
1931 if (syntax & RE_NO_BK_PARENS)
1932 token->type = OP_CLOSE_SUBEXP;
1933 break;
1934 case '[':
1935 token->type = OP_OPEN_BRACKET;
1936 break;
1937 case '.':
1938 token->type = OP_PERIOD;
1939 break;
1940 case '^':
1941 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1942 && re_string_cur_idx (input) != 0)
1943 {
1944 char prev = re_string_peek_byte (input, -1);
1945 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1946 break;
1947 }
1948 token->type = ANCHOR;
1949 token->opr.ctx_type = LINE_FIRST;
1950 break;
1951 case '$':
1952 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1953 && re_string_cur_idx (input) + 1 != re_string_length (input))
1954 {
1955 re_token_t next;
1956 re_string_skip_bytes (input, 1);
1957 peek_token (&next, input, syntax);
1958 re_string_skip_bytes (input, -1);
1959 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1960 break;
1961 }
1962 token->type = ANCHOR;
1963 token->opr.ctx_type = LINE_LAST;
1964 break;
1965 default:
1966 break;
1967 }
1968 return 1;
1969 }
1970
1971
1972
1973
1974 static int
1975 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1976 {
1977 unsigned char c;
1978 if (re_string_eoi (input))
1979 {
1980 token->type = END_OF_RE;
1981 return 0;
1982 }
1983 c = re_string_peek_byte (input, 0);
1984 token->opr.c = c;
1985
1986 if (input->mb_cur_max > 1
1987 && !re_string_first_byte (input, re_string_cur_idx (input)))
1988 {
1989 token->type = CHARACTER;
1990 return 1;
1991 }
1992
1993 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1994 && re_string_cur_idx (input) + 1 < re_string_length (input))
1995 {
1996
1997 unsigned char c2;
1998 re_string_skip_bytes (input, 1);
1999 c2 = re_string_peek_byte (input, 0);
2000 token->opr.c = c2;
2001 token->type = CHARACTER;
2002 return 1;
2003 }
2004 if (c == '[')
2005 {
2006 unsigned char c2;
2007 int token_len;
2008 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2009 c2 = re_string_peek_byte (input, 1);
2010 else
2011 c2 = 0;
2012 token->opr.c = c2;
2013 token_len = 2;
2014 switch (c2)
2015 {
2016 case '.':
2017 token->type = OP_OPEN_COLL_ELEM;
2018 break;
2019
2020 case '=':
2021 token->type = OP_OPEN_EQUIV_CLASS;
2022 break;
2023
2024 case ':':
2025 if (syntax & RE_CHAR_CLASSES)
2026 {
2027 token->type = OP_OPEN_CHAR_CLASS;
2028 break;
2029 }
2030 FALLTHROUGH;
2031 default:
2032 token->type = CHARACTER;
2033 token->opr.c = c;
2034 token_len = 1;
2035 break;
2036 }
2037 return token_len;
2038 }
2039 switch (c)
2040 {
2041 case ']':
2042 token->type = OP_CLOSE_BRACKET;
2043 break;
2044 case '^':
2045 token->type = OP_NON_MATCH_LIST;
2046 break;
2047 case '-':
2048
2049
2050
2051 if (! (re_string_cur_idx (input) + 2 < re_string_length (input)
2052 && re_string_peek_byte (input, 1) == '-'
2053 && re_string_peek_byte (input, 2) == '-'))
2054 {
2055 token->type = OP_CHARSET_RANGE;
2056 break;
2057 }
2058 re_string_skip_bytes (input, 2);
2059 FALLTHROUGH;
2060 default:
2061 token->type = CHARACTER;
2062 }
2063 return 1;
2064 }
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080 static bin_tree_t *
2081 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2082 reg_errcode_t *err)
2083 {
2084 re_dfa_t *dfa = preg->buffer;
2085 bin_tree_t *tree, *eor, *root;
2086 re_token_t current_token;
2087 dfa->syntax = syntax;
2088 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2089 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2090 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2091 return NULL;
2092 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2093 if (tree != NULL)
2094 root = create_tree (dfa, tree, eor, CONCAT);
2095 else
2096 root = eor;
2097 if (__glibc_unlikely (eor == NULL || root == NULL))
2098 {
2099 *err = REG_ESPACE;
2100 return NULL;
2101 }
2102 return root;
2103 }
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114 static bin_tree_t *
2115 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2116 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2117 {
2118 re_dfa_t *dfa = preg->buffer;
2119 bin_tree_t *tree, *branch = NULL;
2120 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2121 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2122 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2123 return NULL;
2124
2125 while (token->type == OP_ALT)
2126 {
2127 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2128 if (token->type != OP_ALT && token->type != END_OF_RE
2129 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2130 {
2131 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2132 dfa->completed_bkref_map = initial_bkref_map;
2133 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2134 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2135 {
2136 if (tree != NULL)
2137 postorder (tree, free_tree, NULL);
2138 return NULL;
2139 }
2140 dfa->completed_bkref_map |= accumulated_bkref_map;
2141 }
2142 else
2143 branch = NULL;
2144 tree = create_tree (dfa, tree, branch, OP_ALT);
2145 if (__glibc_unlikely (tree == NULL))
2146 {
2147 *err = REG_ESPACE;
2148 return NULL;
2149 }
2150 }
2151 return tree;
2152 }
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163 static bin_tree_t *
2164 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2165 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2166 {
2167 bin_tree_t *tree, *expr;
2168 re_dfa_t *dfa = preg->buffer;
2169 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2170 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2171 return NULL;
2172
2173 while (token->type != OP_ALT && token->type != END_OF_RE
2174 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2175 {
2176 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2177 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2178 {
2179 if (tree != NULL)
2180 postorder (tree, free_tree, NULL);
2181 return NULL;
2182 }
2183 if (tree != NULL && expr != NULL)
2184 {
2185 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2186 if (newtree == NULL)
2187 {
2188 postorder (expr, free_tree, NULL);
2189 postorder (tree, free_tree, NULL);
2190 *err = REG_ESPACE;
2191 return NULL;
2192 }
2193 tree = newtree;
2194 }
2195 else if (tree == NULL)
2196 tree = expr;
2197
2198 }
2199 return tree;
2200 }
2201
2202
2203
2204
2205
2206
2207
2208 static bin_tree_t *
2209 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2210 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2211 {
2212 re_dfa_t *dfa = preg->buffer;
2213 bin_tree_t *tree;
2214 switch (token->type)
2215 {
2216 case CHARACTER:
2217 tree = create_token_tree (dfa, NULL, NULL, token);
2218 if (__glibc_unlikely (tree == NULL))
2219 {
2220 *err = REG_ESPACE;
2221 return NULL;
2222 }
2223 if (dfa->mb_cur_max > 1)
2224 {
2225 while (!re_string_eoi (regexp)
2226 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2227 {
2228 bin_tree_t *mbc_remain;
2229 fetch_token (token, regexp, syntax);
2230 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2231 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2232 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2233 {
2234 *err = REG_ESPACE;
2235 return NULL;
2236 }
2237 }
2238 }
2239 break;
2240
2241 case OP_OPEN_SUBEXP:
2242 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2243 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2244 return NULL;
2245 break;
2246
2247 case OP_OPEN_BRACKET:
2248 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2249 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2250 return NULL;
2251 break;
2252
2253 case OP_BACK_REF:
2254 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2255 {
2256 *err = REG_ESUBREG;
2257 return NULL;
2258 }
2259 dfa->used_bkref_map |= 1 << token->opr.idx;
2260 tree = create_token_tree (dfa, NULL, NULL, token);
2261 if (__glibc_unlikely (tree == NULL))
2262 {
2263 *err = REG_ESPACE;
2264 return NULL;
2265 }
2266 ++dfa->nbackref;
2267 dfa->has_mb_node = 1;
2268 break;
2269
2270 case OP_OPEN_DUP_NUM:
2271 if (syntax & RE_CONTEXT_INVALID_DUP)
2272 {
2273 *err = REG_BADRPT;
2274 return NULL;
2275 }
2276 FALLTHROUGH;
2277 case OP_DUP_ASTERISK:
2278 case OP_DUP_PLUS:
2279 case OP_DUP_QUESTION:
2280 if (syntax & RE_CONTEXT_INVALID_OPS)
2281 {
2282 *err = REG_BADRPT;
2283 return NULL;
2284 }
2285 else if (syntax & RE_CONTEXT_INDEP_OPS)
2286 {
2287 fetch_token (token, regexp, syntax);
2288 return parse_expression (regexp, preg, token, syntax, nest, err);
2289 }
2290 FALLTHROUGH;
2291 case OP_CLOSE_SUBEXP:
2292 if ((token->type == OP_CLOSE_SUBEXP)
2293 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2294 {
2295 *err = REG_ERPAREN;
2296 return NULL;
2297 }
2298 FALLTHROUGH;
2299 case OP_CLOSE_DUP_NUM:
2300
2301
2302
2303 token->type = CHARACTER;
2304
2305
2306 tree = create_token_tree (dfa, NULL, NULL, token);
2307 if (__glibc_unlikely (tree == NULL))
2308 {
2309 *err = REG_ESPACE;
2310 return NULL;
2311 }
2312 break;
2313
2314 case ANCHOR:
2315 if ((token->opr.ctx_type
2316 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2317 && dfa->word_ops_used == 0)
2318 init_word_char (dfa);
2319 if (token->opr.ctx_type == WORD_DELIM
2320 || token->opr.ctx_type == NOT_WORD_DELIM)
2321 {
2322 bin_tree_t *tree_first, *tree_last;
2323 if (token->opr.ctx_type == WORD_DELIM)
2324 {
2325 token->opr.ctx_type = WORD_FIRST;
2326 tree_first = create_token_tree (dfa, NULL, NULL, token);
2327 token->opr.ctx_type = WORD_LAST;
2328 }
2329 else
2330 {
2331 token->opr.ctx_type = INSIDE_WORD;
2332 tree_first = create_token_tree (dfa, NULL, NULL, token);
2333 token->opr.ctx_type = INSIDE_NOTWORD;
2334 }
2335 tree_last = create_token_tree (dfa, NULL, NULL, token);
2336 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2337 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2338 || tree == NULL))
2339 {
2340 *err = REG_ESPACE;
2341 return NULL;
2342 }
2343 }
2344 else
2345 {
2346 tree = create_token_tree (dfa, NULL, NULL, token);
2347 if (__glibc_unlikely (tree == NULL))
2348 {
2349 *err = REG_ESPACE;
2350 return NULL;
2351 }
2352 }
2353
2354
2355
2356
2357 fetch_token (token, regexp, syntax);
2358 return tree;
2359
2360 case OP_PERIOD:
2361 tree = create_token_tree (dfa, NULL, NULL, token);
2362 if (__glibc_unlikely (tree == NULL))
2363 {
2364 *err = REG_ESPACE;
2365 return NULL;
2366 }
2367 if (dfa->mb_cur_max > 1)
2368 dfa->has_mb_node = 1;
2369 break;
2370
2371 case OP_WORD:
2372 case OP_NOTWORD:
2373 tree = build_charclass_op (dfa, regexp->trans,
2374 "alnum",
2375 "_",
2376 token->type == OP_NOTWORD, err);
2377 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2378 return NULL;
2379 break;
2380
2381 case OP_SPACE:
2382 case OP_NOTSPACE:
2383 tree = build_charclass_op (dfa, regexp->trans,
2384 "space",
2385 "",
2386 token->type == OP_NOTSPACE, err);
2387 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2388 return NULL;
2389 break;
2390
2391 case OP_ALT:
2392 case END_OF_RE:
2393 return NULL;
2394
2395 case BACK_SLASH:
2396 *err = REG_EESCAPE;
2397 return NULL;
2398
2399 default:
2400
2401 DEBUG_ASSERT (false);
2402 return NULL;
2403 }
2404 fetch_token (token, regexp, syntax);
2405
2406 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2407 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2408 {
2409 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2410 syntax, err);
2411 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2412 {
2413 if (tree != NULL)
2414 postorder (tree, free_tree, NULL);
2415 return NULL;
2416 }
2417 tree = dup_tree;
2418
2419 if ((syntax & RE_CONTEXT_INVALID_DUP)
2420 && (token->type == OP_DUP_ASTERISK
2421 || token->type == OP_OPEN_DUP_NUM))
2422 {
2423 if (tree != NULL)
2424 postorder (tree, free_tree, NULL);
2425 *err = REG_BADRPT;
2426 return NULL;
2427 }
2428 }
2429
2430 return tree;
2431 }
2432
2433
2434
2435
2436
2437
2438
2439
2440 static bin_tree_t *
2441 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2442 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2443 {
2444 re_dfa_t *dfa = preg->buffer;
2445 bin_tree_t *tree;
2446 size_t cur_nsub;
2447 cur_nsub = preg->re_nsub++;
2448
2449 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2450
2451
2452 if (token->type == OP_CLOSE_SUBEXP)
2453 tree = NULL;
2454 else
2455 {
2456 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2457 if (__glibc_unlikely (*err == REG_NOERROR
2458 && token->type != OP_CLOSE_SUBEXP))
2459 {
2460 if (tree != NULL)
2461 postorder (tree, free_tree, NULL);
2462 *err = REG_EPAREN;
2463 }
2464 if (__glibc_unlikely (*err != REG_NOERROR))
2465 return NULL;
2466 }
2467
2468 if (cur_nsub <= '9' - '1')
2469 dfa->completed_bkref_map |= 1 << cur_nsub;
2470
2471 tree = create_tree (dfa, tree, NULL, SUBEXP);
2472 if (__glibc_unlikely (tree == NULL))
2473 {
2474 *err = REG_ESPACE;
2475 return NULL;
2476 }
2477 tree->token.opr.idx = cur_nsub;
2478 return tree;
2479 }
2480
2481
2482
2483 static bin_tree_t *
2484 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2485 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2486 {
2487 bin_tree_t *tree = NULL, *old_tree = NULL;
2488 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2489 re_token_t start_token = *token;
2490
2491 if (token->type == OP_OPEN_DUP_NUM)
2492 {
2493 end = 0;
2494 start = fetch_number (regexp, token, syntax);
2495 if (start == -1)
2496 {
2497 if (token->type == CHARACTER && token->opr.c == ',')
2498 start = 0;
2499 else
2500 {
2501 *err = REG_BADBR;
2502 return NULL;
2503 }
2504 }
2505 if (__glibc_likely (start != -2))
2506 {
2507
2508 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2509 : ((token->type == CHARACTER && token->opr.c == ',')
2510 ? fetch_number (regexp, token, syntax) : -2));
2511 }
2512 if (__glibc_unlikely (start == -2 || end == -2))
2513 {
2514
2515 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2516 {
2517 if (token->type == END_OF_RE)
2518 *err = REG_EBRACE;
2519 else
2520 *err = REG_BADBR;
2521
2522 return NULL;
2523 }
2524
2525
2526 re_string_set_index (regexp, start_idx);
2527 *token = start_token;
2528 token->type = CHARACTER;
2529
2530
2531 return elem;
2532 }
2533
2534 if (__glibc_unlikely ((end != -1 && start > end)
2535 || token->type != OP_CLOSE_DUP_NUM))
2536 {
2537
2538 *err = REG_BADBR;
2539 return NULL;
2540 }
2541
2542 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2543 {
2544 *err = REG_ESIZE;
2545 return NULL;
2546 }
2547 }
2548 else
2549 {
2550 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2551 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2552 }
2553
2554 fetch_token (token, regexp, syntax);
2555
2556 if (__glibc_unlikely (elem == NULL))
2557 return NULL;
2558 if (__glibc_unlikely (start == 0 && end == 0))
2559 {
2560 postorder (elem, free_tree, NULL);
2561 return NULL;
2562 }
2563
2564
2565 if (__glibc_unlikely (start > 0))
2566 {
2567 tree = elem;
2568 for (i = 2; i <= start; ++i)
2569 {
2570 elem = duplicate_tree (elem, dfa);
2571 tree = create_tree (dfa, tree, elem, CONCAT);
2572 if (__glibc_unlikely (elem == NULL || tree == NULL))
2573 goto parse_dup_op_espace;
2574 }
2575
2576 if (start == end)
2577 return tree;
2578
2579
2580 elem = duplicate_tree (elem, dfa);
2581 if (__glibc_unlikely (elem == NULL))
2582 goto parse_dup_op_espace;
2583 old_tree = tree;
2584 }
2585 else
2586 old_tree = NULL;
2587
2588 if (elem->token.type == SUBEXP)
2589 {
2590 uintptr_t subidx = elem->token.opr.idx;
2591 postorder (elem, mark_opt_subexp, (void *) subidx);
2592 }
2593
2594 tree = create_tree (dfa, elem, NULL,
2595 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2596 if (__glibc_unlikely (tree == NULL))
2597 goto parse_dup_op_espace;
2598
2599
2600
2601
2602 if (TYPE_SIGNED (Idx) || end != -1)
2603 for (i = start + 2; i <= end; ++i)
2604 {
2605 elem = duplicate_tree (elem, dfa);
2606 tree = create_tree (dfa, tree, elem, CONCAT);
2607 if (__glibc_unlikely (elem == NULL || tree == NULL))
2608 goto parse_dup_op_espace;
2609
2610 tree = create_tree (dfa, tree, NULL, OP_ALT);
2611 if (__glibc_unlikely (tree == NULL))
2612 goto parse_dup_op_espace;
2613 }
2614
2615 if (old_tree)
2616 tree = create_tree (dfa, old_tree, tree, CONCAT);
2617
2618 return tree;
2619
2620 parse_dup_op_espace:
2621 *err = REG_ESPACE;
2622 return NULL;
2623 }
2624
2625
2626
2627 #define BRACKET_NAME_BUF_SIZE 32
2628
2629 #ifndef _LIBC
2630
2631
2632
2633
2634 static wint_t
2635 parse_byte (unsigned char b, re_dfa_t const *dfa)
2636 {
2637 return dfa->mb_cur_max > 1 ? __btowc (b) : b;
2638 }
2639
2640
2641
2642
2643
2644
2645
2646
2647 static reg_errcode_t
2648 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2649 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2650 re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules,
2651 const unsigned char *collseqmb, const char *collseqwc,
2652 int_fast32_t table_size, const void *symb_table,
2653 const unsigned char *extra)
2654 {
2655
2656 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2657 || start_elem->type == CHAR_CLASS
2658 || end_elem->type == EQUIV_CLASS
2659 || end_elem->type == CHAR_CLASS))
2660 return REG_ERANGE;
2661
2662
2663
2664 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2665 && strlen ((char *) start_elem->opr.name) > 1)
2666 || (end_elem->type == COLL_SYM
2667 && strlen ((char *) end_elem->opr.name) > 1)))
2668 return REG_ECOLLATE;
2669
2670 unsigned int
2671 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2672 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2673 : 0)),
2674 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2675 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2676 : 0));
2677 wint_t
2678 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2679 ? parse_byte (start_ch, dfa) : start_elem->opr.wch),
2680 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2681 ? parse_byte (end_ch, dfa) : end_elem->opr.wch);
2682
2683 if (start_wc == WEOF || end_wc == WEOF)
2684 return REG_ECOLLATE;
2685 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2686 && start_wc > end_wc))
2687 return REG_ERANGE;
2688
2689
2690
2691
2692
2693
2694 if (dfa->mb_cur_max > 1)
2695 {
2696
2697 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2698 {
2699
2700 wchar_t *new_array_start, *new_array_end;
2701 Idx new_nranges;
2702
2703
2704 new_nranges = 2 * mbcset->nranges + 1;
2705
2706
2707 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2708 new_nranges);
2709 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2710 new_nranges);
2711
2712 if (__glibc_unlikely (new_array_start == NULL
2713 || new_array_end == NULL))
2714 {
2715 re_free (new_array_start);
2716 re_free (new_array_end);
2717 return REG_ESPACE;
2718 }
2719
2720 mbcset->range_starts = new_array_start;
2721 mbcset->range_ends = new_array_end;
2722 *range_alloc = new_nranges;
2723 }
2724
2725 mbcset->range_starts[mbcset->nranges] = start_wc;
2726 mbcset->range_ends[mbcset->nranges++] = end_wc;
2727 }
2728
2729
2730 for (wchar_t wc = 0; wc < SBC_MAX; ++wc)
2731 {
2732 if (start_wc <= wc && wc <= end_wc)
2733 bitset_set (sbcset, wc);
2734 }
2735
2736 return REG_NOERROR;
2737 }
2738 #endif
2739
2740 #ifndef _LIBC
2741
2742
2743
2744
2745
2746
2747 static reg_errcode_t
2748 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2749 Idx *coll_sym_alloc, const unsigned char *name,
2750 uint_fast32_t nrules, int_fast32_t table_size,
2751 const void *symb_table, const unsigned char *extra)
2752 {
2753 size_t name_len = strlen ((const char *) name);
2754 if (__glibc_unlikely (name_len != 1))
2755 return REG_ECOLLATE;
2756 else
2757 {
2758 bitset_set (sbcset, name[0]);
2759 return REG_NOERROR;
2760 }
2761 }
2762 #endif
2763
2764 #ifdef _LIBC
2765
2766
2767
2768
2769
2770 static __always_inline int32_t
2771 seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
2772 const int32_t *symb_table,
2773 int_fast32_t table_size,
2774 const unsigned char *extra)
2775 {
2776 int_fast32_t elem;
2777
2778 for (elem = 0; elem < table_size; elem++)
2779 if (symb_table[2 * elem] != 0)
2780 {
2781 int32_t idx = symb_table[2 * elem + 1];
2782
2783 idx += 1 + extra[idx];
2784 if (
2785 name_len == extra[idx]
2786
2787 && memcmp (name, &extra[idx + 1], name_len) == 0)
2788
2789 return elem;
2790 }
2791 return -1;
2792 }
2793
2794
2795
2796
2797
2798 static __always_inline unsigned int
2799 lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2800 const unsigned char *collseqmb,
2801 const char *collseqwc,
2802 int_fast32_t table_size,
2803 const int32_t *symb_table,
2804 const unsigned char *extra)
2805 {
2806 if (br_elem->type == SB_CHAR)
2807 {
2808
2809 if (nrules == 0)
2810 return collseqmb[br_elem->opr.ch];
2811 else
2812 {
2813 wint_t wc = __btowc (br_elem->opr.ch);
2814 return __collseq_table_lookup (collseqwc, wc);
2815 }
2816 }
2817 else if (br_elem->type == MB_CHAR)
2818 {
2819 if (nrules != 0)
2820 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2821 }
2822 else if (br_elem->type == COLL_SYM)
2823 {
2824 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2825 if (nrules != 0)
2826 {
2827 int32_t elem, idx;
2828 elem = seek_collating_symbol_entry (br_elem->opr.name,
2829 sym_name_len,
2830 symb_table, table_size,
2831 extra);
2832 if (elem != -1)
2833 {
2834
2835 idx = symb_table[2 * elem + 1];
2836
2837 idx += 1 + extra[idx];
2838
2839 idx += 1 + extra[idx];
2840
2841 idx = (idx + 3) & ~3;
2842
2843 idx += sizeof (unsigned int);
2844
2845 idx += sizeof (unsigned int) *
2846 (1 + *(unsigned int *) (extra + idx));
2847
2848 return *(unsigned int *) (extra + idx);
2849 }
2850 else if (sym_name_len == 1)
2851 {
2852
2853
2854 return collseqmb[br_elem->opr.name[0]];
2855 }
2856 }
2857 else if (sym_name_len == 1)
2858 return collseqmb[br_elem->opr.name[0]];
2859 }
2860 return UINT_MAX;
2861 }
2862
2863
2864
2865
2866
2867
2868
2869
2870 static __always_inline reg_errcode_t
2871 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2872 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2873 re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2874 const unsigned char *collseqmb, const char *collseqwc,
2875 int_fast32_t table_size, const int32_t *symb_table,
2876 const unsigned char *extra)
2877 {
2878 unsigned int ch;
2879 uint32_t start_collseq;
2880 uint32_t end_collseq;
2881
2882
2883
2884 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2885 || start_elem->type == CHAR_CLASS
2886 || end_elem->type == EQUIV_CLASS
2887 || end_elem->type == CHAR_CLASS))
2888 return REG_ERANGE;
2889
2890
2891 start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2892 table_size, symb_table, extra);
2893 end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2894 table_size, symb_table, extra);
2895
2896 if (__glibc_unlikely (start_collseq == UINT_MAX
2897 || end_collseq == UINT_MAX))
2898 return REG_ECOLLATE;
2899 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2900 && start_collseq > end_collseq))
2901 return REG_ERANGE;
2902
2903
2904
2905
2906
2907 if (nrules > 0 || dfa->mb_cur_max > 1)
2908 {
2909
2910 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2911 {
2912
2913 uint32_t *new_array_start;
2914 uint32_t *new_array_end;
2915 int new_nranges;
2916
2917
2918 new_nranges = 2 * mbcset->nranges + 1;
2919 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2920 new_nranges);
2921 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2922 new_nranges);
2923
2924 if (__glibc_unlikely (new_array_start == NULL
2925 || new_array_end == NULL))
2926 return REG_ESPACE;
2927
2928 mbcset->range_starts = new_array_start;
2929 mbcset->range_ends = new_array_end;
2930 *range_alloc = new_nranges;
2931 }
2932
2933 mbcset->range_starts[mbcset->nranges] = start_collseq;
2934 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2935 }
2936
2937
2938 for (ch = 0; ch < SBC_MAX; ch++)
2939 {
2940 uint32_t ch_collseq;
2941
2942 if (nrules == 0)
2943 ch_collseq = collseqmb[ch];
2944 else
2945 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2946 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2947 bitset_set (sbcset, ch);
2948 }
2949 return REG_NOERROR;
2950 }
2951
2952
2953
2954
2955
2956
2957
2958 static __always_inline reg_errcode_t
2959 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2960 Idx *coll_sym_alloc, const unsigned char *name,
2961 uint_fast32_t nrules, int_fast32_t table_size,
2962 const int32_t *symb_table, const unsigned char *extra)
2963 {
2964 int32_t elem, idx;
2965 size_t name_len = strlen ((const char *) name);
2966 if (nrules != 0)
2967 {
2968 elem = seek_collating_symbol_entry (name, name_len, symb_table,
2969 table_size, extra);
2970 if (elem != -1)
2971 {
2972
2973 idx = symb_table[2 * elem + 1];
2974
2975 idx += 1 + extra[idx];
2976 }
2977 else if (name_len == 1)
2978 {
2979
2980
2981 bitset_set (sbcset, name[0]);
2982 return REG_NOERROR;
2983 }
2984 else
2985 return REG_ECOLLATE;
2986
2987
2988
2989 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
2990 {
2991
2992
2993 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2994
2995
2996 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2997 new_coll_sym_alloc);
2998 if (__glibc_unlikely (new_coll_syms == NULL))
2999 return REG_ESPACE;
3000 mbcset->coll_syms = new_coll_syms;
3001 *coll_sym_alloc = new_coll_sym_alloc;
3002 }
3003 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3004 return REG_NOERROR;
3005 }
3006 else
3007 {
3008 if (__glibc_unlikely (name_len != 1))
3009 return REG_ECOLLATE;
3010 else
3011 {
3012 bitset_set (sbcset, name[0]);
3013 return REG_NOERROR;
3014 }
3015 }
3016 }
3017 #endif
3018
3019
3020
3021
3022 static bin_tree_t *
3023 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3024 reg_syntax_t syntax, reg_errcode_t *err)
3025 {
3026 const unsigned char *collseqmb = NULL;
3027 const char *collseqwc = NULL;
3028 uint_fast32_t nrules = 0;
3029 int_fast32_t table_size = 0;
3030 const void *symb_table = NULL;
3031 const unsigned char *extra = NULL;
3032
3033 re_token_t br_token;
3034 re_bitset_ptr_t sbcset;
3035 re_charset_t *mbcset;
3036 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3037 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3038 bool non_match = false;
3039 bin_tree_t *work_tree;
3040 int token_len;
3041 bool first_round = true;
3042 #ifdef _LIBC
3043 collseqmb = (const unsigned char *)
3044 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3045 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3046 if (nrules)
3047 {
3048
3049
3050
3051 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3052 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3053 symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB);
3054 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3055 _NL_COLLATE_SYMB_EXTRAMB);
3056 }
3057 #endif
3058 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3059 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3060 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3061 {
3062 re_free (sbcset);
3063 re_free (mbcset);
3064 *err = REG_ESPACE;
3065 return NULL;
3066 }
3067
3068 token_len = peek_token_bracket (token, regexp, syntax);
3069 if (__glibc_unlikely (token->type == END_OF_RE))
3070 {
3071 *err = REG_BADPAT;
3072 goto parse_bracket_exp_free_return;
3073 }
3074 if (token->type == OP_NON_MATCH_LIST)
3075 {
3076 mbcset->non_match = 1;
3077 non_match = true;
3078 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3079 bitset_set (sbcset, '\n');
3080 re_string_skip_bytes (regexp, token_len);
3081 token_len = peek_token_bracket (token, regexp, syntax);
3082 if (__glibc_unlikely (token->type == END_OF_RE))
3083 {
3084 *err = REG_BADPAT;
3085 goto parse_bracket_exp_free_return;
3086 }
3087 }
3088
3089
3090 if (token->type == OP_CLOSE_BRACKET)
3091 token->type = CHARACTER;
3092
3093 while (1)
3094 {
3095 bracket_elem_t start_elem, end_elem;
3096 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3097 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3098 reg_errcode_t ret;
3099 int token_len2 = 0;
3100 bool is_range_exp = false;
3101 re_token_t token2;
3102
3103 start_elem.opr.name = start_name_buf;
3104 start_elem.type = COLL_SYM;
3105 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3106 syntax, first_round);
3107 if (__glibc_unlikely (ret != REG_NOERROR))
3108 {
3109 *err = ret;
3110 goto parse_bracket_exp_free_return;
3111 }
3112 first_round = false;
3113
3114
3115 token_len = peek_token_bracket (token, regexp, syntax);
3116
3117
3118 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3119 {
3120 if (__glibc_unlikely (token->type == END_OF_RE))
3121 {
3122 *err = REG_EBRACK;
3123 goto parse_bracket_exp_free_return;
3124 }
3125 if (token->type == OP_CHARSET_RANGE)
3126 {
3127 re_string_skip_bytes (regexp, token_len);
3128 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3129 if (__glibc_unlikely (token2.type == END_OF_RE))
3130 {
3131 *err = REG_EBRACK;
3132 goto parse_bracket_exp_free_return;
3133 }
3134 if (token2.type == OP_CLOSE_BRACKET)
3135 {
3136
3137 re_string_skip_bytes (regexp, -token_len);
3138 token->type = CHARACTER;
3139 }
3140 else
3141 is_range_exp = true;
3142 }
3143 }
3144
3145 if (is_range_exp == true)
3146 {
3147 end_elem.opr.name = end_name_buf;
3148 end_elem.type = COLL_SYM;
3149 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3150 dfa, syntax, true);
3151 if (__glibc_unlikely (ret != REG_NOERROR))
3152 {
3153 *err = ret;
3154 goto parse_bracket_exp_free_return;
3155 }
3156
3157 token_len = peek_token_bracket (token, regexp, syntax);
3158
3159 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3160 &start_elem, &end_elem,
3161 dfa, syntax, nrules, collseqmb, collseqwc,
3162 table_size, symb_table, extra);
3163 if (__glibc_unlikely (*err != REG_NOERROR))
3164 goto parse_bracket_exp_free_return;
3165 }
3166 else
3167 {
3168 switch (start_elem.type)
3169 {
3170 case SB_CHAR:
3171 bitset_set (sbcset, start_elem.opr.ch);
3172 break;
3173 case MB_CHAR:
3174
3175 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3176 {
3177 wchar_t *new_mbchars;
3178
3179
3180 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3181
3182 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3183 mbchar_alloc);
3184 if (__glibc_unlikely (new_mbchars == NULL))
3185 goto parse_bracket_exp_espace;
3186 mbcset->mbchars = new_mbchars;
3187 }
3188 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3189 break;
3190 case EQUIV_CLASS:
3191 *err = build_equiv_class (sbcset,
3192 mbcset, &equiv_class_alloc,
3193 start_elem.opr.name);
3194 if (__glibc_unlikely (*err != REG_NOERROR))
3195 goto parse_bracket_exp_free_return;
3196 break;
3197 case COLL_SYM:
3198 *err = build_collating_symbol (sbcset,
3199 mbcset, &coll_sym_alloc,
3200 start_elem.opr.name,
3201 nrules, table_size, symb_table, extra);
3202 if (__glibc_unlikely (*err != REG_NOERROR))
3203 goto parse_bracket_exp_free_return;
3204 break;
3205 case CHAR_CLASS:
3206 *err = build_charclass (regexp->trans, sbcset,
3207 mbcset, &char_class_alloc,
3208 (const char *) start_elem.opr.name,
3209 syntax);
3210 if (__glibc_unlikely (*err != REG_NOERROR))
3211 goto parse_bracket_exp_free_return;
3212 break;
3213 default:
3214 DEBUG_ASSERT (false);
3215 break;
3216 }
3217 }
3218 if (__glibc_unlikely (token->type == END_OF_RE))
3219 {
3220 *err = REG_EBRACK;
3221 goto parse_bracket_exp_free_return;
3222 }
3223 if (token->type == OP_CLOSE_BRACKET)
3224 break;
3225 }
3226
3227 re_string_skip_bytes (regexp, token_len);
3228
3229
3230 if (non_match)
3231 bitset_not (sbcset);
3232
3233
3234 if (dfa->mb_cur_max > 1)
3235 bitset_mask (sbcset, dfa->sb_char);
3236
3237 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3238 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3239 || mbcset->non_match)))
3240 {
3241 bin_tree_t *mbc_tree;
3242 int sbc_idx;
3243
3244 dfa->has_mb_node = 1;
3245 br_token.type = COMPLEX_BRACKET;
3246 br_token.opr.mbcset = mbcset;
3247 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3248 if (__glibc_unlikely (mbc_tree == NULL))
3249 goto parse_bracket_exp_espace;
3250 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3251 if (sbcset[sbc_idx])
3252 break;
3253
3254
3255 if (sbc_idx < BITSET_WORDS)
3256 {
3257
3258 br_token.type = SIMPLE_BRACKET;
3259 br_token.opr.sbcset = sbcset;
3260 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3261 if (__glibc_unlikely (work_tree == NULL))
3262 goto parse_bracket_exp_espace;
3263
3264
3265 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3266 if (__glibc_unlikely (work_tree == NULL))
3267 goto parse_bracket_exp_espace;
3268 }
3269 else
3270 {
3271 re_free (sbcset);
3272 work_tree = mbc_tree;
3273 }
3274 }
3275 else
3276 {
3277 free_charset (mbcset);
3278
3279 br_token.type = SIMPLE_BRACKET;
3280 br_token.opr.sbcset = sbcset;
3281 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3282 if (__glibc_unlikely (work_tree == NULL))
3283 goto parse_bracket_exp_espace;
3284 }
3285 return work_tree;
3286
3287 parse_bracket_exp_espace:
3288 *err = REG_ESPACE;
3289 parse_bracket_exp_free_return:
3290 re_free (sbcset);
3291 free_charset (mbcset);
3292 return NULL;
3293 }
3294
3295
3296
3297 static reg_errcode_t
3298 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3299 re_token_t *token, int token_len, re_dfa_t *dfa,
3300 reg_syntax_t syntax, bool accept_hyphen)
3301 {
3302 int cur_char_size;
3303 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3304 if (cur_char_size > 1)
3305 {
3306 elem->type = MB_CHAR;
3307 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3308 re_string_skip_bytes (regexp, cur_char_size);
3309 return REG_NOERROR;
3310 }
3311 re_string_skip_bytes (regexp, token_len);
3312 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3313 || token->type == OP_OPEN_EQUIV_CLASS)
3314 return parse_bracket_symbol (elem, regexp, token);
3315 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3316 {
3317
3318
3319 re_token_t token2;
3320 (void) peek_token_bracket (&token2, regexp, syntax);
3321 if (token2.type != OP_CLOSE_BRACKET)
3322
3323
3324 return REG_ERANGE;
3325 }
3326 elem->type = SB_CHAR;
3327 elem->opr.ch = token->opr.c;
3328 return REG_NOERROR;
3329 }
3330
3331
3332
3333
3334
3335 static reg_errcode_t
3336 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3337 re_token_t *token)
3338 {
3339 unsigned char ch, delim = token->opr.c;
3340 int i = 0;
3341 if (re_string_eoi(regexp))
3342 return REG_EBRACK;
3343 for (;; ++i)
3344 {
3345 if (i >= BRACKET_NAME_BUF_SIZE)
3346 return REG_EBRACK;
3347 if (token->type == OP_OPEN_CHAR_CLASS)
3348 ch = re_string_fetch_byte_case (regexp);
3349 else
3350 ch = re_string_fetch_byte (regexp);
3351 if (re_string_eoi(regexp))
3352 return REG_EBRACK;
3353 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3354 break;
3355 elem->opr.name[i] = ch;
3356 }
3357 re_string_skip_bytes (regexp, 1);
3358 elem->opr.name[i] = '\0';
3359 switch (token->type)
3360 {
3361 case OP_OPEN_COLL_ELEM:
3362 elem->type = COLL_SYM;
3363 break;
3364 case OP_OPEN_EQUIV_CLASS:
3365 elem->type = EQUIV_CLASS;
3366 break;
3367 case OP_OPEN_CHAR_CLASS:
3368 elem->type = CHAR_CLASS;
3369 break;
3370 default:
3371 break;
3372 }
3373 return REG_NOERROR;
3374 }
3375
3376
3377
3378
3379
3380
3381
3382 static reg_errcode_t
3383 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3384 Idx *equiv_class_alloc, const unsigned char *name)
3385 {
3386 #ifdef _LIBC
3387 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3388 if (nrules != 0)
3389 {
3390 const int32_t *table, *indirect;
3391 const unsigned char *weights, *extra, *cp;
3392 unsigned char char_buf[2];
3393 int32_t idx1, idx2;
3394 unsigned int ch;
3395 size_t len;
3396
3397 cp = name;
3398 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3399 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3400 _NL_COLLATE_WEIGHTMB);
3401 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3402 _NL_COLLATE_EXTRAMB);
3403 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3404 _NL_COLLATE_INDIRECTMB);
3405 idx1 = findidx (table, indirect, extra, &cp, -1);
3406 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3407
3408 return REG_ECOLLATE;
3409
3410
3411 len = weights[idx1 & 0xffffff];
3412 for (ch = 0; ch < SBC_MAX; ++ch)
3413 {
3414 char_buf[0] = ch;
3415 cp = char_buf;
3416 idx2 = findidx (table, indirect, extra, &cp, 1);
3417
3418
3419
3420 if (idx2 == 0)
3421
3422 continue;
3423
3424
3425 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3426 && memcmp (weights + (idx1 & 0xffffff) + 1,
3427 weights + (idx2 & 0xffffff) + 1, len) == 0)
3428 bitset_set (sbcset, ch);
3429 }
3430
3431 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3432 {
3433
3434
3435 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3436
3437 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3438 int32_t,
3439 new_equiv_class_alloc);
3440 if (__glibc_unlikely (new_equiv_classes == NULL))
3441 return REG_ESPACE;
3442 mbcset->equiv_classes = new_equiv_classes;
3443 *equiv_class_alloc = new_equiv_class_alloc;
3444 }
3445 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3446 }
3447 else
3448 #endif
3449 {
3450 if (__glibc_unlikely (strlen ((const char *) name) != 1))
3451 return REG_ECOLLATE;
3452 bitset_set (sbcset, *name);
3453 }
3454 return REG_NOERROR;
3455 }
3456
3457
3458
3459
3460
3461
3462
3463 static reg_errcode_t
3464 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3465 re_charset_t *mbcset, Idx *char_class_alloc,
3466 const char *class_name, reg_syntax_t syntax)
3467 {
3468 int i;
3469 const char *name = class_name;
3470
3471
3472
3473 if ((syntax & RE_ICASE)
3474 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3475 name = "alpha";
3476
3477
3478 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3479 {
3480
3481
3482 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3483
3484 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3485 new_char_class_alloc);
3486 if (__glibc_unlikely (new_char_classes == NULL))
3487 return REG_ESPACE;
3488 mbcset->char_classes = new_char_classes;
3489 *char_class_alloc = new_char_class_alloc;
3490 }
3491 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3492
3493 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3494 do { \
3495 if (__glibc_unlikely (trans != NULL)) \
3496 { \
3497 for (i = 0; i < SBC_MAX; ++i) \
3498 if (ctype_func (i)) \
3499 bitset_set (sbcset, trans[i]); \
3500 } \
3501 else \
3502 { \
3503 for (i = 0; i < SBC_MAX; ++i) \
3504 if (ctype_func (i)) \
3505 bitset_set (sbcset, i); \
3506 } \
3507 } while (0)
3508
3509 if (strcmp (name, "alnum") == 0)
3510 BUILD_CHARCLASS_LOOP (isalnum);
3511 else if (strcmp (name, "cntrl") == 0)
3512 BUILD_CHARCLASS_LOOP (iscntrl);
3513 else if (strcmp (name, "lower") == 0)
3514 BUILD_CHARCLASS_LOOP (islower);
3515 else if (strcmp (name, "space") == 0)
3516 BUILD_CHARCLASS_LOOP (isspace);
3517 else if (strcmp (name, "alpha") == 0)
3518 BUILD_CHARCLASS_LOOP (isalpha);
3519 else if (strcmp (name, "digit") == 0)
3520 BUILD_CHARCLASS_LOOP (isdigit);
3521 else if (strcmp (name, "print") == 0)
3522 BUILD_CHARCLASS_LOOP (isprint);
3523 else if (strcmp (name, "upper") == 0)
3524 BUILD_CHARCLASS_LOOP (isupper);
3525 else if (strcmp (name, "blank") == 0)
3526 BUILD_CHARCLASS_LOOP (isblank);
3527 else if (strcmp (name, "graph") == 0)
3528 BUILD_CHARCLASS_LOOP (isgraph);
3529 else if (strcmp (name, "punct") == 0)
3530 BUILD_CHARCLASS_LOOP (ispunct);
3531 else if (strcmp (name, "xdigit") == 0)
3532 BUILD_CHARCLASS_LOOP (isxdigit);
3533 else
3534 return REG_ECTYPE;
3535
3536 return REG_NOERROR;
3537 }
3538
3539 static bin_tree_t *
3540 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3541 const char *class_name,
3542 const char *extra, bool non_match,
3543 reg_errcode_t *err)
3544 {
3545 re_bitset_ptr_t sbcset;
3546 re_charset_t *mbcset;
3547 Idx alloc = 0;
3548 reg_errcode_t ret;
3549 bin_tree_t *tree;
3550
3551 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3552 if (__glibc_unlikely (sbcset == NULL))
3553 {
3554 *err = REG_ESPACE;
3555 return NULL;
3556 }
3557 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3558 if (__glibc_unlikely (mbcset == NULL))
3559 {
3560 re_free (sbcset);
3561 *err = REG_ESPACE;
3562 return NULL;
3563 }
3564 mbcset->non_match = non_match;
3565
3566
3567 ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0);
3568
3569 if (__glibc_unlikely (ret != REG_NOERROR))
3570 {
3571 re_free (sbcset);
3572 free_charset (mbcset);
3573 *err = ret;
3574 return NULL;
3575 }
3576
3577 for (; *extra; extra++)
3578 bitset_set (sbcset, *extra);
3579
3580
3581 if (non_match)
3582 bitset_not (sbcset);
3583
3584
3585 if (dfa->mb_cur_max > 1)
3586 bitset_mask (sbcset, dfa->sb_char);
3587
3588
3589 re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3590 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3591 if (__glibc_unlikely (tree == NULL))
3592 goto build_word_op_espace;
3593
3594 if (dfa->mb_cur_max > 1)
3595 {
3596 bin_tree_t *mbc_tree;
3597
3598 br_token.type = COMPLEX_BRACKET;
3599 br_token.opr.mbcset = mbcset;
3600 dfa->has_mb_node = 1;
3601 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3602 if (__glibc_unlikely (mbc_tree == NULL))
3603 goto build_word_op_espace;
3604
3605 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3606 if (__glibc_likely (mbc_tree != NULL))
3607 return tree;
3608 }
3609 else
3610 {
3611 free_charset (mbcset);
3612 return tree;
3613 }
3614
3615 build_word_op_espace:
3616 re_free (sbcset);
3617 free_charset (mbcset);
3618 *err = REG_ESPACE;
3619 return NULL;
3620 }
3621
3622
3623
3624
3625
3626
3627
3628 static Idx
3629 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3630 {
3631 Idx num = -1;
3632 unsigned char c;
3633 while (1)
3634 {
3635 fetch_token (token, input, syntax);
3636 c = token->opr.c;
3637 if (__glibc_unlikely (token->type == END_OF_RE))
3638 return -2;
3639 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3640 break;
3641 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3642 ? -2
3643 : num == -1
3644 ? c - '0'
3645 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3646 }
3647 return num;
3648 }
3649
3650 static void
3651 free_charset (re_charset_t *cset)
3652 {
3653 re_free (cset->mbchars);
3654 #ifdef _LIBC
3655 re_free (cset->coll_syms);
3656 re_free (cset->equiv_classes);
3657 #endif
3658 re_free (cset->range_starts);
3659 re_free (cset->range_ends);
3660 re_free (cset->char_classes);
3661 re_free (cset);
3662 }
3663
3664
3665
3666
3667
3668 static bin_tree_t *
3669 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3670 re_token_type_t type)
3671 {
3672 re_token_t t = { .type = type };
3673 return create_token_tree (dfa, left, right, &t);
3674 }
3675
3676 static bin_tree_t *
3677 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3678 const re_token_t *token)
3679 {
3680 bin_tree_t *tree;
3681 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3682 {
3683 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3684
3685 if (storage == NULL)
3686 return NULL;
3687 storage->next = dfa->str_tree_storage;
3688 dfa->str_tree_storage = storage;
3689 dfa->str_tree_storage_idx = 0;
3690 }
3691 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3692
3693 tree->parent = NULL;
3694 tree->left = left;
3695 tree->right = right;
3696 tree->token = *token;
3697 tree->token.duplicated = 0;
3698 tree->token.opt_subexp = 0;
3699 tree->first = NULL;
3700 tree->next = NULL;
3701 tree->node_idx = -1;
3702
3703 if (left != NULL)
3704 left->parent = tree;
3705 if (right != NULL)
3706 right->parent = tree;
3707 return tree;
3708 }
3709
3710
3711
3712
3713 static reg_errcode_t
3714 mark_opt_subexp (void *extra, bin_tree_t *node)
3715 {
3716 Idx idx = (uintptr_t) extra;
3717 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3718 node->token.opt_subexp = 1;
3719
3720 return REG_NOERROR;
3721 }
3722
3723
3724
3725 static void
3726 free_token (re_token_t *node)
3727 {
3728 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3729 free_charset (node->opr.mbcset);
3730 else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3731 re_free (node->opr.sbcset);
3732 }
3733
3734
3735
3736
3737 static reg_errcode_t
3738 free_tree (void *extra, bin_tree_t *node)
3739 {
3740 free_token (&node->token);
3741 return REG_NOERROR;
3742 }
3743
3744
3745
3746
3747
3748
3749
3750 static bin_tree_t *
3751 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3752 {
3753 const bin_tree_t *node;
3754 bin_tree_t *dup_root;
3755 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3756
3757 for (node = root; ; )
3758 {
3759
3760 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3761 if (*p_new == NULL)
3762 return NULL;
3763 (*p_new)->parent = dup_node;
3764 (*p_new)->token.duplicated = 1;
3765 dup_node = *p_new;
3766
3767
3768 if (node->left)
3769 {
3770 node = node->left;
3771 p_new = &dup_node->left;
3772 }
3773 else
3774 {
3775 const bin_tree_t *prev = NULL;
3776 while (node->right == prev || node->right == NULL)
3777 {
3778 prev = node;
3779 node = node->parent;
3780 dup_node = dup_node->parent;
3781 if (!node)
3782 return dup_root;
3783 }
3784 node = node->right;
3785 p_new = &dup_node->right;
3786 }
3787 }
3788 }