lib/regcomp.c

/* [<][>][^][v][top][bottom][index][help] */
This source file includes following definitions.
re_compile_pattern
weak_alias
weak_alias
weak_alias
re_compile_fastmap_iter
regcomp
libc_hidden_def
free_dfa_content
regfree
libc_hidden_def
libc_freeres_fn
re_compile_internal
init_dfa
init_word_char
free_workarea_compile
create_initial_state
optimize_utf8
analyze
postorder
preorder
optimize_subexps
lower_subexps
lower_subexp
calc_first
calc_next
link_nfa_nodes
duplicate_node_closure
search_duplicated_node
duplicate_node
calc_inveclosure
calc_eclosure
calc_eclosure_iter
fetch_token
peek_token
peek_token_bracket
parse
parse_reg_exp
parse_branch
parse_expression
parse_sub_exp
parse_dup_op
parse_byte
build_range_exp
build_collating_symbol
seek_collating_symbol_entry
lookup_collation_sequence_value
build_range_exp
build_collating_symbol
parse_bracket_exp
parse_bracket_element
parse_bracket_symbol
build_equiv_class
build_charclass
build_charclass_op
fetch_number
free_charset
create_tree
create_token_tree
mark_opt_subexp
free_token
free_tree
duplicate_tree
     1 /* Extended regular expression matching and search library.
     2    Copyright (C) 2002-2023 Free Software Foundation, Inc.
     3    This file is part of the GNU C Library.
     4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
     5 
     6    The GNU C Library is free software; you can redistribute it and/or
     7    modify it under the terms of the GNU Lesser General Public
     8    License as published by the Free Software Foundation; either
     9    version 2.1 of the License, or (at your option) any later version.
    10 
    11    The GNU C Library is distributed in the hope that it will be useful,
    12    but WITHOUT ANY WARRANTY; without even the implied warranty of
    13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    14    Lesser General Public License for more details.
    15 
    16    You should have received a copy of the GNU Lesser General Public
    17    License along with the GNU C Library; if not, see
    18    <https://www.gnu.org/licenses/>.  */
    19 
    20 #ifdef _LIBC
    21 # include <locale/weight.h>
    22 #endif
    23 
    24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
    25                                           size_t length, reg_syntax_t syntax);
    26 static void re_compile_fastmap_iter (regex_t *bufp,
    27                                      const re_dfastate_t *init_state,
    28                                      char *fastmap);
    29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
    30 static void free_charset (re_charset_t *cset);
    31 static void free_workarea_compile (regex_t *preg);
    32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
    33 static void optimize_utf8 (re_dfa_t *dfa);
    34 static reg_errcode_t analyze (regex_t *preg);
    35 static reg_errcode_t preorder (bin_tree_t *root,
    36                                reg_errcode_t (fn (void *, bin_tree_t *)),
    37                                void *extra);
    38 static reg_errcode_t postorder (bin_tree_t *root,
    39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
    40                                 void *extra);
    41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
    42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
    43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
    44                                  bin_tree_t *node);
    45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
    46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
    47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
    48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
    49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
    50                                    unsigned int constraint);
    51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
    52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
    53                                          Idx node, bool root);
    54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
    55 static Idx fetch_number (re_string_t *input, re_token_t *token,
    56                          reg_syntax_t syntax);
    57 static int peek_token (re_token_t *token, re_string_t *input,
    58                         reg_syntax_t syntax);
    59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
    60                           reg_syntax_t syntax, reg_errcode_t *err);
    61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
    62                                   re_token_t *token, reg_syntax_t syntax,
    63                                   Idx nest, reg_errcode_t *err);
    64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
    65                                  re_token_t *token, reg_syntax_t syntax,
    66                                  Idx nest, reg_errcode_t *err);
    67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
    68                                      re_token_t *token, reg_syntax_t syntax,
    69                                      Idx nest, reg_errcode_t *err);
    70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
    71                                   re_token_t *token, reg_syntax_t syntax,
    72                                   Idx nest, reg_errcode_t *err);
    73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
    74                                  re_dfa_t *dfa, re_token_t *token,
    75                                  reg_syntax_t syntax, reg_errcode_t *err);
    76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
    77                                       re_token_t *token, reg_syntax_t syntax,
    78                                       reg_errcode_t *err);
    79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
    80                                             re_string_t *regexp,
    81                                             re_token_t *token, int token_len,
    82                                             re_dfa_t *dfa,
    83                                             reg_syntax_t syntax,
    84                                             bool accept_hyphen);
    85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
    86                                           re_string_t *regexp,
    87                                           re_token_t *token);
    88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
    89                                         re_charset_t *mbcset,
    90                                         Idx *equiv_class_alloc,
    91                                         const unsigned char *name);
    92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
    93                                       bitset_t sbcset,
    94                                       re_charset_t *mbcset,
    95                                       Idx *char_class_alloc,
    96                                       const char *class_name,
    97                                       reg_syntax_t syntax);
    98 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
    99                                        RE_TRANSLATE_TYPE trans,
   100                                        const char *class_name,
   101                                        const char *extra,
   102                                        bool non_match, reg_errcode_t *err);
   103 static bin_tree_t *create_tree (re_dfa_t *dfa,
   104                                 bin_tree_t *left, bin_tree_t *right,
   105                                 re_token_type_t type);
   106 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
   107                                       bin_tree_t *left, bin_tree_t *right,
   108                                       const re_token_t *token);
   109 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
   110 static void free_token (re_token_t *node);
   111 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
   112 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
   113 
   114 /* This table gives an error message for each of the error codes listed
   115    in regex.h.  Obviously the order here has to be same as there.
   116    POSIX doesn't require that we do anything for REG_NOERROR,
   117    but why not be nice?  */
   118 
   119 static const char __re_error_msgid[] =
   120   {
   121 #define REG_NOERROR_IDX 0
   122     gettext_noop ("Success")    /* REG_NOERROR */
   123     "\0"
   124 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
   125     gettext_noop ("No match")   /* REG_NOMATCH */
   126     "\0"
   127 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
   128     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
   129     "\0"
   130 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
   131     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
   132     "\0"
   133 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
   134     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
   135     "\0"
   136 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
   137     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
   138     "\0"
   139 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
   140     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
   141     "\0"
   142 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
   143     gettext_noop ("Unmatched [, [^, [:, [., or [=")     /* REG_EBRACK */
   144     "\0"
   145 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
   146     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
   147     "\0"
   148 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
   149     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
   150     "\0"
   151 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
   152     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
   153     "\0"
   154 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
   155     gettext_noop ("Invalid range end")  /* REG_ERANGE */
   156     "\0"
   157 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
   158     gettext_noop ("Memory exhausted") /* REG_ESPACE */
   159     "\0"
   160 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
   161     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
   162     "\0"
   163 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
   164     gettext_noop ("Premature end of regular expression") /* REG_EEND */
   165     "\0"
   166 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
   167     gettext_noop ("Regular expression too big") /* REG_ESIZE */
   168     "\0"
   169 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
   170     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
   171   };
   172 
   173 static const size_t __re_error_msgid_idx[] =
   174   {
   175     REG_NOERROR_IDX,
   176     REG_NOMATCH_IDX,
   177     REG_BADPAT_IDX,
   178     REG_ECOLLATE_IDX,
   179     REG_ECTYPE_IDX,
   180     REG_EESCAPE_IDX,
   181     REG_ESUBREG_IDX,
   182     REG_EBRACK_IDX,
   183     REG_EPAREN_IDX,
   184     REG_EBRACE_IDX,
   185     REG_BADBR_IDX,
   186     REG_ERANGE_IDX,
   187     REG_ESPACE_IDX,
   188     REG_BADRPT_IDX,
   189     REG_EEND_IDX,
   190     REG_ESIZE_IDX,
   191     REG_ERPAREN_IDX
   192   };
   193 
   194 /* Entry points for GNU code.  */
   195 
   196 /* re_compile_pattern is the GNU regular expression compiler: it
   197    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
   198    Returns 0 if the pattern was valid, otherwise an error string.
   199 
   200    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
   201    are set in BUFP on entry.  */
   202 
   203 const char *
   204 re_compile_pattern (const char *pattern, size_t length,
   205                     struct re_pattern_buffer *bufp)
   206 {
   207   reg_errcode_t ret;
   208 
   209   /* And GNU code determines whether or not to get register information
   210      by passing null for the REGS argument to re_match, etc., not by
   211      setting no_sub, unless RE_NO_SUB is set.  */
   212   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
   213 
   214   /* Match anchors at newline.  */
   215   bufp->newline_anchor = 1;
   216 
   217   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
   218 
   219   if (!ret)
   220     return NULL;
   221   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
   222 }
   223 weak_alias (__re_compile_pattern, re_compile_pattern)
   224 
   225 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
   226    also be assigned to arbitrarily: each pattern buffer stores its own
   227    syntax, so it can be changed between regex compilations.  */
   228 /* This has no initializer because initialized variables in Emacs
   229    become read-only after dumping.  */
   230 reg_syntax_t re_syntax_options;
   231 
   232 
   233 /* Specify the precise syntax of regexps for compilation.  This provides
   234    for compatibility for various utilities which historically have
   235    different, incompatible syntaxes.
   236 
   237    The argument SYNTAX is a bit mask comprised of the various bits
   238    defined in regex.h.  We return the old syntax.  */
   239 
   240 reg_syntax_t
   241 re_set_syntax (reg_syntax_t syntax)
   242 {
   243   reg_syntax_t ret = re_syntax_options;
   244 
   245   re_syntax_options = syntax;
   246   return ret;
   247 }
   248 weak_alias (__re_set_syntax, re_set_syntax)
   249 
   250 int
   251 re_compile_fastmap (struct re_pattern_buffer *bufp)
   252 {
   253   re_dfa_t *dfa = bufp->buffer;
   254   char *fastmap = bufp->fastmap;
   255 
   256   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
   257   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
   258   if (dfa->init_state != dfa->init_state_word)
   259     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
   260   if (dfa->init_state != dfa->init_state_nl)
   261     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
   262   if (dfa->init_state != dfa->init_state_begbuf)
   263     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
   264   bufp->fastmap_accurate = 1;
   265   return 0;
   266 }
   267 weak_alias (__re_compile_fastmap, re_compile_fastmap)
   268 
   269 static __always_inline void
   270 re_set_fastmap (char *fastmap, bool icase, int ch)
   271 {
   272   fastmap[ch] = 1;
   273   if (icase)
   274     fastmap[tolower (ch)] = 1;
   275 }
   276 
   277 /* Helper function for re_compile_fastmap.
   278    Compile fastmap for the initial_state INIT_STATE.  */
   279 
   280 static void
   281 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
   282                          char *fastmap)
   283 {
   284   re_dfa_t *dfa = bufp->buffer;
   285   Idx node_cnt;
   286   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
   287   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
   288     {
   289       Idx node = init_state->nodes.elems[node_cnt];
   290       re_token_type_t type = dfa->nodes[node].type;
   291 
   292       if (type == CHARACTER)
   293         {
   294           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
   295           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
   296             {
   297               unsigned char buf[MB_LEN_MAX];
   298               unsigned char *p;
   299               wchar_t wc;
   300               mbstate_t state;
   301 
   302               p = buf;
   303               *p++ = dfa->nodes[node].opr.c;
   304               while (++node < dfa->nodes_len
   305                      && dfa->nodes[node].type == CHARACTER
   306                      && dfa->nodes[node].mb_partial)
   307                 *p++ = dfa->nodes[node].opr.c;
   308               memset (&state, '\0', sizeof (state));
   309               if (__mbrtowc (&wc, (const char *) buf, p - buf,
   310                              &state) == p - buf
   311                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
   312                       != (size_t) -1))
   313                 re_set_fastmap (fastmap, false, buf[0]);
   314             }
   315         }
   316       else if (type == SIMPLE_BRACKET)
   317         {
   318           int i, ch;
   319           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
   320             {
   321               int j;
   322               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
   323               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
   324                 if (w & ((bitset_word_t) 1 << j))
   325                   re_set_fastmap (fastmap, icase, ch);
   326             }
   327         }
   328       else if (type == COMPLEX_BRACKET)
   329         {
   330           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
   331           Idx i;
   332 
   333 #ifdef _LIBC
   334           /* See if we have to try all bytes which start multiple collation
   335              elements.
   336              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
   337                   collation element, and don't catch 'b' since 'b' is
   338                   the only collation element which starts from 'b' (and
   339                   it is caught by SIMPLE_BRACKET).  */
   340               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
   341                   && (cset->ncoll_syms || cset->nranges))
   342                 {
   343                   const int32_t *table = (const int32_t *)
   344                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
   345                   for (i = 0; i < SBC_MAX; ++i)
   346                     if (table[i] < 0)
   347                       re_set_fastmap (fastmap, icase, i);
   348                 }
   349 #endif /* _LIBC */
   350 
   351           /* See if we have to start the match at all multibyte characters,
   352              i.e. where we would not find an invalid sequence.  This only
   353              applies to multibyte character sets; for single byte character
   354              sets, the SIMPLE_BRACKET again suffices.  */
   355           if (dfa->mb_cur_max > 1
   356               && (cset->nchar_classes || cset->non_match || cset->nranges
   357 #ifdef _LIBC
   358                   || cset->nequiv_classes
   359 #endif /* _LIBC */
   360                  ))
   361             {
   362               unsigned char c = 0;
   363               do
   364                 {
   365                   mbstate_t mbs;
   366                   memset (&mbs, 0, sizeof (mbs));
   367                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
   368                     re_set_fastmap (fastmap, false, (int) c);
   369                 }
   370               while (++c != 0);
   371             }
   372 
   373           else
   374             {
   375               /* ... Else catch all bytes which can start the mbchars.  */
   376               for (i = 0; i < cset->nmbchars; ++i)
   377                 {
   378                   char buf[256];
   379                   mbstate_t state;
   380                   memset (&state, '\0', sizeof (state));
   381                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
   382                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
   383                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
   384                     {
   385                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
   386                           != (size_t) -1)
   387                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
   388                     }
   389                 }
   390             }
   391         }
   392       else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE)
   393         {
   394           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
   395           if (type == END_OF_RE)
   396             bufp->can_be_null = 1;
   397           return;
   398         }
   399     }
   400 }
   401 
   402 /* Entry point for POSIX code.  */
   403 /* regcomp takes a regular expression as a string and compiles it.
   404 
   405    PREG is a regex_t *.  We do not expect any fields to be initialized,
   406    since POSIX says we shouldn't.  Thus, we set
   407 
   408      'buffer' to the compiled pattern;
   409      'used' to the length of the compiled pattern;
   410      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
   411        REG_EXTENDED bit in CFLAGS is set; otherwise, to
   412        RE_SYNTAX_POSIX_BASIC;
   413      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
   414      'fastmap' to an allocated space for the fastmap;
   415      'fastmap_accurate' to zero;
   416      're_nsub' to the number of subexpressions in PATTERN.
   417 
   418    PATTERN is the address of the pattern string.
   419 
   420    CFLAGS is a series of bits which affect compilation.
   421 
   422      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
   423      use POSIX basic syntax.
   424 
   425      If REG_NEWLINE is set, then . and [^...] don't match newline.
   426      Also, regexec will try a match beginning after every newline.
   427 
   428      If REG_ICASE is set, then we considers upper- and lowercase
   429      versions of letters to be equivalent when matching.
   430 
   431      If REG_NOSUB is set, then when PREG is passed to regexec, that
   432      routine will report only success or failure, and nothing about the
   433      registers.
   434 
   435    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
   436    the return codes and their meanings.)  */
   437 
   438 int
   439 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
   440 {
   441   reg_errcode_t ret;
   442   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
   443                          : RE_SYNTAX_POSIX_BASIC);
   444 
   445   preg->buffer = NULL;
   446   preg->allocated = 0;
   447   preg->used = 0;
   448 
   449   /* Try to allocate space for the fastmap.  */
   450   preg->fastmap = re_malloc (char, SBC_MAX);
   451   if (__glibc_unlikely (preg->fastmap == NULL))
   452     return REG_ESPACE;
   453 
   454   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
   455 
   456   /* If REG_NEWLINE is set, newlines are treated differently.  */
   457   if (cflags & REG_NEWLINE)
   458     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
   459       syntax &= ~RE_DOT_NEWLINE;
   460       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
   461       /* It also changes the matching behavior.  */
   462       preg->newline_anchor = 1;
   463     }
   464   else
   465     preg->newline_anchor = 0;
   466   preg->no_sub = !!(cflags & REG_NOSUB);
   467   preg->translate = NULL;
   468 
   469   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
   470 
   471   /* POSIX doesn't distinguish between an unmatched open-group and an
   472      unmatched close-group: both are REG_EPAREN.  */
   473   if (ret == REG_ERPAREN)
   474     ret = REG_EPAREN;
   475 
   476   /* We have already checked preg->fastmap != NULL.  */
   477   if (__glibc_likely (ret == REG_NOERROR))
   478     /* Compute the fastmap now, since regexec cannot modify the pattern
   479        buffer.  This function never fails in this implementation.  */
   480     (void) re_compile_fastmap (preg);
   481   else
   482     {
   483       /* Some error occurred while compiling the expression.  */
   484       re_free (preg->fastmap);
   485       preg->fastmap = NULL;
   486     }
   487 
   488   return (int) ret;
   489 }
   490 libc_hidden_def (__regcomp)
   491 weak_alias (__regcomp, regcomp)
   492 
   493 /* Returns a message corresponding to an error code, ERRCODE, returned
   494    from either regcomp or regexec.   We don't use PREG here.  */
   495 
   496 size_t
   497 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
   498           size_t errbuf_size)
   499 {
   500   const char *msg;
   501   size_t msg_size;
   502   int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
   503 
   504   if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
   505     /* Only error codes returned by the rest of the code should be passed
   506        to this routine.  If we are given anything else, or if other regex
   507        code generates an invalid error code, then the program has a bug.
   508        Dump core so we can fix it.  */
   509     abort ();
   510 
   511   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
   512 
   513   msg_size = strlen (msg) + 1; /* Includes the null.  */
   514 
   515   if (__glibc_likely (errbuf_size != 0))
   516     {
   517       size_t cpy_size = msg_size;
   518       if (__glibc_unlikely (msg_size > errbuf_size))
   519         {
   520           cpy_size = errbuf_size - 1;
   521           errbuf[cpy_size] = '\0';
   522         }
   523       memcpy (errbuf, msg, cpy_size);
   524     }
   525 
   526   return msg_size;
   527 }
   528 weak_alias (__regerror, regerror)
   529 
   530 
   531 /* This static array is used for the map to single-byte characters when
   532    UTF-8 is used.  Otherwise we would allocate memory just to initialize
   533    it the same all the time.  UTF-8 is the preferred encoding so this is
   534    a worthwhile optimization.  */
   535 static const bitset_t utf8_sb_map =
   536 {
   537   /* Set the first 128 bits.  */
   538 #if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
   539   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
   540 #else
   541 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
   542 #  error "bitset_word_t is narrower than 32 bits"
   543 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
   544   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
   545 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
   546   BITSET_WORD_MAX, BITSET_WORD_MAX,
   547 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
   548   BITSET_WORD_MAX,
   549 # endif
   550   (BITSET_WORD_MAX
   551    >> (SBC_MAX % BITSET_WORD_BITS == 0
   552        ? 0
   553        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
   554 #endif
   555 };
   556 
   557 
   558 static void
   559 free_dfa_content (re_dfa_t *dfa)
   560 {
   561   Idx i, j;
   562 
   563   if (dfa->nodes)
   564     for (i = 0; i < dfa->nodes_len; ++i)
   565       free_token (dfa->nodes + i);
   566   re_free (dfa->nexts);
   567   for (i = 0; i < dfa->nodes_len; ++i)
   568     {
   569       if (dfa->eclosures != NULL)
   570         re_node_set_free (dfa->eclosures + i);
   571       if (dfa->inveclosures != NULL)
   572         re_node_set_free (dfa->inveclosures + i);
   573       if (dfa->edests != NULL)
   574         re_node_set_free (dfa->edests + i);
   575     }
   576   re_free (dfa->edests);
   577   re_free (dfa->eclosures);
   578   re_free (dfa->inveclosures);
   579   re_free (dfa->nodes);
   580 
   581   if (dfa->state_table)
   582     for (i = 0; i <= dfa->state_hash_mask; ++i)
   583       {
   584         struct re_state_table_entry *entry = dfa->state_table + i;
   585         for (j = 0; j < entry->num; ++j)
   586           {
   587             re_dfastate_t *state = entry->array[j];
   588             free_state (state);
   589           }
   590         re_free (entry->array);
   591       }
   592   re_free (dfa->state_table);
   593   if (dfa->sb_char != utf8_sb_map)
   594     re_free (dfa->sb_char);
   595   re_free (dfa->subexp_map);
   596 #ifdef DEBUG
   597   re_free (dfa->re_str);
   598 #endif
   599 
   600   re_free (dfa);
   601 }
   602 
   603 
   604 /* Free dynamically allocated space used by PREG.  */
   605 
   606 void
   607 regfree (regex_t *preg)
   608 {
   609   re_dfa_t *dfa = preg->buffer;
   610   if (__glibc_likely (dfa != NULL))
   611     {
   612       lock_fini (dfa->lock);
   613       free_dfa_content (dfa);
   614     }
   615   preg->buffer = NULL;
   616   preg->allocated = 0;
   617 
   618   re_free (preg->fastmap);
   619   preg->fastmap = NULL;
   620 
   621   re_free (preg->translate);
   622   preg->translate = NULL;
   623 }
   624 libc_hidden_def (__regfree)
   625 weak_alias (__regfree, regfree)
   626 
   627 /* Entry points compatible with 4.2 BSD regex library.  We don't define
   628    them unless specifically requested.  */
   629 
   630 #if defined _REGEX_RE_COMP || defined _LIBC
   631 
   632 /* BSD has one and only one pattern buffer.  */
   633 static struct re_pattern_buffer re_comp_buf;
   634 
   635 char *
   636 # ifdef _LIBC
   637 /* Make these definitions weak in libc, so POSIX programs can redefine
   638    these names if they don't use our functions, and still use
   639    regcomp/regexec above without link errors.  */
   640 weak_function
   641 # endif
   642 re_comp (const char *s)
   643 {
   644   reg_errcode_t ret;
   645   char *fastmap;
   646 
   647   if (!s)
   648     {
   649       if (!re_comp_buf.buffer)
   650         return gettext ("No previous regular expression");
   651       return 0;
   652     }
   653 
   654   if (re_comp_buf.buffer)
   655     {
   656       fastmap = re_comp_buf.fastmap;
   657       re_comp_buf.fastmap = NULL;
   658       __regfree (&re_comp_buf);
   659       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
   660       re_comp_buf.fastmap = fastmap;
   661     }
   662 
   663   if (re_comp_buf.fastmap == NULL)
   664     {
   665       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
   666       if (re_comp_buf.fastmap == NULL)
   667         return (char *) gettext (__re_error_msgid
   668                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
   669     }
   670 
   671   /* Since 're_exec' always passes NULL for the 'regs' argument, we
   672      don't need to initialize the pattern buffer fields which affect it.  */
   673 
   674   /* Match anchors at newlines.  */
   675   re_comp_buf.newline_anchor = 1;
   676 
   677   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
   678 
   679   if (!ret)
   680     return NULL;
   681 
   682   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
   683   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
   684 }
   685 
   686 #ifdef _LIBC
   687 libc_freeres_fn (free_mem)
   688 {
   689   __regfree (&re_comp_buf);
   690 }
   691 #endif
   692 
   693 #endif /* _REGEX_RE_COMP */
   694 
   695 /* Internal entry point.
   696    Compile the regular expression PATTERN, whose length is LENGTH.
   697    SYNTAX indicate regular expression's syntax.  */
   698 
   699 static reg_errcode_t
   700 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
   701                      reg_syntax_t syntax)
   702 {
   703   reg_errcode_t err = REG_NOERROR;
   704   re_dfa_t *dfa;
   705   re_string_t regexp;
   706 
   707   /* Initialize the pattern buffer.  */
   708   preg->fastmap_accurate = 0;
   709   preg->syntax = syntax;
   710   preg->not_bol = preg->not_eol = 0;
   711   preg->used = 0;
   712   preg->re_nsub = 0;
   713   preg->can_be_null = 0;
   714   preg->regs_allocated = REGS_UNALLOCATED;
   715 
   716   /* Initialize the dfa.  */
   717   dfa = preg->buffer;
   718   if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
   719     {
   720       /* If zero allocated, but buffer is non-null, try to realloc
   721          enough space.  This loses if buffer's address is bogus, but
   722          that is the user's responsibility.  If ->buffer is NULL this
   723          is a simple allocation.  */
   724       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
   725       if (dfa == NULL)
   726         return REG_ESPACE;
   727       preg->allocated = sizeof (re_dfa_t);
   728       preg->buffer = dfa;
   729     }
   730   preg->used = sizeof (re_dfa_t);
   731 
   732   err = init_dfa (dfa, length);
   733   if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
   734     err = REG_ESPACE;
   735   if (__glibc_unlikely (err != REG_NOERROR))
   736     {
   737       free_dfa_content (dfa);
   738       preg->buffer = NULL;
   739       preg->allocated = 0;
   740       return err;
   741     }
   742 #ifdef DEBUG
   743   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
   744   dfa->re_str = re_malloc (char, length + 1);
   745   strncpy (dfa->re_str, pattern, length + 1);
   746 #endif
   747 
   748   err = re_string_construct (&regexp, pattern, length, preg->translate,
   749                              (syntax & RE_ICASE) != 0, dfa);
   750   if (__glibc_unlikely (err != REG_NOERROR))
   751     {
   752     re_compile_internal_free_return:
   753       free_workarea_compile (preg);
   754       re_string_destruct (&regexp);
   755       lock_fini (dfa->lock);
   756       free_dfa_content (dfa);
   757       preg->buffer = NULL;
   758       preg->allocated = 0;
   759       return err;
   760     }
   761 
   762   /* Parse the regular expression, and build a structure tree.  */
   763   preg->re_nsub = 0;
   764   dfa->str_tree = parse (&regexp, preg, syntax, &err);
   765   if (__glibc_unlikely (dfa->str_tree == NULL))
   766     goto re_compile_internal_free_return;
   767 
   768   /* Analyze the tree and create the nfa.  */
   769   err = analyze (preg);
   770   if (__glibc_unlikely (err != REG_NOERROR))
   771     goto re_compile_internal_free_return;
   772 
   773   /* If possible, do searching in single byte encoding to speed things up.  */
   774   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
   775     optimize_utf8 (dfa);
   776 
   777   /* Then create the initial state of the dfa.  */
   778   err = create_initial_state (dfa);
   779 
   780   /* Release work areas.  */
   781   free_workarea_compile (preg);
   782   re_string_destruct (&regexp);
   783 
   784   if (__glibc_unlikely (err != REG_NOERROR))
   785     {
   786       lock_fini (dfa->lock);
   787       free_dfa_content (dfa);
   788       preg->buffer = NULL;
   789       preg->allocated = 0;
   790     }
   791 
   792   return err;
   793 }
   794 
   795 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
   796    as the initial length of some arrays.  */
   797 
   798 static reg_errcode_t
   799 init_dfa (re_dfa_t *dfa, size_t pat_len)
   800 {
   801   __re_size_t table_size;
   802 #ifndef _LIBC
   803   const char *codeset_name;
   804 #endif
   805   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
   806   size_t max_object_size =
   807     MAX (sizeof (struct re_state_table_entry),
   808          MAX (sizeof (re_token_t),
   809               MAX (sizeof (re_node_set),
   810                    MAX (sizeof (regmatch_t),
   811                         max_i18n_object_size))));
   812 
   813   memset (dfa, '\0', sizeof (re_dfa_t));
   814 
   815   /* Force allocation of str_tree_storage the first time.  */
   816   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
   817 
   818   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
   819      calculation below, and for similar doubling calculations
   820      elsewhere.  And it's <= rather than <, because some of the
   821      doubling calculations add 1 afterwards.  */
   822   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
   823                         <= pat_len))
   824     return REG_ESPACE;
   825 
   826   dfa->nodes_alloc = pat_len + 1;
   827   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
   828 
   829   /*  table_size = 2 ^ ceil(log pat_len) */
   830   for (table_size = 1; ; table_size <<= 1)
   831     if (table_size > pat_len)
   832       break;
   833 
   834   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
   835   dfa->state_hash_mask = table_size - 1;
   836 
   837   dfa->mb_cur_max = MB_CUR_MAX;
   838 #ifdef _LIBC
   839   if (dfa->mb_cur_max == 6
   840       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
   841     dfa->is_utf8 = 1;
   842   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
   843                        != 0);
   844 #else
   845   codeset_name = nl_langinfo (CODESET);
   846   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
   847       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
   848       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
   849       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
   850     dfa->is_utf8 = 1;
   851 
   852   /* We check exhaustively in the loop below if this charset is a
   853      superset of ASCII.  */
   854   dfa->map_notascii = 0;
   855 #endif
   856 
   857   if (dfa->mb_cur_max > 1)
   858     {
   859       if (dfa->is_utf8)
   860         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
   861       else
   862         {
   863           int i, j, ch;
   864 
   865           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
   866           if (__glibc_unlikely (dfa->sb_char == NULL))
   867             return REG_ESPACE;
   868 
   869           /* Set the bits corresponding to single byte chars.  */
   870           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
   871             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
   872               {
   873                 wint_t wch = __btowc (ch);
   874                 if (wch != WEOF)
   875                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
   876 #ifndef _LIBC
   877                 if (isascii (ch) && wch != ch)
   878                   dfa->map_notascii = 1;
   879 #endif
   880               }
   881         }
   882     }
   883 
   884   if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
   885     return REG_ESPACE;
   886   return REG_NOERROR;
   887 }
   888 
   889 /* Initialize WORD_CHAR table, which indicate which character is
   890    "word".  In this case "word" means that it is the word construction
   891    character used by some operators like "\<", "\>", etc.  */
   892 
   893 static void
   894 init_word_char (re_dfa_t *dfa)
   895 {
   896   int i = 0;
   897   int j;
   898   int ch = 0;
   899   dfa->word_ops_used = 1;
   900   if (__glibc_likely (dfa->map_notascii == 0))
   901     {
   902       bitset_word_t bits0 = 0x00000000;
   903       bitset_word_t bits1 = 0x03ff0000;
   904       bitset_word_t bits2 = 0x87fffffe;
   905       bitset_word_t bits3 = 0x07fffffe;
   906       if (BITSET_WORD_BITS == 64)
   907         {
   908           /* Pacify gcc -Woverflow on 32-bit platforms.  */
   909           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
   910           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
   911           i = 2;
   912         }
   913       else if (BITSET_WORD_BITS == 32)
   914         {
   915           dfa->word_char[0] = bits0;
   916           dfa->word_char[1] = bits1;
   917           dfa->word_char[2] = bits2;
   918           dfa->word_char[3] = bits3;
   919           i = 4;
   920         }
   921       else
   922         goto general_case;
   923       ch = 128;
   924 
   925       if (__glibc_likely (dfa->is_utf8))
   926         {
   927           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
   928           return;
   929         }
   930     }
   931 
   932  general_case:
   933   for (; i < BITSET_WORDS; ++i)
   934     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
   935       if (isalnum (ch) || ch == '_')
   936         dfa->word_char[i] |= (bitset_word_t) 1 << j;
   937 }
   938 
   939 /* Free the work area which are only used while compiling.  */
   940 
   941 static void
   942 free_workarea_compile (regex_t *preg)
   943 {
   944   re_dfa_t *dfa = preg->buffer;
   945   bin_tree_storage_t *storage, *next;
   946   for (storage = dfa->str_tree_storage; storage; storage = next)
   947     {
   948       next = storage->next;
   949       re_free (storage);
   950     }
   951   dfa->str_tree_storage = NULL;
   952   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
   953   dfa->str_tree = NULL;
   954   re_free (dfa->org_indices);
   955   dfa->org_indices = NULL;
   956 }
   957 
   958 /* Create initial states for all contexts.  */
   959 
   960 static reg_errcode_t
   961 create_initial_state (re_dfa_t *dfa)
   962 {
   963   Idx first, i;
   964   reg_errcode_t err;
   965   re_node_set init_nodes;
   966 
   967   /* Initial states have the epsilon closure of the node which is
   968      the first node of the regular expression.  */
   969   first = dfa->str_tree->first->node_idx;
   970   dfa->init_node = first;
   971   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
   972   if (__glibc_unlikely (err != REG_NOERROR))
   973     return err;
   974 
   975   /* The back-references which are in initial states can epsilon transit,
   976      since in this case all of the subexpressions can be null.
   977      Then we add epsilon closures of the nodes which are the next nodes of
   978      the back-references.  */
   979   if (dfa->nbackref > 0)
   980     for (i = 0; i < init_nodes.nelem; ++i)
   981       {
   982         Idx node_idx = init_nodes.elems[i];
   983         re_token_type_t type = dfa->nodes[node_idx].type;
   984 
   985         Idx clexp_idx;
   986         if (type != OP_BACK_REF)
   987           continue;
   988         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
   989           {
   990             re_token_t *clexp_node;
   991             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
   992             if (clexp_node->type == OP_CLOSE_SUBEXP
   993                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
   994               break;
   995           }
   996         if (clexp_idx == init_nodes.nelem)
   997           continue;
   998 
   999         if (type == OP_BACK_REF)
  1000           {
  1001             Idx dest_idx = dfa->edests[node_idx].elems[0];
  1002             if (!re_node_set_contains (&init_nodes, dest_idx))
  1003               {
  1004                 reg_errcode_t merge_err
  1005                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
  1006                 if (merge_err != REG_NOERROR)
  1007                   return merge_err;
  1008                 i = 0;
  1009               }
  1010           }
  1011       }
  1012 
  1013   /* It must be the first time to invoke acquire_state.  */
  1014   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
  1015   /* We don't check ERR here, since the initial state must not be NULL.  */
  1016   if (__glibc_unlikely (dfa->init_state == NULL))
  1017     return err;
  1018   if (dfa->init_state->has_constraint)
  1019     {
  1020       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
  1021                                                        CONTEXT_WORD);
  1022       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
  1023                                                      CONTEXT_NEWLINE);
  1024       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
  1025                                                          &init_nodes,
  1026                                                          CONTEXT_NEWLINE
  1027                                                          | CONTEXT_BEGBUF);
  1028       if (__glibc_unlikely (dfa->init_state_word == NULL
  1029                             || dfa->init_state_nl == NULL
  1030                             || dfa->init_state_begbuf == NULL))
  1031         return err;
  1032     }
  1033   else
  1034     dfa->init_state_word = dfa->init_state_nl
  1035       = dfa->init_state_begbuf = dfa->init_state;
  1036 
  1037   re_node_set_free (&init_nodes);
  1038   return REG_NOERROR;
  1039 }
  1040 
  1041 /* If it is possible to do searching in single byte encoding instead of UTF-8
  1042    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
  1043    DFA nodes where needed.  */
  1044 
  1045 static void
  1046 optimize_utf8 (re_dfa_t *dfa)
  1047 {
  1048   Idx node;
  1049   int i;
  1050   bool mb_chars = false;
  1051   bool has_period = false;
  1052 
  1053   for (node = 0; node < dfa->nodes_len; ++node)
  1054     switch (dfa->nodes[node].type)
  1055       {
  1056       case CHARACTER:
  1057         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
  1058           mb_chars = true;
  1059         break;
  1060       case ANCHOR:
  1061         switch (dfa->nodes[node].opr.ctx_type)
  1062           {
  1063           case LINE_FIRST:
  1064           case LINE_LAST:
  1065           case BUF_FIRST:
  1066           case BUF_LAST:
  1067             break;
  1068           default:
  1069             /* Word anchors etc. cannot be handled.  It's okay to test
  1070                opr.ctx_type since constraints (for all DFA nodes) are
  1071                created by ORing one or more opr.ctx_type values.  */
  1072             return;
  1073           }
  1074         break;
  1075       case OP_PERIOD:
  1076         has_period = true;
  1077         break;
  1078       case OP_BACK_REF:
  1079       case OP_ALT:
  1080       case END_OF_RE:
  1081       case OP_DUP_ASTERISK:
  1082       case OP_OPEN_SUBEXP:
  1083       case OP_CLOSE_SUBEXP:
  1084         break;
  1085       case COMPLEX_BRACKET:
  1086         return;
  1087       case SIMPLE_BRACKET:
  1088         /* Just double check.  */
  1089         {
  1090           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
  1091                         ? 0
  1092                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
  1093           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
  1094             {
  1095               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
  1096                 return;
  1097               rshift = 0;
  1098             }
  1099         }
  1100         break;
  1101       default:
  1102         abort ();
  1103       }
  1104 
  1105   if (mb_chars || has_period)
  1106     for (node = 0; node < dfa->nodes_len; ++node)
  1107       {
  1108         if (dfa->nodes[node].type == CHARACTER
  1109             && dfa->nodes[node].opr.c >= ASCII_CHARS)
  1110           dfa->nodes[node].mb_partial = 0;
  1111         else if (dfa->nodes[node].type == OP_PERIOD)
  1112           dfa->nodes[node].type = OP_UTF8_PERIOD;
  1113       }
  1114 
  1115   /* The search can be in single byte locale.  */
  1116   dfa->mb_cur_max = 1;
  1117   dfa->is_utf8 = 0;
  1118   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
  1119 }
  1120 
  1121 /* Analyze the structure tree, and calculate "first", "next", "edest",
  1122    "eclosure", and "inveclosure".  */
  1123 
  1124 static reg_errcode_t
  1125 analyze (regex_t *preg)
  1126 {
  1127   re_dfa_t *dfa = preg->buffer;
  1128   reg_errcode_t ret;
  1129 
  1130   /* Allocate arrays.  */
  1131   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
  1132   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
  1133   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
  1134   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
  1135   if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
  1136                         || dfa->edests == NULL || dfa->eclosures == NULL))
  1137     return REG_ESPACE;
  1138 
  1139   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
  1140   if (dfa->subexp_map != NULL)
  1141     {
  1142       Idx i;
  1143       for (i = 0; i < preg->re_nsub; i++)
  1144         dfa->subexp_map[i] = i;
  1145       preorder (dfa->str_tree, optimize_subexps, dfa);
  1146       for (i = 0; i < preg->re_nsub; i++)
  1147         if (dfa->subexp_map[i] != i)
  1148           break;
  1149       if (i == preg->re_nsub)
  1150         {
  1151           re_free (dfa->subexp_map);
  1152           dfa->subexp_map = NULL;
  1153         }
  1154     }
  1155 
  1156   ret = postorder (dfa->str_tree, lower_subexps, preg);
  1157   if (__glibc_unlikely (ret != REG_NOERROR))
  1158     return ret;
  1159   ret = postorder (dfa->str_tree, calc_first, dfa);
  1160   if (__glibc_unlikely (ret != REG_NOERROR))
  1161     return ret;
  1162   preorder (dfa->str_tree, calc_next, dfa);
  1163   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
  1164   if (__glibc_unlikely (ret != REG_NOERROR))
  1165     return ret;
  1166   ret = calc_eclosure (dfa);
  1167   if (__glibc_unlikely (ret != REG_NOERROR))
  1168     return ret;
  1169 
  1170   /* We only need this during the prune_impossible_nodes pass in regexec.c;
  1171      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
  1172   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
  1173       || dfa->nbackref)
  1174     {
  1175       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
  1176       if (__glibc_unlikely (dfa->inveclosures == NULL))
  1177         return REG_ESPACE;
  1178       ret = calc_inveclosure (dfa);
  1179     }
  1180 
  1181   return ret;
  1182 }
  1183 
  1184 /* Our parse trees are very unbalanced, so we cannot use a stack to
  1185    implement parse tree visits.  Instead, we use parent pointers and
  1186    some hairy code in these two functions.  */
  1187 static reg_errcode_t
  1188 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
  1189            void *extra)
  1190 {
  1191   bin_tree_t *node, *prev;
  1192 
  1193   for (node = root; ; )
  1194     {
  1195       /* Descend down the tree, preferably to the left (or to the right
  1196          if that's the only child).  */
  1197       while (node->left || node->right)
  1198         if (node->left)
  1199           node = node->left;
  1200         else
  1201           node = node->right;
  1202 
  1203       do
  1204         {
  1205           reg_errcode_t err = fn (extra, node);
  1206           if (__glibc_unlikely (err != REG_NOERROR))
  1207             return err;
  1208           if (node->parent == NULL)
  1209             return REG_NOERROR;
  1210           prev = node;
  1211           node = node->parent;
  1212         }
  1213       /* Go up while we have a node that is reached from the right.  */
  1214       while (node->right == prev || node->right == NULL);
  1215       node = node->right;
  1216     }
  1217 }
  1218 
  1219 static reg_errcode_t
  1220 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
  1221           void *extra)
  1222 {
  1223   bin_tree_t *node;
  1224 
  1225   for (node = root; ; )
  1226     {
  1227       reg_errcode_t err = fn (extra, node);
  1228       if (__glibc_unlikely (err != REG_NOERROR))
  1229         return err;
  1230 
  1231       /* Go to the left node, or up and to the right.  */
  1232       if (node->left)
  1233         node = node->left;
  1234       else
  1235         {
  1236           bin_tree_t *prev = NULL;
  1237           while (node->right == prev || node->right == NULL)
  1238             {
  1239               prev = node;
  1240               node = node->parent;
  1241               if (!node)
  1242                 return REG_NOERROR;
  1243             }
  1244           node = node->right;
  1245         }
  1246     }
  1247 }
  1248 
  1249 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
  1250    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
  1251    backreferences as well.  Requires a preorder visit.  */
  1252 static reg_errcode_t
  1253 optimize_subexps (void *extra, bin_tree_t *node)
  1254 {
  1255   re_dfa_t *dfa = (re_dfa_t *) extra;
  1256 
  1257   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
  1258     {
  1259       int idx = node->token.opr.idx;
  1260       node->token.opr.idx = dfa->subexp_map[idx];
  1261       dfa->used_bkref_map |= 1 << node->token.opr.idx;
  1262     }
  1263 
  1264   else if (node->token.type == SUBEXP
  1265            && node->left && node->left->token.type == SUBEXP)
  1266     {
  1267       Idx other_idx = node->left->token.opr.idx;
  1268 
  1269       node->left = node->left->left;
  1270       if (node->left)
  1271         node->left->parent = node;
  1272 
  1273       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
  1274       if (other_idx < BITSET_WORD_BITS)
  1275         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
  1276     }
  1277 
  1278   return REG_NOERROR;
  1279 }
  1280 
  1281 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
  1282    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
  1283 static reg_errcode_t
  1284 lower_subexps (void *extra, bin_tree_t *node)
  1285 {
  1286   regex_t *preg = (regex_t *) extra;
  1287   reg_errcode_t err = REG_NOERROR;
  1288 
  1289   if (node->left && node->left->token.type == SUBEXP)
  1290     {
  1291       node->left = lower_subexp (&err, preg, node->left);
  1292       if (node->left)
  1293         node->left->parent = node;
  1294     }
  1295   if (node->right && node->right->token.type == SUBEXP)
  1296     {
  1297       node->right = lower_subexp (&err, preg, node->right);
  1298       if (node->right)
  1299         node->right->parent = node;
  1300     }
  1301 
  1302   return err;
  1303 }
  1304 
  1305 static bin_tree_t *
  1306 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
  1307 {
  1308   re_dfa_t *dfa = preg->buffer;
  1309   bin_tree_t *body = node->left;
  1310   bin_tree_t *op, *cls, *tree1, *tree;
  1311 
  1312   if (preg->no_sub
  1313       /* We do not optimize empty subexpressions, because otherwise we may
  1314          have bad CONCAT nodes with NULL children.  This is obviously not
  1315          very common, so we do not lose much.  An example that triggers
  1316          this case is the sed "script" /\(\)/x.  */
  1317       && node->left != NULL
  1318       && (node->token.opr.idx >= BITSET_WORD_BITS
  1319           || !(dfa->used_bkref_map
  1320                & ((bitset_word_t) 1 << node->token.opr.idx))))
  1321     return node->left;
  1322 
  1323   /* Convert the SUBEXP node to the concatenation of an
  1324      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
  1325   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
  1326   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
  1327   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
  1328   tree = create_tree (dfa, op, tree1, CONCAT);
  1329   if (__glibc_unlikely (tree == NULL || tree1 == NULL
  1330                         || op == NULL || cls == NULL))
  1331     {
  1332       *err = REG_ESPACE;
  1333       return NULL;
  1334     }
  1335 
  1336   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
  1337   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
  1338   return tree;
  1339 }
  1340 
  1341 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
  1342    nodes.  Requires a postorder visit.  */
  1343 static reg_errcode_t
  1344 calc_first (void *extra, bin_tree_t *node)
  1345 {
  1346   re_dfa_t *dfa = (re_dfa_t *) extra;
  1347   if (node->token.type == CONCAT)
  1348     {
  1349       node->first = node->left->first;
  1350       node->node_idx = node->left->node_idx;
  1351     }
  1352   else
  1353     {
  1354       node->first = node;
  1355       node->node_idx = re_dfa_add_node (dfa, node->token);
  1356       if (__glibc_unlikely (node->node_idx == -1))
  1357         return REG_ESPACE;
  1358       if (node->token.type == ANCHOR)
  1359         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
  1360     }
  1361   return REG_NOERROR;
  1362 }
  1363 
  1364 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
  1365 static reg_errcode_t
  1366 calc_next (void *extra, bin_tree_t *node)
  1367 {
  1368   switch (node->token.type)
  1369     {
  1370     case OP_DUP_ASTERISK:
  1371       node->left->next = node;
  1372       break;
  1373     case CONCAT:
  1374       node->left->next = node->right->first;
  1375       node->right->next = node->next;
  1376       break;
  1377     default:
  1378       if (node->left)
  1379         node->left->next = node->next;
  1380       if (node->right)
  1381         node->right->next = node->next;
  1382       break;
  1383     }
  1384   return REG_NOERROR;
  1385 }
  1386 
  1387 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
  1388 static reg_errcode_t
  1389 link_nfa_nodes (void *extra, bin_tree_t *node)
  1390 {
  1391   re_dfa_t *dfa = (re_dfa_t *) extra;
  1392   Idx idx = node->node_idx;
  1393   reg_errcode_t err = REG_NOERROR;
  1394 
  1395   switch (node->token.type)
  1396     {
  1397     case CONCAT:
  1398       break;
  1399 
  1400     case END_OF_RE:
  1401       DEBUG_ASSERT (node->next == NULL);
  1402       break;
  1403 
  1404     case OP_DUP_ASTERISK:
  1405     case OP_ALT:
  1406       {
  1407         Idx left, right;
  1408         dfa->has_plural_match = 1;
  1409         if (node->left != NULL)
  1410           left = node->left->first->node_idx;
  1411         else
  1412           left = node->next->node_idx;
  1413         if (node->right != NULL)
  1414           right = node->right->first->node_idx;
  1415         else
  1416           right = node->next->node_idx;
  1417         DEBUG_ASSERT (left > -1);
  1418         DEBUG_ASSERT (right > -1);
  1419         err = re_node_set_init_2 (dfa->edests + idx, left, right);
  1420       }
  1421       break;
  1422 
  1423     case ANCHOR:
  1424     case OP_OPEN_SUBEXP:
  1425     case OP_CLOSE_SUBEXP:
  1426       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
  1427       break;
  1428 
  1429     case OP_BACK_REF:
  1430       dfa->nexts[idx] = node->next->node_idx;
  1431       if (node->token.type == OP_BACK_REF)
  1432         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
  1433       break;
  1434 
  1435     default:
  1436       DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
  1437       dfa->nexts[idx] = node->next->node_idx;
  1438       break;
  1439     }
  1440 
  1441   return err;
  1442 }
  1443 
  1444 /* Duplicate the epsilon closure of the node ROOT_NODE.
  1445    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
  1446    to their own constraint.  */
  1447 
  1448 static reg_errcode_t
  1449 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
  1450                         Idx root_node, unsigned int init_constraint)
  1451 {
  1452   Idx org_node, clone_node;
  1453   bool ok;
  1454   unsigned int constraint = init_constraint;
  1455   for (org_node = top_org_node, clone_node = top_clone_node;;)
  1456     {
  1457       Idx org_dest, clone_dest;
  1458       if (dfa->nodes[org_node].type == OP_BACK_REF)
  1459         {
  1460           /* If the back reference epsilon-transit, its destination must
  1461              also have the constraint.  Then duplicate the epsilon closure
  1462              of the destination of the back reference, and store it in
  1463              edests of the back reference.  */
  1464           org_dest = dfa->nexts[org_node];
  1465           re_node_set_empty (dfa->edests + clone_node);
  1466           clone_dest = duplicate_node (dfa, org_dest, constraint);
  1467           if (__glibc_unlikely (clone_dest == -1))
  1468             return REG_ESPACE;
  1469           dfa->nexts[clone_node] = dfa->nexts[org_node];
  1470           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
  1471           if (__glibc_unlikely (! ok))
  1472             return REG_ESPACE;
  1473         }
  1474       else if (dfa->edests[org_node].nelem == 0)
  1475         {
  1476           /* In case of the node can't epsilon-transit, don't duplicate the
  1477              destination and store the original destination as the
  1478              destination of the node.  */
  1479           dfa->nexts[clone_node] = dfa->nexts[org_node];
  1480           break;
  1481         }
  1482       else if (dfa->edests[org_node].nelem == 1)
  1483         {
  1484           /* In case of the node can epsilon-transit, and it has only one
  1485              destination.  */
  1486           org_dest = dfa->edests[org_node].elems[0];
  1487           re_node_set_empty (dfa->edests + clone_node);
  1488           /* If the node is root_node itself, it means the epsilon closure
  1489              has a loop.  Then tie it to the destination of the root_node.  */
  1490           if (org_node == root_node && clone_node != org_node)
  1491             {
  1492               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
  1493               if (__glibc_unlikely (! ok))
  1494                 return REG_ESPACE;
  1495               break;
  1496             }
  1497           /* In case the node has another constraint, append it.  */
  1498           constraint |= dfa->nodes[org_node].constraint;
  1499           clone_dest = duplicate_node (dfa, org_dest, constraint);
  1500           if (__glibc_unlikely (clone_dest == -1))
  1501             return REG_ESPACE;
  1502           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
  1503           if (__glibc_unlikely (! ok))
  1504             return REG_ESPACE;
  1505         }
  1506       else /* dfa->edests[org_node].nelem == 2 */
  1507         {
  1508           /* In case of the node can epsilon-transit, and it has two
  1509              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
  1510           org_dest = dfa->edests[org_node].elems[0];
  1511           re_node_set_empty (dfa->edests + clone_node);
  1512           /* Search for a duplicated node which satisfies the constraint.  */
  1513           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
  1514           if (clone_dest == -1)
  1515             {
  1516               /* There is no such duplicated node, create a new one.  */
  1517               reg_errcode_t err;
  1518               clone_dest = duplicate_node (dfa, org_dest, constraint);
  1519               if (__glibc_unlikely (clone_dest == -1))
  1520                 return REG_ESPACE;
  1521               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
  1522               if (__glibc_unlikely (! ok))
  1523                 return REG_ESPACE;
  1524               err = duplicate_node_closure (dfa, org_dest, clone_dest,
  1525                                             root_node, constraint);
  1526               if (__glibc_unlikely (err != REG_NOERROR))
  1527                 return err;
  1528             }
  1529           else
  1530             {
  1531               /* There is a duplicated node which satisfies the constraint,
  1532                  use it to avoid infinite loop.  */
  1533               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
  1534               if (__glibc_unlikely (! ok))
  1535                 return REG_ESPACE;
  1536             }
  1537 
  1538           org_dest = dfa->edests[org_node].elems[1];
  1539           clone_dest = duplicate_node (dfa, org_dest, constraint);
  1540           if (__glibc_unlikely (clone_dest == -1))
  1541             return REG_ESPACE;
  1542           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
  1543           if (__glibc_unlikely (! ok))
  1544             return REG_ESPACE;
  1545         }
  1546       org_node = org_dest;
  1547       clone_node = clone_dest;
  1548     }
  1549   return REG_NOERROR;
  1550 }
  1551 
  1552 /* Search for a node which is duplicated from the node ORG_NODE, and
  1553    satisfies the constraint CONSTRAINT.  */
  1554 
  1555 static Idx
  1556 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  1557                         unsigned int constraint)
  1558 {
  1559   Idx idx;
  1560   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
  1561     {
  1562       if (org_node == dfa->org_indices[idx]
  1563           && constraint == dfa->nodes[idx].constraint)
  1564         return idx; /* Found.  */
  1565     }
  1566   return -1; /* Not found.  */
  1567 }
  1568 
  1569 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
  1570    Return the index of the new node, or -1 if insufficient storage is
  1571    available.  */
  1572 
  1573 static Idx
  1574 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
  1575 {
  1576   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
  1577   if (__glibc_likely (dup_idx != -1))
  1578     {
  1579       dfa->nodes[dup_idx].constraint = constraint;
  1580       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
  1581       dfa->nodes[dup_idx].duplicated = 1;
  1582 
  1583       /* Store the index of the original node.  */
  1584       dfa->org_indices[dup_idx] = org_idx;
  1585     }
  1586   return dup_idx;
  1587 }
  1588 
  1589 static reg_errcode_t
  1590 calc_inveclosure (re_dfa_t *dfa)
  1591 {
  1592   Idx src, idx;
  1593   bool ok;
  1594   for (idx = 0; idx < dfa->nodes_len; ++idx)
  1595     re_node_set_init_empty (dfa->inveclosures + idx);
  1596 
  1597   for (src = 0; src < dfa->nodes_len; ++src)
  1598     {
  1599       Idx *elems = dfa->eclosures[src].elems;
  1600       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
  1601         {
  1602           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
  1603           if (__glibc_unlikely (! ok))
  1604             return REG_ESPACE;
  1605         }
  1606     }
  1607 
  1608   return REG_NOERROR;
  1609 }
  1610 
  1611 /* Calculate "eclosure" for all the node in DFA.  */
  1612 
  1613 static reg_errcode_t
  1614 calc_eclosure (re_dfa_t *dfa)
  1615 {
  1616   Idx node_idx;
  1617   bool incomplete;
  1618   DEBUG_ASSERT (dfa->nodes_len > 0);
  1619   incomplete = false;
  1620   /* For each nodes, calculate epsilon closure.  */
  1621   for (node_idx = 0; ; ++node_idx)
  1622     {
  1623       reg_errcode_t err;
  1624       re_node_set eclosure_elem;
  1625       if (node_idx == dfa->nodes_len)
  1626         {
  1627           if (!incomplete)
  1628             break;
  1629           incomplete = false;
  1630           node_idx = 0;
  1631         }
  1632 
  1633       DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
  1634 
  1635       /* If we have already calculated, skip it.  */
  1636       if (dfa->eclosures[node_idx].nelem != 0)
  1637         continue;
  1638       /* Calculate epsilon closure of 'node_idx'.  */
  1639       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
  1640       if (__glibc_unlikely (err != REG_NOERROR))
  1641         return err;
  1642 
  1643       if (dfa->eclosures[node_idx].nelem == 0)
  1644         {
  1645           incomplete = true;
  1646           re_node_set_free (&eclosure_elem);
  1647         }
  1648     }
  1649   return REG_NOERROR;
  1650 }
  1651 
  1652 /* Calculate epsilon closure of NODE.  */
  1653 
  1654 static reg_errcode_t
  1655 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
  1656 {
  1657   reg_errcode_t err;
  1658   Idx i;
  1659   re_node_set eclosure;
  1660   bool incomplete = false;
  1661   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
  1662   if (__glibc_unlikely (err != REG_NOERROR))
  1663     return err;
  1664 
  1665   /* An epsilon closure includes itself.  */
  1666   eclosure.elems[eclosure.nelem++] = node;
  1667 
  1668   /* This indicates that we are calculating this node now.
  1669      We reference this value to avoid infinite loop.  */
  1670   dfa->eclosures[node].nelem = -1;
  1671 
  1672   /* If the current node has constraints, duplicate all nodes
  1673      since they must inherit the constraints.  */
  1674   if (dfa->nodes[node].constraint
  1675       && dfa->edests[node].nelem
  1676       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
  1677     {
  1678       err = duplicate_node_closure (dfa, node, node, node,
  1679                                     dfa->nodes[node].constraint);
  1680       if (__glibc_unlikely (err != REG_NOERROR))
  1681         return err;
  1682     }
  1683 
  1684   /* Expand each epsilon destination nodes.  */
  1685   if (IS_EPSILON_NODE(dfa->nodes[node].type))
  1686     for (i = 0; i < dfa->edests[node].nelem; ++i)
  1687       {
  1688         re_node_set eclosure_elem;
  1689         Idx edest = dfa->edests[node].elems[i];
  1690         /* If calculating the epsilon closure of 'edest' is in progress,
  1691            return intermediate result.  */
  1692         if (dfa->eclosures[edest].nelem == -1)
  1693           {
  1694             incomplete = true;
  1695             continue;
  1696           }
  1697         /* If we haven't calculated the epsilon closure of 'edest' yet,
  1698            calculate now. Otherwise use calculated epsilon closure.  */
  1699         if (dfa->eclosures[edest].nelem == 0)
  1700           {
  1701             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
  1702             if (__glibc_unlikely (err != REG_NOERROR))
  1703               return err;
  1704           }
  1705         else
  1706           eclosure_elem = dfa->eclosures[edest];
  1707         /* Merge the epsilon closure of 'edest'.  */
  1708         err = re_node_set_merge (&eclosure, &eclosure_elem);
  1709         if (__glibc_unlikely (err != REG_NOERROR))
  1710           return err;
  1711         /* If the epsilon closure of 'edest' is incomplete,
  1712            the epsilon closure of this node is also incomplete.  */
  1713         if (dfa->eclosures[edest].nelem == 0)
  1714           {
  1715             incomplete = true;
  1716             re_node_set_free (&eclosure_elem);
  1717           }
  1718       }
  1719 
  1720   if (incomplete && !root)
  1721     dfa->eclosures[node].nelem = 0;
  1722   else
  1723     dfa->eclosures[node] = eclosure;
  1724   *new_set = eclosure;
  1725   return REG_NOERROR;
  1726 }
  1727 
  1728 /* Functions for token which are used in the parser.  */
  1729 
  1730 /* Fetch a token from INPUT.
  1731    We must not use this function inside bracket expressions.  */
  1732 
  1733 static void
  1734 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
  1735 {
  1736   re_string_skip_bytes (input, peek_token (result, input, syntax));
  1737 }
  1738 
  1739 /* Peek a token from INPUT, and return the length of the token.
  1740    We must not use this function inside bracket expressions.  */
  1741 
  1742 static int
  1743 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
  1744 {
  1745   unsigned char c;
  1746 
  1747   if (re_string_eoi (input))
  1748     {
  1749       token->type = END_OF_RE;
  1750       return 0;
  1751     }
  1752 
  1753   c = re_string_peek_byte (input, 0);
  1754   token->opr.c = c;
  1755 
  1756   token->word_char = 0;
  1757   token->mb_partial = 0;
  1758   if (input->mb_cur_max > 1
  1759       && !re_string_first_byte (input, re_string_cur_idx (input)))
  1760     {
  1761       token->type = CHARACTER;
  1762       token->mb_partial = 1;
  1763       return 1;
  1764     }
  1765   if (c == '\\')
  1766     {
  1767       unsigned char c2;
  1768       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
  1769         {
  1770           token->type = BACK_SLASH;
  1771           return 1;
  1772         }
  1773 
  1774       c2 = re_string_peek_byte_case (input, 1);
  1775       token->opr.c = c2;
  1776       token->type = CHARACTER;
  1777       if (input->mb_cur_max > 1)
  1778         {
  1779           wint_t wc = re_string_wchar_at (input,
  1780                                           re_string_cur_idx (input) + 1);
  1781           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
  1782         }
  1783       else
  1784         token->word_char = IS_WORD_CHAR (c2) != 0;
  1785 
  1786       switch (c2)
  1787         {
  1788         case '|':
  1789           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
  1790             token->type = OP_ALT;
  1791           break;
  1792         case '1': case '2': case '3': case '4': case '5':
  1793         case '6': case '7': case '8': case '9':
  1794           if (!(syntax & RE_NO_BK_REFS))
  1795             {
  1796               token->type = OP_BACK_REF;
  1797               token->opr.idx = c2 - '1';
  1798             }
  1799           break;
  1800         case '<':
  1801           if (!(syntax & RE_NO_GNU_OPS))
  1802             {
  1803               token->type = ANCHOR;
  1804               token->opr.ctx_type = WORD_FIRST;
  1805             }
  1806           break;
  1807         case '>':
  1808           if (!(syntax & RE_NO_GNU_OPS))
  1809             {
  1810               token->type = ANCHOR;
  1811               token->opr.ctx_type = WORD_LAST;
  1812             }
  1813           break;
  1814         case 'b':
  1815           if (!(syntax & RE_NO_GNU_OPS))
  1816             {
  1817               token->type = ANCHOR;
  1818               token->opr.ctx_type = WORD_DELIM;
  1819             }
  1820           break;
  1821         case 'B':
  1822           if (!(syntax & RE_NO_GNU_OPS))
  1823             {
  1824               token->type = ANCHOR;
  1825               token->opr.ctx_type = NOT_WORD_DELIM;
  1826             }
  1827           break;
  1828         case 'w':
  1829           if (!(syntax & RE_NO_GNU_OPS))
  1830             token->type = OP_WORD;
  1831           break;
  1832         case 'W':
  1833           if (!(syntax & RE_NO_GNU_OPS))
  1834             token->type = OP_NOTWORD;
  1835           break;
  1836         case 's':
  1837           if (!(syntax & RE_NO_GNU_OPS))
  1838             token->type = OP_SPACE;
  1839           break;
  1840         case 'S':
  1841           if (!(syntax & RE_NO_GNU_OPS))
  1842             token->type = OP_NOTSPACE;
  1843           break;
  1844         case '`':
  1845           if (!(syntax & RE_NO_GNU_OPS))
  1846             {
  1847               token->type = ANCHOR;
  1848               token->opr.ctx_type = BUF_FIRST;
  1849             }
  1850           break;
  1851         case '\'':
  1852           if (!(syntax & RE_NO_GNU_OPS))
  1853             {
  1854               token->type = ANCHOR;
  1855               token->opr.ctx_type = BUF_LAST;
  1856             }
  1857           break;
  1858         case '(':
  1859           if (!(syntax & RE_NO_BK_PARENS))
  1860             token->type = OP_OPEN_SUBEXP;
  1861           break;
  1862         case ')':
  1863           if (!(syntax & RE_NO_BK_PARENS))
  1864             token->type = OP_CLOSE_SUBEXP;
  1865           break;
  1866         case '+':
  1867           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
  1868             token->type = OP_DUP_PLUS;
  1869           break;
  1870         case '?':
  1871           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
  1872             token->type = OP_DUP_QUESTION;
  1873           break;
  1874         case '{':
  1875           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
  1876             token->type = OP_OPEN_DUP_NUM;
  1877           break;
  1878         case '}':
  1879           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
  1880             token->type = OP_CLOSE_DUP_NUM;
  1881           break;
  1882         default:
  1883           break;
  1884         }
  1885       return 2;
  1886     }
  1887 
  1888   token->type = CHARACTER;
  1889   if (input->mb_cur_max > 1)
  1890     {
  1891       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
  1892       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
  1893     }
  1894   else
  1895     token->word_char = IS_WORD_CHAR (token->opr.c);
  1896 
  1897   switch (c)
  1898     {
  1899     case '\n':
  1900       if (syntax & RE_NEWLINE_ALT)
  1901         token->type = OP_ALT;
  1902       break;
  1903     case '|':
  1904       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
  1905         token->type = OP_ALT;
  1906       break;
  1907     case '*':
  1908       token->type = OP_DUP_ASTERISK;
  1909       break;
  1910     case '+':
  1911       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
  1912         token->type = OP_DUP_PLUS;
  1913       break;
  1914     case '?':
  1915       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
  1916         token->type = OP_DUP_QUESTION;
  1917       break;
  1918     case '{':
  1919       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
  1920         token->type = OP_OPEN_DUP_NUM;
  1921       break;
  1922     case '}':
  1923       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
  1924         token->type = OP_CLOSE_DUP_NUM;
  1925       break;
  1926     case '(':
  1927       if (syntax & RE_NO_BK_PARENS)
  1928         token->type = OP_OPEN_SUBEXP;
  1929       break;
  1930     case ')':
  1931       if (syntax & RE_NO_BK_PARENS)
  1932         token->type = OP_CLOSE_SUBEXP;
  1933       break;
  1934     case '[':
  1935       token->type = OP_OPEN_BRACKET;
  1936       break;
  1937     case '.':
  1938       token->type = OP_PERIOD;
  1939       break;
  1940     case '^':
  1941       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
  1942           && re_string_cur_idx (input) != 0)
  1943         {
  1944           char prev = re_string_peek_byte (input, -1);
  1945           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
  1946             break;
  1947         }
  1948       token->type = ANCHOR;
  1949       token->opr.ctx_type = LINE_FIRST;
  1950       break;
  1951     case '$':
  1952       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
  1953           && re_string_cur_idx (input) + 1 != re_string_length (input))
  1954         {
  1955           re_token_t next;
  1956           re_string_skip_bytes (input, 1);
  1957           peek_token (&next, input, syntax);
  1958           re_string_skip_bytes (input, -1);
  1959           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
  1960             break;
  1961         }
  1962       token->type = ANCHOR;
  1963       token->opr.ctx_type = LINE_LAST;
  1964       break;
  1965     default:
  1966       break;
  1967     }
  1968   return 1;
  1969 }
  1970 
  1971 /* Peek a token from INPUT, and return the length of the token.
  1972    We must not use this function out of bracket expressions.  */
  1973 
  1974 static int
  1975 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
  1976 {
  1977   unsigned char c;
  1978   if (re_string_eoi (input))
  1979     {
  1980       token->type = END_OF_RE;
  1981       return 0;
  1982     }
  1983   c = re_string_peek_byte (input, 0);
  1984   token->opr.c = c;
  1985 
  1986   if (input->mb_cur_max > 1
  1987       && !re_string_first_byte (input, re_string_cur_idx (input)))
  1988     {
  1989       token->type = CHARACTER;
  1990       return 1;
  1991     }
  1992 
  1993   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
  1994       && re_string_cur_idx (input) + 1 < re_string_length (input))
  1995     {
  1996       /* In this case, '\' escape a character.  */
  1997       unsigned char c2;
  1998       re_string_skip_bytes (input, 1);
  1999       c2 = re_string_peek_byte (input, 0);
  2000       token->opr.c = c2;
  2001       token->type = CHARACTER;
  2002       return 1;
  2003     }
  2004   if (c == '[') /* '[' is a special char in a bracket exps.  */
  2005     {
  2006       unsigned char c2;
  2007       int token_len;
  2008       if (re_string_cur_idx (input) + 1 < re_string_length (input))
  2009         c2 = re_string_peek_byte (input, 1);
  2010       else
  2011         c2 = 0;
  2012       token->opr.c = c2;
  2013       token_len = 2;
  2014       switch (c2)
  2015         {
  2016         case '.':
  2017           token->type = OP_OPEN_COLL_ELEM;
  2018           break;
  2019 
  2020         case '=':
  2021           token->type = OP_OPEN_EQUIV_CLASS;
  2022           break;
  2023 
  2024         case ':':
  2025           if (syntax & RE_CHAR_CLASSES)
  2026             {
  2027               token->type = OP_OPEN_CHAR_CLASS;
  2028               break;
  2029             }
  2030           FALLTHROUGH;
  2031         default:
  2032           token->type = CHARACTER;
  2033           token->opr.c = c;
  2034           token_len = 1;
  2035           break;
  2036         }
  2037       return token_len;
  2038     }
  2039   switch (c)
  2040     {
  2041     case ']':
  2042       token->type = OP_CLOSE_BRACKET;
  2043       break;
  2044     case '^':
  2045       token->type = OP_NON_MATCH_LIST;
  2046       break;
  2047     case '-':
  2048       /* In V7 Unix grep and Unix awk and mawk, [...---...]
  2049          (3 adjacent minus signs) stands for a single minus sign.
  2050          Support that without breaking anything else.  */
  2051       if (! (re_string_cur_idx (input) + 2 < re_string_length (input)
  2052              && re_string_peek_byte (input, 1) == '-'
  2053              && re_string_peek_byte (input, 2) == '-'))
  2054         {
  2055           token->type = OP_CHARSET_RANGE;
  2056           break;
  2057         }
  2058       re_string_skip_bytes (input, 2);
  2059       FALLTHROUGH;
  2060     default:
  2061       token->type = CHARACTER;
  2062     }
  2063   return 1;
  2064 }
  2065 
  2066 /* Functions for parser.  */
  2067 
  2068 /* Entry point of the parser.
  2069    Parse the regular expression REGEXP and return the structure tree.
  2070    If an error occurs, ERR is set by error code, and return NULL.
  2071    This function build the following tree, from regular expression <reg_exp>:
  2072            CAT
  2073            / \
  2074           /   \
  2075    <reg_exp>  EOR
  2076 
  2077    CAT means concatenation.
  2078    EOR means end of regular expression.  */
  2079 
  2080 static bin_tree_t *
  2081 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
  2082        reg_errcode_t *err)
  2083 {
  2084   re_dfa_t *dfa = preg->buffer;
  2085   bin_tree_t *tree, *eor, *root;
  2086   re_token_t current_token;
  2087   dfa->syntax = syntax;
  2088   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
  2089   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
  2090   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
  2091     return NULL;
  2092   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
  2093   if (tree != NULL)
  2094     root = create_tree (dfa, tree, eor, CONCAT);
  2095   else
  2096     root = eor;
  2097   if (__glibc_unlikely (eor == NULL || root == NULL))
  2098     {
  2099       *err = REG_ESPACE;
  2100       return NULL;
  2101     }
  2102   return root;
  2103 }
  2104 
  2105 /* This function build the following tree, from regular expression
  2106    <branch1>|<branch2>:
  2107            ALT
  2108            / \
  2109           /   \
  2110    <branch1> <branch2>
  2111 
  2112    ALT means alternative, which represents the operator '|'.  */
  2113 
  2114 static bin_tree_t *
  2115 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
  2116                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
  2117 {
  2118   re_dfa_t *dfa = preg->buffer;
  2119   bin_tree_t *tree, *branch = NULL;
  2120   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
  2121   tree = parse_branch (regexp, preg, token, syntax, nest, err);
  2122   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
  2123     return NULL;
  2124 
  2125   while (token->type == OP_ALT)
  2126     {
  2127       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
  2128       if (token->type != OP_ALT && token->type != END_OF_RE
  2129           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
  2130         {
  2131           bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
  2132           dfa->completed_bkref_map = initial_bkref_map;
  2133           branch = parse_branch (regexp, preg, token, syntax, nest, err);
  2134           if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
  2135             {
  2136               if (tree != NULL)
  2137                 postorder (tree, free_tree, NULL);
  2138               return NULL;
  2139             }
  2140           dfa->completed_bkref_map |= accumulated_bkref_map;
  2141         }
  2142       else
  2143         branch = NULL;
  2144       tree = create_tree (dfa, tree, branch, OP_ALT);
  2145       if (__glibc_unlikely (tree == NULL))
  2146         {
  2147           *err = REG_ESPACE;
  2148           return NULL;
  2149         }
  2150     }
  2151   return tree;
  2152 }
  2153 
  2154 /* This function build the following tree, from regular expression
  2155    <exp1><exp2>:
  2156         CAT
  2157         / \
  2158        /   \
  2159    <exp1> <exp2>
  2160 
  2161    CAT means concatenation.  */
  2162 
  2163 static bin_tree_t *
  2164 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
  2165               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
  2166 {
  2167   bin_tree_t *tree, *expr;
  2168   re_dfa_t *dfa = preg->buffer;
  2169   tree = parse_expression (regexp, preg, token, syntax, nest, err);
  2170   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
  2171     return NULL;
  2172 
  2173   while (token->type != OP_ALT && token->type != END_OF_RE
  2174          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
  2175     {
  2176       expr = parse_expression (regexp, preg, token, syntax, nest, err);
  2177       if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
  2178         {
  2179           if (tree != NULL)
  2180             postorder (tree, free_tree, NULL);
  2181           return NULL;
  2182         }
  2183       if (tree != NULL && expr != NULL)
  2184         {
  2185           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
  2186           if (newtree == NULL)
  2187             {
  2188               postorder (expr, free_tree, NULL);
  2189               postorder (tree, free_tree, NULL);
  2190               *err = REG_ESPACE;
  2191               return NULL;
  2192             }
  2193           tree = newtree;
  2194         }
  2195       else if (tree == NULL)
  2196         tree = expr;
  2197       /* Otherwise expr == NULL, we don't need to create new tree.  */
  2198     }
  2199   return tree;
  2200 }
  2201 
  2202 /* This function build the following tree, from regular expression a*:
  2203          *
  2204          |
  2205          a
  2206 */
  2207 
  2208 static bin_tree_t *
  2209 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
  2210                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
  2211 {
  2212   re_dfa_t *dfa = preg->buffer;
  2213   bin_tree_t *tree;
  2214   switch (token->type)
  2215     {
  2216     case CHARACTER:
  2217       tree = create_token_tree (dfa, NULL, NULL, token);
  2218       if (__glibc_unlikely (tree == NULL))
  2219         {
  2220           *err = REG_ESPACE;
  2221           return NULL;
  2222         }
  2223       if (dfa->mb_cur_max > 1)
  2224         {
  2225           while (!re_string_eoi (regexp)
  2226                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
  2227             {
  2228               bin_tree_t *mbc_remain;
  2229               fetch_token (token, regexp, syntax);
  2230               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
  2231               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
  2232               if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
  2233                 {
  2234                   *err = REG_ESPACE;
  2235                   return NULL;
  2236                 }
  2237             }
  2238         }
  2239       break;
  2240 
  2241     case OP_OPEN_SUBEXP:
  2242       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
  2243       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
  2244         return NULL;
  2245       break;
  2246 
  2247     case OP_OPEN_BRACKET:
  2248       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
  2249       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
  2250         return NULL;
  2251       break;
  2252 
  2253     case OP_BACK_REF:
  2254       if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
  2255         {
  2256           *err = REG_ESUBREG;
  2257           return NULL;
  2258         }
  2259       dfa->used_bkref_map |= 1 << token->opr.idx;
  2260       tree = create_token_tree (dfa, NULL, NULL, token);
  2261       if (__glibc_unlikely (tree == NULL))
  2262         {
  2263           *err = REG_ESPACE;
  2264           return NULL;
  2265         }
  2266       ++dfa->nbackref;
  2267       dfa->has_mb_node = 1;
  2268       break;
  2269 
  2270     case OP_OPEN_DUP_NUM:
  2271       if (syntax & RE_CONTEXT_INVALID_DUP)
  2272         {
  2273           *err = REG_BADRPT;
  2274           return NULL;
  2275         }
  2276       FALLTHROUGH;
  2277     case OP_DUP_ASTERISK:
  2278     case OP_DUP_PLUS:
  2279     case OP_DUP_QUESTION:
  2280       if (syntax & RE_CONTEXT_INVALID_OPS)
  2281         {
  2282           *err = REG_BADRPT;
  2283           return NULL;
  2284         }
  2285       else if (syntax & RE_CONTEXT_INDEP_OPS)
  2286         {
  2287           fetch_token (token, regexp, syntax);
  2288           return parse_expression (regexp, preg, token, syntax, nest, err);
  2289         }
  2290       FALLTHROUGH;
  2291     case OP_CLOSE_SUBEXP:
  2292       if ((token->type == OP_CLOSE_SUBEXP)
  2293           && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
  2294         {
  2295           *err = REG_ERPAREN;
  2296           return NULL;
  2297         }
  2298       FALLTHROUGH;
  2299     case OP_CLOSE_DUP_NUM:
  2300       /* We treat it as a normal character.  */
  2301 
  2302       /* Then we can these characters as normal characters.  */
  2303       token->type = CHARACTER;
  2304       /* mb_partial and word_char bits should be initialized already
  2305          by peek_token.  */
  2306       tree = create_token_tree (dfa, NULL, NULL, token);
  2307       if (__glibc_unlikely (tree == NULL))
  2308         {
  2309           *err = REG_ESPACE;
  2310           return NULL;
  2311         }
  2312       break;
  2313 
  2314     case ANCHOR:
  2315       if ((token->opr.ctx_type
  2316            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
  2317           && dfa->word_ops_used == 0)
  2318         init_word_char (dfa);
  2319       if (token->opr.ctx_type == WORD_DELIM
  2320           || token->opr.ctx_type == NOT_WORD_DELIM)
  2321         {
  2322           bin_tree_t *tree_first, *tree_last;
  2323           if (token->opr.ctx_type == WORD_DELIM)
  2324             {
  2325               token->opr.ctx_type = WORD_FIRST;
  2326               tree_first = create_token_tree (dfa, NULL, NULL, token);
  2327               token->opr.ctx_type = WORD_LAST;
  2328             }
  2329           else
  2330             {
  2331               token->opr.ctx_type = INSIDE_WORD;
  2332               tree_first = create_token_tree (dfa, NULL, NULL, token);
  2333               token->opr.ctx_type = INSIDE_NOTWORD;
  2334             }
  2335           tree_last = create_token_tree (dfa, NULL, NULL, token);
  2336           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
  2337           if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
  2338                                 || tree == NULL))
  2339             {
  2340               *err = REG_ESPACE;
  2341               return NULL;
  2342             }
  2343         }
  2344       else
  2345         {
  2346           tree = create_token_tree (dfa, NULL, NULL, token);
  2347           if (__glibc_unlikely (tree == NULL))
  2348             {
  2349               *err = REG_ESPACE;
  2350               return NULL;
  2351             }
  2352         }
  2353       /* We must return here, since ANCHORs can't be followed
  2354          by repetition operators.
  2355          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
  2356              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
  2357       fetch_token (token, regexp, syntax);
  2358       return tree;
  2359 
  2360     case OP_PERIOD:
  2361       tree = create_token_tree (dfa, NULL, NULL, token);
  2362       if (__glibc_unlikely (tree == NULL))
  2363         {
  2364           *err = REG_ESPACE;
  2365           return NULL;
  2366         }
  2367       if (dfa->mb_cur_max > 1)
  2368         dfa->has_mb_node = 1;
  2369       break;
  2370 
  2371     case OP_WORD:
  2372     case OP_NOTWORD:
  2373       tree = build_charclass_op (dfa, regexp->trans,
  2374                                  "alnum",
  2375                                  "_",
  2376                                  token->type == OP_NOTWORD, err);
  2377       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
  2378         return NULL;
  2379       break;
  2380 
  2381     case OP_SPACE:
  2382     case OP_NOTSPACE:
  2383       tree = build_charclass_op (dfa, regexp->trans,
  2384                                  "space",
  2385                                  "",
  2386                                  token->type == OP_NOTSPACE, err);
  2387       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
  2388         return NULL;
  2389       break;
  2390 
  2391     case OP_ALT:
  2392     case END_OF_RE:
  2393       return NULL;
  2394 
  2395     case BACK_SLASH:
  2396       *err = REG_EESCAPE;
  2397       return NULL;
  2398 
  2399     default:
  2400       /* Must not happen?  */
  2401       DEBUG_ASSERT (false);
  2402       return NULL;
  2403     }
  2404   fetch_token (token, regexp, syntax);
  2405 
  2406   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
  2407          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
  2408     {
  2409       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
  2410                                            syntax, err);
  2411       if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
  2412         {
  2413           if (tree != NULL)
  2414             postorder (tree, free_tree, NULL);
  2415           return NULL;
  2416         }
  2417       tree = dup_tree;
  2418       /* In BRE consecutive duplications are not allowed.  */
  2419       if ((syntax & RE_CONTEXT_INVALID_DUP)
  2420           && (token->type == OP_DUP_ASTERISK
  2421               || token->type == OP_OPEN_DUP_NUM))
  2422         {
  2423           if (tree != NULL)
  2424             postorder (tree, free_tree, NULL);
  2425           *err = REG_BADRPT;
  2426           return NULL;
  2427         }
  2428     }
  2429 
  2430   return tree;
  2431 }
  2432 
  2433 /* This function build the following tree, from regular expression
  2434    (<reg_exp>):
  2435          SUBEXP
  2436             |
  2437         <reg_exp>
  2438 */
  2439 
  2440 static bin_tree_t *
  2441 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
  2442                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
  2443 {
  2444   re_dfa_t *dfa = preg->buffer;
  2445   bin_tree_t *tree;
  2446   size_t cur_nsub;
  2447   cur_nsub = preg->re_nsub++;
  2448 
  2449   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
  2450 
  2451   /* The subexpression may be a null string.  */
  2452   if (token->type == OP_CLOSE_SUBEXP)
  2453     tree = NULL;
  2454   else
  2455     {
  2456       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
  2457       if (__glibc_unlikely (*err == REG_NOERROR
  2458                             && token->type != OP_CLOSE_SUBEXP))
  2459         {
  2460           if (tree != NULL)
  2461             postorder (tree, free_tree, NULL);
  2462           *err = REG_EPAREN;
  2463         }
  2464       if (__glibc_unlikely (*err != REG_NOERROR))
  2465         return NULL;
  2466     }
  2467 
  2468   if (cur_nsub <= '9' - '1')
  2469     dfa->completed_bkref_map |= 1 << cur_nsub;
  2470 
  2471   tree = create_tree (dfa, tree, NULL, SUBEXP);
  2472   if (__glibc_unlikely (tree == NULL))
  2473     {
  2474       *err = REG_ESPACE;
  2475       return NULL;
  2476     }
  2477   tree->token.opr.idx = cur_nsub;
  2478   return tree;
  2479 }
  2480 
  2481 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
  2482 
  2483 static bin_tree_t *
  2484 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
  2485               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
  2486 {
  2487   bin_tree_t *tree = NULL, *old_tree = NULL;
  2488   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
  2489   re_token_t start_token = *token;
  2490 
  2491   if (token->type == OP_OPEN_DUP_NUM)
  2492     {
  2493       end = 0;
  2494       start = fetch_number (regexp, token, syntax);
  2495       if (start == -1)
  2496         {
  2497           if (token->type == CHARACTER && token->opr.c == ',')
  2498             start = 0; /* We treat "{,m}" as "{0,m}".  */
  2499           else
  2500             {
  2501               *err = REG_BADBR; /* <re>{} is invalid.  */
  2502               return NULL;
  2503             }
  2504         }
  2505       if (__glibc_likely (start != -2))
  2506         {
  2507           /* We treat "{n}" as "{n,n}".  */
  2508           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
  2509                  : ((token->type == CHARACTER && token->opr.c == ',')
  2510                     ? fetch_number (regexp, token, syntax) : -2));
  2511         }
  2512       if (__glibc_unlikely (start == -2 || end == -2))
  2513         {
  2514           /* Invalid sequence.  */
  2515           if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
  2516             {
  2517               if (token->type == END_OF_RE)
  2518                 *err = REG_EBRACE;
  2519               else
  2520                 *err = REG_BADBR;
  2521 
  2522               return NULL;
  2523             }
  2524 
  2525           /* If the syntax bit is set, rollback.  */
  2526           re_string_set_index (regexp, start_idx);
  2527           *token = start_token;
  2528           token->type = CHARACTER;
  2529           /* mb_partial and word_char bits should be already initialized by
  2530              peek_token.  */
  2531           return elem;
  2532         }
  2533 
  2534       if (__glibc_unlikely ((end != -1 && start > end)
  2535                             || token->type != OP_CLOSE_DUP_NUM))
  2536         {
  2537           /* First number greater than second.  */
  2538           *err = REG_BADBR;
  2539           return NULL;
  2540         }
  2541 
  2542       if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
  2543         {
  2544           *err = REG_ESIZE;
  2545           return NULL;
  2546         }
  2547     }
  2548   else
  2549     {
  2550       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
  2551       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
  2552     }
  2553 
  2554   fetch_token (token, regexp, syntax);
  2555 
  2556   if (__glibc_unlikely (elem == NULL))
  2557     return NULL;
  2558   if (__glibc_unlikely (start == 0 && end == 0))
  2559     {
  2560       postorder (elem, free_tree, NULL);
  2561       return NULL;
  2562     }
  2563 
  2564   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
  2565   if (__glibc_unlikely (start > 0))
  2566     {
  2567       tree = elem;
  2568       for (i = 2; i <= start; ++i)
  2569         {
  2570           elem = duplicate_tree (elem, dfa);
  2571           tree = create_tree (dfa, tree, elem, CONCAT);
  2572           if (__glibc_unlikely (elem == NULL || tree == NULL))
  2573             goto parse_dup_op_espace;
  2574         }
  2575 
  2576       if (start == end)
  2577         return tree;
  2578 
  2579       /* Duplicate ELEM before it is marked optional.  */
  2580       elem = duplicate_tree (elem, dfa);
  2581       if (__glibc_unlikely (elem == NULL))
  2582         goto parse_dup_op_espace;
  2583       old_tree = tree;
  2584     }
  2585   else
  2586     old_tree = NULL;
  2587 
  2588   if (elem->token.type == SUBEXP)
  2589     {
  2590       uintptr_t subidx = elem->token.opr.idx;
  2591       postorder (elem, mark_opt_subexp, (void *) subidx);
  2592     }
  2593 
  2594   tree = create_tree (dfa, elem, NULL,
  2595                       (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
  2596   if (__glibc_unlikely (tree == NULL))
  2597     goto parse_dup_op_espace;
  2598 
  2599   /* This loop is actually executed only when end != -1,
  2600      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
  2601      already created the start+1-th copy.  */
  2602   if (TYPE_SIGNED (Idx) || end != -1)
  2603     for (i = start + 2; i <= end; ++i)
  2604       {
  2605         elem = duplicate_tree (elem, dfa);
  2606         tree = create_tree (dfa, tree, elem, CONCAT);
  2607         if (__glibc_unlikely (elem == NULL || tree == NULL))
  2608           goto parse_dup_op_espace;
  2609 
  2610         tree = create_tree (dfa, tree, NULL, OP_ALT);
  2611         if (__glibc_unlikely (tree == NULL))
  2612           goto parse_dup_op_espace;
  2613       }
  2614 
  2615   if (old_tree)
  2616     tree = create_tree (dfa, old_tree, tree, CONCAT);
  2617 
  2618   return tree;
  2619 
  2620  parse_dup_op_espace:
  2621   *err = REG_ESPACE;
  2622   return NULL;
  2623 }
  2624 
  2625 /* Size of the names for collating symbol/equivalence_class/character_class.
  2626    I'm not sure, but maybe enough.  */
  2627 #define BRACKET_NAME_BUF_SIZE 32
  2628 
  2629 #ifndef _LIBC
  2630 
  2631 /* Convert the byte B to the corresponding wide character.  In a
  2632    unibyte locale, treat B as itself.  In a multibyte locale, return
  2633    WEOF if B is an encoding error.  */
  2634 static wint_t
  2635 parse_byte (unsigned char b, re_dfa_t const *dfa)
  2636 {
  2637   return dfa->mb_cur_max > 1 ? __btowc (b) : b;
  2638 }
  2639 
  2640 /* Local function for parse_bracket_exp used in _LIBC environment.
  2641    Build the range expression which starts from START_ELEM, and ends
  2642    at END_ELEM.  The result are written to MBCSET and SBCSET.
  2643    RANGE_ALLOC is the allocated size of mbcset->range_starts, and
  2644    mbcset->range_ends, is a pointer argument since we may
  2645    update it.  */
  2646 
  2647 static reg_errcode_t
  2648 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
  2649                  bracket_elem_t *start_elem, bracket_elem_t *end_elem,
  2650                  re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules,
  2651                  const unsigned char *collseqmb, const char *collseqwc,
  2652                  int_fast32_t table_size, const void *symb_table,
  2653                  const unsigned char *extra)
  2654 {
  2655   /* Equivalence Classes and Character Classes can't be a range start/end.  */
  2656   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
  2657                         || start_elem->type == CHAR_CLASS
  2658                         || end_elem->type == EQUIV_CLASS
  2659                         || end_elem->type == CHAR_CLASS))
  2660     return REG_ERANGE;
  2661 
  2662   /* We can handle no multi character collating elements without libc
  2663      support.  */
  2664   if (__glibc_unlikely ((start_elem->type == COLL_SYM
  2665                          && strlen ((char *) start_elem->opr.name) > 1)
  2666                         || (end_elem->type == COLL_SYM
  2667                             && strlen ((char *) end_elem->opr.name) > 1)))
  2668     return REG_ECOLLATE;
  2669 
  2670   unsigned int
  2671     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
  2672                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
  2673                    : 0)),
  2674     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
  2675               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
  2676                  : 0));
  2677   wint_t
  2678     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
  2679                 ? parse_byte (start_ch, dfa) : start_elem->opr.wch),
  2680     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
  2681               ? parse_byte (end_ch, dfa) : end_elem->opr.wch);
  2682 
  2683   if (start_wc == WEOF || end_wc == WEOF)
  2684     return REG_ECOLLATE;
  2685   else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
  2686                              && start_wc > end_wc))
  2687     return REG_ERANGE;
  2688 
  2689   /* Got valid collation sequence values, add them as a new entry.
  2690      However, for !_LIBC we have no collation elements: if the
  2691      character set is single byte, the single byte character set
  2692      that we build below suffices.  parse_bracket_exp passes
  2693      no MBCSET if dfa->mb_cur_max == 1.  */
  2694   if (dfa->mb_cur_max > 1)
  2695     {
  2696       /* Check the space of the arrays.  */
  2697       if (__glibc_unlikely (*range_alloc == mbcset->nranges))
  2698         {
  2699           /* There is not enough space, need realloc.  */
  2700           wchar_t *new_array_start, *new_array_end;
  2701           Idx new_nranges;
  2702 
  2703           /* +1 in case of mbcset->nranges is 0.  */
  2704           new_nranges = 2 * mbcset->nranges + 1;
  2705           /* Use realloc since mbcset->range_starts and mbcset->range_ends
  2706              are NULL if *range_alloc == 0.  */
  2707           new_array_start = re_realloc (mbcset->range_starts, wchar_t,
  2708                                         new_nranges);
  2709           new_array_end = re_realloc (mbcset->range_ends, wchar_t,
  2710                                       new_nranges);
  2711 
  2712           if (__glibc_unlikely (new_array_start == NULL
  2713                                 || new_array_end == NULL))
  2714             {
  2715               re_free (new_array_start);
  2716               re_free (new_array_end);
  2717               return REG_ESPACE;
  2718             }
  2719 
  2720           mbcset->range_starts = new_array_start;
  2721           mbcset->range_ends = new_array_end;
  2722           *range_alloc = new_nranges;
  2723         }
  2724 
  2725       mbcset->range_starts[mbcset->nranges] = start_wc;
  2726       mbcset->range_ends[mbcset->nranges++] = end_wc;
  2727     }
  2728 
  2729   /* Build the table for single byte characters.  */
  2730   for (wchar_t wc = 0; wc < SBC_MAX; ++wc)
  2731     {
  2732       if (start_wc <= wc && wc <= end_wc)
  2733         bitset_set (sbcset, wc);
  2734     }
  2735 
  2736   return REG_NOERROR;
  2737 }
  2738 #endif /* not _LIBC */
  2739 
  2740 #ifndef _LIBC
  2741 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
  2742    Build the collating element which is represented by NAME.
  2743    The result are written to MBCSET and SBCSET.
  2744    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
  2745    pointer argument since we may update it.  */
  2746 
  2747 static reg_errcode_t
  2748 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
  2749                         Idx *coll_sym_alloc, const unsigned char *name,
  2750                         uint_fast32_t nrules, int_fast32_t table_size,
  2751                         const void *symb_table, const unsigned char *extra)
  2752 {
  2753   size_t name_len = strlen ((const char *) name);
  2754   if (__glibc_unlikely (name_len != 1))
  2755     return REG_ECOLLATE;
  2756   else
  2757     {
  2758       bitset_set (sbcset, name[0]);
  2759       return REG_NOERROR;
  2760     }
  2761 }
  2762 #endif /* not _LIBC */
  2763 
  2764 #ifdef _LIBC
  2765 /* Local function for parse_bracket_exp used in _LIBC environment.
  2766    Seek the collating symbol entry corresponding to NAME.
  2767    Return the index of the symbol in the SYMB_TABLE,
  2768    or -1 if not found.  */
  2769 
  2770 static __always_inline int32_t
  2771 seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
  2772                              const int32_t *symb_table,
  2773                              int_fast32_t table_size,
  2774                              const unsigned char *extra)
  2775 {
  2776   int_fast32_t elem;
  2777 
  2778   for (elem = 0; elem < table_size; elem++)
  2779     if (symb_table[2 * elem] != 0)
  2780       {
  2781         int32_t idx = symb_table[2 * elem + 1];
  2782         /* Skip the name of collating element name.  */
  2783         idx += 1 + extra[idx];
  2784         if (/* Compare the length of the name.  */
  2785             name_len == extra[idx]
  2786             /* Compare the name.  */
  2787             && memcmp (name, &extra[idx + 1], name_len) == 0)
  2788           /* Yep, this is the entry.  */
  2789           return elem;
  2790       }
  2791   return -1;
  2792 }
  2793 
  2794 /* Local function for parse_bracket_exp used in _LIBC environment.
  2795    Look up the collation sequence value of BR_ELEM.
  2796    Return the value if succeeded, UINT_MAX otherwise.  */
  2797 
  2798 static __always_inline unsigned int
  2799 lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
  2800                                  const unsigned char *collseqmb,
  2801                                  const char *collseqwc,
  2802                                  int_fast32_t table_size,
  2803                                  const int32_t *symb_table,
  2804                                  const unsigned char *extra)
  2805 {
  2806   if (br_elem->type == SB_CHAR)
  2807     {
  2808       /* if (MB_CUR_MAX == 1) */
  2809       if (nrules == 0)
  2810         return collseqmb[br_elem->opr.ch];
  2811       else
  2812         {
  2813           wint_t wc = __btowc (br_elem->opr.ch);
  2814           return __collseq_table_lookup (collseqwc, wc);
  2815         }
  2816     }
  2817   else if (br_elem->type == MB_CHAR)
  2818     {
  2819       if (nrules != 0)
  2820         return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
  2821     }
  2822   else if (br_elem->type == COLL_SYM)
  2823     {
  2824       size_t sym_name_len = strlen ((char *) br_elem->opr.name);
  2825       if (nrules != 0)
  2826         {
  2827           int32_t elem, idx;
  2828           elem = seek_collating_symbol_entry (br_elem->opr.name,
  2829                                               sym_name_len,
  2830                                               symb_table, table_size,
  2831                                               extra);
  2832           if (elem != -1)
  2833             {
  2834               /* We found the entry.  */
  2835               idx = symb_table[2 * elem + 1];
  2836               /* Skip the name of collating element name.  */
  2837               idx += 1 + extra[idx];
  2838               /* Skip the byte sequence of the collating element.  */
  2839               idx += 1 + extra[idx];
  2840               /* Adjust for the alignment.  */
  2841               idx = (idx + 3) & ~3;
  2842               /* Skip the multibyte collation sequence value.  */
  2843               idx += sizeof (unsigned int);
  2844               /* Skip the wide char sequence of the collating element.  */
  2845               idx += sizeof (unsigned int) *
  2846                 (1 + *(unsigned int *) (extra + idx));
  2847               /* Return the collation sequence value.  */
  2848               return *(unsigned int *) (extra + idx);
  2849             }
  2850           else if (sym_name_len == 1)
  2851             {
  2852               /* No valid character.  Match it as a single byte
  2853                  character.  */
  2854               return collseqmb[br_elem->opr.name[0]];
  2855             }
  2856         }
  2857       else if (sym_name_len == 1)
  2858         return collseqmb[br_elem->opr.name[0]];
  2859     }
  2860   return UINT_MAX;
  2861 }
  2862 
  2863 /* Local function for parse_bracket_exp used in _LIBC environment.
  2864    Build the range expression which starts from START_ELEM, and ends
  2865    at END_ELEM.  The result are written to MBCSET and SBCSET.
  2866    RANGE_ALLOC is the allocated size of mbcset->range_starts, and
  2867    mbcset->range_ends, is a pointer argument since we may
  2868    update it.  */
  2869 
  2870 static __always_inline reg_errcode_t
  2871 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
  2872                  bracket_elem_t *start_elem, bracket_elem_t *end_elem,
  2873                  re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
  2874                  const unsigned char *collseqmb, const char *collseqwc,
  2875                  int_fast32_t table_size, const int32_t *symb_table,
  2876                  const unsigned char *extra)
  2877 {
  2878   unsigned int ch;
  2879   uint32_t start_collseq;
  2880   uint32_t end_collseq;
  2881 
  2882   /* Equivalence Classes and Character Classes can't be a range
  2883      start/end.  */
  2884   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
  2885                         || start_elem->type == CHAR_CLASS
  2886                         || end_elem->type == EQUIV_CLASS
  2887                         || end_elem->type == CHAR_CLASS))
  2888     return REG_ERANGE;
  2889 
  2890   /* FIXME: Implement rational ranges here, too.  */
  2891   start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
  2892                                                    table_size, symb_table, extra);
  2893   end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
  2894                                                  table_size, symb_table, extra);
  2895   /* Check start/end collation sequence values.  */
  2896   if (__glibc_unlikely (start_collseq == UINT_MAX
  2897                         || end_collseq == UINT_MAX))
  2898     return REG_ECOLLATE;
  2899   if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
  2900                         && start_collseq > end_collseq))
  2901     return REG_ERANGE;
  2902 
  2903   /* Got valid collation sequence values, add them as a new entry.
  2904      However, if we have no collation elements, and the character set
  2905      is single byte, the single byte character set that we
  2906      build below suffices. */
  2907   if (nrules > 0 || dfa->mb_cur_max > 1)
  2908     {
  2909       /* Check the space of the arrays.  */
  2910       if (__glibc_unlikely (*range_alloc == mbcset->nranges))
  2911         {
  2912           /* There is not enough space, need realloc.  */
  2913           uint32_t *new_array_start;
  2914           uint32_t *new_array_end;
  2915           int new_nranges;
  2916 
  2917           /* +1 in case of mbcset->nranges is 0.  */
  2918           new_nranges = 2 * mbcset->nranges + 1;
  2919           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
  2920                                         new_nranges);
  2921           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
  2922                                       new_nranges);
  2923 
  2924           if (__glibc_unlikely (new_array_start == NULL
  2925                                 || new_array_end == NULL))
  2926             return REG_ESPACE;
  2927 
  2928           mbcset->range_starts = new_array_start;
  2929           mbcset->range_ends = new_array_end;
  2930           *range_alloc = new_nranges;
  2931         }
  2932 
  2933       mbcset->range_starts[mbcset->nranges] = start_collseq;
  2934       mbcset->range_ends[mbcset->nranges++] = end_collseq;
  2935     }
  2936 
  2937   /* Build the table for single byte characters.  */
  2938   for (ch = 0; ch < SBC_MAX; ch++)
  2939     {
  2940       uint32_t ch_collseq;
  2941       /* if (MB_CUR_MAX == 1) */
  2942       if (nrules == 0)
  2943         ch_collseq = collseqmb[ch];
  2944       else
  2945         ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
  2946       if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
  2947         bitset_set (sbcset, ch);
  2948     }
  2949   return REG_NOERROR;
  2950 }
  2951 
  2952 /* Local function for parse_bracket_exp used in _LIBC environment.
  2953    Build the collating element which is represented by NAME.
  2954    The result are written to MBCSET and SBCSET.
  2955    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
  2956    pointer argument since we may update it.  */
  2957 
  2958 static __always_inline reg_errcode_t
  2959 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
  2960                         Idx *coll_sym_alloc, const unsigned char *name,
  2961                         uint_fast32_t nrules, int_fast32_t table_size,
  2962                         const int32_t *symb_table, const unsigned char *extra)
  2963 {
  2964   int32_t elem, idx;
  2965   size_t name_len = strlen ((const char *) name);
  2966   if (nrules != 0)
  2967     {
  2968       elem = seek_collating_symbol_entry (name, name_len, symb_table,
  2969                                           table_size, extra);
  2970       if (elem != -1)
  2971         {
  2972           /* We found the entry.  */
  2973           idx = symb_table[2 * elem + 1];
  2974           /* Skip the name of collating element name.  */
  2975           idx += 1 + extra[idx];
  2976         }
  2977       else if (name_len == 1)
  2978         {
  2979           /* No valid character, treat it as a normal
  2980              character.  */
  2981           bitset_set (sbcset, name[0]);
  2982           return REG_NOERROR;
  2983         }
  2984       else
  2985         return REG_ECOLLATE;
  2986 
  2987       /* Got valid collation sequence, add it as a new entry.  */
  2988       /* Check the space of the arrays.  */
  2989       if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
  2990         {
  2991           /* Not enough, realloc it.  */
  2992           /* +1 in case of mbcset->ncoll_syms is 0.  */
  2993           int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
  2994           /* Use realloc since mbcset->coll_syms is NULL
  2995              if *alloc == 0.  */
  2996           int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
  2997                                                new_coll_sym_alloc);
  2998           if (__glibc_unlikely (new_coll_syms == NULL))
  2999             return REG_ESPACE;
  3000           mbcset->coll_syms = new_coll_syms;
  3001           *coll_sym_alloc = new_coll_sym_alloc;
  3002         }
  3003       mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
  3004       return REG_NOERROR;
  3005     }
  3006   else
  3007     {
  3008       if (__glibc_unlikely (name_len != 1))
  3009         return REG_ECOLLATE;
  3010       else
  3011         {
  3012           bitset_set (sbcset, name[0]);
  3013           return REG_NOERROR;
  3014         }
  3015     }
  3016 }
  3017 #endif /* _LIBC */
  3018 
  3019 /* This function parse bracket expression like "[abc]", "[a-c]",
  3020    "[[.a-a.]]" etc.  */
  3021 
  3022 static bin_tree_t *
  3023 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
  3024                    reg_syntax_t syntax, reg_errcode_t *err)
  3025 {
  3026   const unsigned char *collseqmb = NULL;
  3027   const char *collseqwc = NULL;
  3028   uint_fast32_t nrules = 0;
  3029   int_fast32_t table_size = 0;
  3030   const void *symb_table = NULL;
  3031   const unsigned char *extra = NULL;
  3032 
  3033   re_token_t br_token;
  3034   re_bitset_ptr_t sbcset;
  3035   re_charset_t *mbcset;
  3036   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
  3037   Idx equiv_class_alloc = 0, char_class_alloc = 0;
  3038   bool non_match = false;
  3039   bin_tree_t *work_tree;
  3040   int token_len;
  3041   bool first_round = true;
  3042 #ifdef _LIBC
  3043   collseqmb = (const unsigned char *)
  3044     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
  3045   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
  3046   if (nrules)
  3047     {
  3048       /*
  3049       if (MB_CUR_MAX > 1)
  3050       */
  3051       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
  3052       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
  3053       symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB);
  3054       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
  3055                                                    _NL_COLLATE_SYMB_EXTRAMB);
  3056     }
  3057 #endif
  3058   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
  3059   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
  3060   if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
  3061     {
  3062       re_free (sbcset);
  3063       re_free (mbcset);
  3064       *err = REG_ESPACE;
  3065       return NULL;
  3066     }
  3067 
  3068   token_len = peek_token_bracket (token, regexp, syntax);
  3069   if (__glibc_unlikely (token->type == END_OF_RE))
  3070     {
  3071       *err = REG_BADPAT;
  3072       goto parse_bracket_exp_free_return;
  3073     }
  3074   if (token->type == OP_NON_MATCH_LIST)
  3075     {
  3076       mbcset->non_match = 1;
  3077       non_match = true;
  3078       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
  3079         bitset_set (sbcset, '\n');
  3080       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
  3081       token_len = peek_token_bracket (token, regexp, syntax);
  3082       if (__glibc_unlikely (token->type == END_OF_RE))
  3083         {
  3084           *err = REG_BADPAT;
  3085           goto parse_bracket_exp_free_return;
  3086         }
  3087     }
  3088 
  3089   /* We treat the first ']' as a normal character.  */
  3090   if (token->type == OP_CLOSE_BRACKET)
  3091     token->type = CHARACTER;
  3092 
  3093   while (1)
  3094     {
  3095       bracket_elem_t start_elem, end_elem;
  3096       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
  3097       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
  3098       reg_errcode_t ret;
  3099       int token_len2 = 0;
  3100       bool is_range_exp = false;
  3101       re_token_t token2;
  3102 
  3103       start_elem.opr.name = start_name_buf;
  3104       start_elem.type = COLL_SYM;
  3105       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
  3106                                    syntax, first_round);
  3107       if (__glibc_unlikely (ret != REG_NOERROR))
  3108         {
  3109           *err = ret;
  3110           goto parse_bracket_exp_free_return;
  3111         }
  3112       first_round = false;
  3113 
  3114       /* Get information about the next token.  We need it in any case.  */
  3115       token_len = peek_token_bracket (token, regexp, syntax);
  3116 
  3117       /* Do not check for ranges if we know they are not allowed.  */
  3118       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
  3119         {
  3120           if (__glibc_unlikely (token->type == END_OF_RE))
  3121             {
  3122               *err = REG_EBRACK;
  3123               goto parse_bracket_exp_free_return;
  3124             }
  3125           if (token->type == OP_CHARSET_RANGE)
  3126             {
  3127               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
  3128               token_len2 = peek_token_bracket (&token2, regexp, syntax);
  3129               if (__glibc_unlikely (token2.type == END_OF_RE))
  3130                 {
  3131                   *err = REG_EBRACK;
  3132                   goto parse_bracket_exp_free_return;
  3133                 }
  3134               if (token2.type == OP_CLOSE_BRACKET)
  3135                 {
  3136                   /* We treat the last '-' as a normal character.  */
  3137                   re_string_skip_bytes (regexp, -token_len);
  3138                   token->type = CHARACTER;
  3139                 }
  3140               else
  3141                 is_range_exp = true;
  3142             }
  3143         }
  3144 
  3145       if (is_range_exp == true)
  3146         {
  3147           end_elem.opr.name = end_name_buf;
  3148           end_elem.type = COLL_SYM;
  3149           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
  3150                                        dfa, syntax, true);
  3151           if (__glibc_unlikely (ret != REG_NOERROR))
  3152             {
  3153               *err = ret;
  3154               goto parse_bracket_exp_free_return;
  3155             }
  3156 
  3157           token_len = peek_token_bracket (token, regexp, syntax);
  3158 
  3159           *err = build_range_exp (sbcset, mbcset, &range_alloc,
  3160                                   &start_elem, &end_elem,
  3161                                   dfa, syntax, nrules, collseqmb, collseqwc,
  3162                                   table_size, symb_table, extra);
  3163           if (__glibc_unlikely (*err != REG_NOERROR))
  3164             goto parse_bracket_exp_free_return;
  3165         }
  3166       else
  3167         {
  3168           switch (start_elem.type)
  3169             {
  3170             case SB_CHAR:
  3171               bitset_set (sbcset, start_elem.opr.ch);
  3172               break;
  3173             case MB_CHAR:
  3174               /* Check whether the array has enough space.  */
  3175               if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
  3176                 {
  3177                   wchar_t *new_mbchars;
  3178                   /* Not enough, realloc it.  */
  3179                   /* +1 in case of mbcset->nmbchars is 0.  */
  3180                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
  3181                   /* Use realloc since array is NULL if *alloc == 0.  */
  3182                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
  3183                                             mbchar_alloc);
  3184                   if (__glibc_unlikely (new_mbchars == NULL))
  3185                     goto parse_bracket_exp_espace;
  3186                   mbcset->mbchars = new_mbchars;
  3187                 }
  3188               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
  3189               break;
  3190             case EQUIV_CLASS:
  3191               *err = build_equiv_class (sbcset,
  3192                                         mbcset, &equiv_class_alloc,
  3193                                         start_elem.opr.name);
  3194               if (__glibc_unlikely (*err != REG_NOERROR))
  3195                 goto parse_bracket_exp_free_return;
  3196               break;
  3197             case COLL_SYM:
  3198               *err = build_collating_symbol (sbcset,
  3199                                              mbcset, &coll_sym_alloc,
  3200                                              start_elem.opr.name,
  3201                                              nrules, table_size, symb_table, extra);
  3202               if (__glibc_unlikely (*err != REG_NOERROR))
  3203                 goto parse_bracket_exp_free_return;
  3204               break;
  3205             case CHAR_CLASS:
  3206               *err = build_charclass (regexp->trans, sbcset,
  3207                                       mbcset, &char_class_alloc,
  3208                                       (const char *) start_elem.opr.name,
  3209                                       syntax);
  3210               if (__glibc_unlikely (*err != REG_NOERROR))
  3211                goto parse_bracket_exp_free_return;
  3212               break;
  3213             default:
  3214               DEBUG_ASSERT (false);
  3215               break;
  3216             }
  3217         }
  3218       if (__glibc_unlikely (token->type == END_OF_RE))
  3219         {
  3220           *err = REG_EBRACK;
  3221           goto parse_bracket_exp_free_return;
  3222         }
  3223       if (token->type == OP_CLOSE_BRACKET)
  3224         break;
  3225     }
  3226 
  3227   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
  3228 
  3229   /* If it is non-matching list.  */
  3230   if (non_match)
  3231     bitset_not (sbcset);
  3232 
  3233   /* Ensure only single byte characters are set.  */
  3234   if (dfa->mb_cur_max > 1)
  3235     bitset_mask (sbcset, dfa->sb_char);
  3236 
  3237   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
  3238       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
  3239                                                      || mbcset->non_match)))
  3240     {
  3241       bin_tree_t *mbc_tree;
  3242       int sbc_idx;
  3243       /* Build a tree for complex bracket.  */
  3244       dfa->has_mb_node = 1;
  3245       br_token.type = COMPLEX_BRACKET;
  3246       br_token.opr.mbcset = mbcset;
  3247       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
  3248       if (__glibc_unlikely (mbc_tree == NULL))
  3249         goto parse_bracket_exp_espace;
  3250       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
  3251         if (sbcset[sbc_idx])
  3252           break;
  3253       /* If there are no bits set in sbcset, there is no point
  3254          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
  3255       if (sbc_idx < BITSET_WORDS)
  3256         {
  3257           /* Build a tree for simple bracket.  */
  3258           br_token.type = SIMPLE_BRACKET;
  3259           br_token.opr.sbcset = sbcset;
  3260           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
  3261           if (__glibc_unlikely (work_tree == NULL))
  3262             goto parse_bracket_exp_espace;
  3263 
  3264           /* Then join them by ALT node.  */
  3265           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
  3266           if (__glibc_unlikely (work_tree == NULL))
  3267             goto parse_bracket_exp_espace;
  3268         }
  3269       else
  3270         {
  3271           re_free (sbcset);
  3272           work_tree = mbc_tree;
  3273         }
  3274     }
  3275   else
  3276     {
  3277       free_charset (mbcset);
  3278       /* Build a tree for simple bracket.  */
  3279       br_token.type = SIMPLE_BRACKET;
  3280       br_token.opr.sbcset = sbcset;
  3281       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
  3282       if (__glibc_unlikely (work_tree == NULL))
  3283         goto parse_bracket_exp_espace;
  3284     }
  3285   return work_tree;
  3286 
  3287  parse_bracket_exp_espace:
  3288   *err = REG_ESPACE;
  3289  parse_bracket_exp_free_return:
  3290   re_free (sbcset);
  3291   free_charset (mbcset);
  3292   return NULL;
  3293 }
  3294 
  3295 /* Parse an element in the bracket expression.  */
  3296 
  3297 static reg_errcode_t
  3298 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
  3299                        re_token_t *token, int token_len, re_dfa_t *dfa,
  3300                        reg_syntax_t syntax, bool accept_hyphen)
  3301 {
  3302   int cur_char_size;
  3303   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
  3304   if (cur_char_size > 1)
  3305     {
  3306       elem->type = MB_CHAR;
  3307       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
  3308       re_string_skip_bytes (regexp, cur_char_size);
  3309       return REG_NOERROR;
  3310     }
  3311   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
  3312   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
  3313       || token->type == OP_OPEN_EQUIV_CLASS)
  3314     return parse_bracket_symbol (elem, regexp, token);
  3315   if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
  3316     {
  3317       /* A '-' must only appear as anything but a range indicator before
  3318          the closing bracket.  Everything else is an error.  */
  3319       re_token_t token2;
  3320       (void) peek_token_bracket (&token2, regexp, syntax);
  3321       if (token2.type != OP_CLOSE_BRACKET)
  3322         /* The actual error value is not standardized since this whole
  3323            case is undefined.  But ERANGE makes good sense.  */
  3324         return REG_ERANGE;
  3325     }
  3326   elem->type = SB_CHAR;
  3327   elem->opr.ch = token->opr.c;
  3328   return REG_NOERROR;
  3329 }
  3330 
  3331 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
  3332    such as [:<character_class>:], [.<collating_element>.], and
  3333    [=<equivalent_class>=].  */
  3334 
  3335 static reg_errcode_t
  3336 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
  3337                       re_token_t *token)
  3338 {
  3339   unsigned char ch, delim = token->opr.c;
  3340   int i = 0;
  3341   if (re_string_eoi(regexp))
  3342     return REG_EBRACK;
  3343   for (;; ++i)
  3344     {
  3345       if (i >= BRACKET_NAME_BUF_SIZE)
  3346         return REG_EBRACK;
  3347       if (token->type == OP_OPEN_CHAR_CLASS)
  3348         ch = re_string_fetch_byte_case (regexp);
  3349       else
  3350         ch = re_string_fetch_byte (regexp);
  3351       if (re_string_eoi(regexp))
  3352         return REG_EBRACK;
  3353       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
  3354         break;
  3355       elem->opr.name[i] = ch;
  3356     }
  3357   re_string_skip_bytes (regexp, 1);
  3358   elem->opr.name[i] = '\0';
  3359   switch (token->type)
  3360     {
  3361     case OP_OPEN_COLL_ELEM:
  3362       elem->type = COLL_SYM;
  3363       break;
  3364     case OP_OPEN_EQUIV_CLASS:
  3365       elem->type = EQUIV_CLASS;
  3366       break;
  3367     case OP_OPEN_CHAR_CLASS:
  3368       elem->type = CHAR_CLASS;
  3369       break;
  3370     default:
  3371       break;
  3372     }
  3373   return REG_NOERROR;
  3374 }
  3375 
  3376   /* Helper function for parse_bracket_exp.
  3377      Build the equivalence class which is represented by NAME.
  3378      The result are written to MBCSET and SBCSET.
  3379      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
  3380      is a pointer argument since we may update it.  */
  3381 
  3382 static reg_errcode_t
  3383 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
  3384                    Idx *equiv_class_alloc, const unsigned char *name)
  3385 {
  3386 #ifdef _LIBC
  3387   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
  3388   if (nrules != 0)
  3389     {
  3390       const int32_t *table, *indirect;
  3391       const unsigned char *weights, *extra, *cp;
  3392       unsigned char char_buf[2];
  3393       int32_t idx1, idx2;
  3394       unsigned int ch;
  3395       size_t len;
  3396       /* Calculate the index for equivalence class.  */
  3397       cp = name;
  3398       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
  3399       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
  3400                                                _NL_COLLATE_WEIGHTMB);
  3401       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
  3402                                                    _NL_COLLATE_EXTRAMB);
  3403       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
  3404                                                 _NL_COLLATE_INDIRECTMB);
  3405       idx1 = findidx (table, indirect, extra, &cp, -1);
  3406       if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
  3407         /* This isn't a valid character.  */
  3408         return REG_ECOLLATE;
  3409 
  3410       /* Build single byte matching table for this equivalence class.  */
  3411       len = weights[idx1 & 0xffffff];
  3412       for (ch = 0; ch < SBC_MAX; ++ch)
  3413         {
  3414           char_buf[0] = ch;
  3415           cp = char_buf;
  3416           idx2 = findidx (table, indirect, extra, &cp, 1);
  3417 /*
  3418           idx2 = table[ch];
  3419 */
  3420           if (idx2 == 0)
  3421             /* This isn't a valid character.  */
  3422             continue;
  3423           /* Compare only if the length matches and the collation rule
  3424              index is the same.  */
  3425           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
  3426               && memcmp (weights + (idx1 & 0xffffff) + 1,
  3427                          weights + (idx2 & 0xffffff) + 1, len) == 0)
  3428             bitset_set (sbcset, ch);
  3429         }
  3430       /* Check whether the array has enough space.  */
  3431       if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
  3432         {
  3433           /* Not enough, realloc it.  */
  3434           /* +1 in case of mbcset->nequiv_classes is 0.  */
  3435           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
  3436           /* Use realloc since the array is NULL if *alloc == 0.  */
  3437           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
  3438                                                    int32_t,
  3439                                                    new_equiv_class_alloc);
  3440           if (__glibc_unlikely (new_equiv_classes == NULL))
  3441             return REG_ESPACE;
  3442           mbcset->equiv_classes = new_equiv_classes;
  3443           *equiv_class_alloc = new_equiv_class_alloc;
  3444         }
  3445       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
  3446     }
  3447   else
  3448 #endif /* _LIBC */
  3449     {
  3450       if (__glibc_unlikely (strlen ((const char *) name) != 1))
  3451         return REG_ECOLLATE;
  3452       bitset_set (sbcset, *name);
  3453     }
  3454   return REG_NOERROR;
  3455 }
  3456 
  3457   /* Helper function for parse_bracket_exp.
  3458      Build the character class which is represented by NAME.
  3459      The result are written to MBCSET and SBCSET.
  3460      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
  3461      is a pointer argument since we may update it.  */
  3462 
  3463 static reg_errcode_t
  3464 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
  3465                  re_charset_t *mbcset, Idx *char_class_alloc,
  3466                  const char *class_name, reg_syntax_t syntax)
  3467 {
  3468   int i;
  3469   const char *name = class_name;
  3470 
  3471   /* In case of REG_ICASE "upper" and "lower" match the both of
  3472      upper and lower cases.  */
  3473   if ((syntax & RE_ICASE)
  3474       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
  3475     name = "alpha";
  3476 
  3477   /* Check the space of the arrays.  */
  3478   if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
  3479     {
  3480       /* Not enough, realloc it.  */
  3481       /* +1 in case of mbcset->nchar_classes is 0.  */
  3482       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
  3483       /* Use realloc since array is NULL if *alloc == 0.  */
  3484       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
  3485                                                new_char_class_alloc);
  3486       if (__glibc_unlikely (new_char_classes == NULL))
  3487         return REG_ESPACE;
  3488       mbcset->char_classes = new_char_classes;
  3489       *char_class_alloc = new_char_class_alloc;
  3490     }
  3491   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
  3492 
  3493 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
  3494   do {                                          \
  3495     if (__glibc_unlikely (trans != NULL))                       \
  3496       {                                         \
  3497         for (i = 0; i < SBC_MAX; ++i)           \
  3498           if (ctype_func (i))                   \
  3499             bitset_set (sbcset, trans[i]);      \
  3500       }                                         \
  3501     else                                        \
  3502       {                                         \
  3503         for (i = 0; i < SBC_MAX; ++i)           \
  3504           if (ctype_func (i))                   \
  3505             bitset_set (sbcset, i);             \
  3506       }                                         \
  3507   } while (0)
  3508 
  3509   if (strcmp (name, "alnum") == 0)
  3510     BUILD_CHARCLASS_LOOP (isalnum);
  3511   else if (strcmp (name, "cntrl") == 0)
  3512     BUILD_CHARCLASS_LOOP (iscntrl);
  3513   else if (strcmp (name, "lower") == 0)
  3514     BUILD_CHARCLASS_LOOP (islower);
  3515   else if (strcmp (name, "space") == 0)
  3516     BUILD_CHARCLASS_LOOP (isspace);
  3517   else if (strcmp (name, "alpha") == 0)
  3518     BUILD_CHARCLASS_LOOP (isalpha);
  3519   else if (strcmp (name, "digit") == 0)
  3520     BUILD_CHARCLASS_LOOP (isdigit);
  3521   else if (strcmp (name, "print") == 0)
  3522     BUILD_CHARCLASS_LOOP (isprint);
  3523   else if (strcmp (name, "upper") == 0)
  3524     BUILD_CHARCLASS_LOOP (isupper);
  3525   else if (strcmp (name, "blank") == 0)
  3526     BUILD_CHARCLASS_LOOP (isblank);
  3527   else if (strcmp (name, "graph") == 0)
  3528     BUILD_CHARCLASS_LOOP (isgraph);
  3529   else if (strcmp (name, "punct") == 0)
  3530     BUILD_CHARCLASS_LOOP (ispunct);
  3531   else if (strcmp (name, "xdigit") == 0)
  3532     BUILD_CHARCLASS_LOOP (isxdigit);
  3533   else
  3534     return REG_ECTYPE;
  3535 
  3536   return REG_NOERROR;
  3537 }
  3538 
  3539 static bin_tree_t *
  3540 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
  3541                     const char *class_name,
  3542                     const char *extra, bool non_match,
  3543                     reg_errcode_t *err)
  3544 {
  3545   re_bitset_ptr_t sbcset;
  3546   re_charset_t *mbcset;
  3547   Idx alloc = 0;
  3548   reg_errcode_t ret;
  3549   bin_tree_t *tree;
  3550 
  3551   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
  3552   if (__glibc_unlikely (sbcset == NULL))
  3553     {
  3554       *err = REG_ESPACE;
  3555       return NULL;
  3556     }
  3557   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
  3558   if (__glibc_unlikely (mbcset == NULL))
  3559     {
  3560       re_free (sbcset);
  3561       *err = REG_ESPACE;
  3562       return NULL;
  3563     }
  3564   mbcset->non_match = non_match;
  3565 
  3566   /* We don't care the syntax in this case.  */
  3567   ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0);
  3568 
  3569   if (__glibc_unlikely (ret != REG_NOERROR))
  3570     {
  3571       re_free (sbcset);
  3572       free_charset (mbcset);
  3573       *err = ret;
  3574       return NULL;
  3575     }
  3576   /* \w match '_' also.  */
  3577   for (; *extra; extra++)
  3578     bitset_set (sbcset, *extra);
  3579 
  3580   /* If it is non-matching list.  */
  3581   if (non_match)
  3582     bitset_not (sbcset);
  3583 
  3584   /* Ensure only single byte characters are set.  */
  3585   if (dfa->mb_cur_max > 1)
  3586     bitset_mask (sbcset, dfa->sb_char);
  3587 
  3588   /* Build a tree for simple bracket.  */
  3589   re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
  3590   tree = create_token_tree (dfa, NULL, NULL, &br_token);
  3591   if (__glibc_unlikely (tree == NULL))
  3592     goto build_word_op_espace;
  3593 
  3594   if (dfa->mb_cur_max > 1)
  3595     {
  3596       bin_tree_t *mbc_tree;
  3597       /* Build a tree for complex bracket.  */
  3598       br_token.type = COMPLEX_BRACKET;
  3599       br_token.opr.mbcset = mbcset;
  3600       dfa->has_mb_node = 1;
  3601       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
  3602       if (__glibc_unlikely (mbc_tree == NULL))
  3603         goto build_word_op_espace;
  3604       /* Then join them by ALT node.  */
  3605       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
  3606       if (__glibc_likely (mbc_tree != NULL))
  3607         return tree;
  3608     }
  3609   else
  3610     {
  3611       free_charset (mbcset);
  3612       return tree;
  3613     }
  3614 
  3615  build_word_op_espace:
  3616   re_free (sbcset);
  3617   free_charset (mbcset);
  3618   *err = REG_ESPACE;
  3619   return NULL;
  3620 }
  3621 
  3622 /* This is intended for the expressions like "a{1,3}".
  3623    Fetch a number from 'input', and return the number.
  3624    Return -1 if the number field is empty like "{,1}".
  3625    Return RE_DUP_MAX + 1 if the number field is too large.
  3626    Return -2 if an error occurred.  */
  3627 
  3628 static Idx
  3629 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
  3630 {
  3631   Idx num = -1;
  3632   unsigned char c;
  3633   while (1)
  3634     {
  3635       fetch_token (token, input, syntax);
  3636       c = token->opr.c;
  3637       if (__glibc_unlikely (token->type == END_OF_RE))
  3638         return -2;
  3639       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
  3640         break;
  3641       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
  3642              ? -2
  3643              : num == -1
  3644              ? c - '0'
  3645              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
  3646     }
  3647   return num;
  3648 }
  3649 
  3650 static void
  3651 free_charset (re_charset_t *cset)
  3652 {
  3653   re_free (cset->mbchars);
  3654 #ifdef _LIBC
  3655   re_free (cset->coll_syms);
  3656   re_free (cset->equiv_classes);
  3657 #endif
  3658   re_free (cset->range_starts);
  3659   re_free (cset->range_ends);
  3660   re_free (cset->char_classes);
  3661   re_free (cset);
  3662 }
  3663 
  3664 /* Functions for binary tree operation.  */
  3665 
  3666 /* Create a tree node.  */
  3667 
  3668 static bin_tree_t *
  3669 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
  3670              re_token_type_t type)
  3671 {
  3672   re_token_t t = { .type = type };
  3673   return create_token_tree (dfa, left, right, &t);
  3674 }
  3675 
  3676 static bin_tree_t *
  3677 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
  3678                    const re_token_t *token)
  3679 {
  3680   bin_tree_t *tree;
  3681   if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
  3682     {
  3683       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
  3684 
  3685       if (storage == NULL)
  3686         return NULL;
  3687       storage->next = dfa->str_tree_storage;
  3688       dfa->str_tree_storage = storage;
  3689       dfa->str_tree_storage_idx = 0;
  3690     }
  3691   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
  3692 
  3693   tree->parent = NULL;
  3694   tree->left = left;
  3695   tree->right = right;
  3696   tree->token = *token;
  3697   tree->token.duplicated = 0;
  3698   tree->token.opt_subexp = 0;
  3699   tree->first = NULL;
  3700   tree->next = NULL;
  3701   tree->node_idx = -1;
  3702 
  3703   if (left != NULL)
  3704     left->parent = tree;
  3705   if (right != NULL)
  3706     right->parent = tree;
  3707   return tree;
  3708 }
  3709 
  3710 /* Mark the tree SRC as an optional subexpression.
  3711    To be called from preorder or postorder.  */
  3712 
  3713 static reg_errcode_t
  3714 mark_opt_subexp (void *extra, bin_tree_t *node)
  3715 {
  3716   Idx idx = (uintptr_t) extra;
  3717   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
  3718     node->token.opt_subexp = 1;
  3719 
  3720   return REG_NOERROR;
  3721 }
  3722 
  3723 /* Free the allocated memory inside NODE. */
  3724 
  3725 static void
  3726 free_token (re_token_t *node)
  3727 {
  3728   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
  3729     free_charset (node->opr.mbcset);
  3730   else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
  3731     re_free (node->opr.sbcset);
  3732 }
  3733 
  3734 /* Worker function for tree walking.  Free the allocated memory inside NODE
  3735    and its children. */
  3736 
  3737 static reg_errcode_t
  3738 free_tree (void *extra, bin_tree_t *node)
  3739 {
  3740   free_token (&node->token);
  3741   return REG_NOERROR;
  3742 }
  3743 
  3744 
  3745 /* Duplicate the node SRC, and return new node.  This is a preorder
  3746    visit similar to the one implemented by the generic visitor, but
  3747    we need more infrastructure to maintain two parallel trees --- so,
  3748    it's easier to duplicate.  */
  3749 
  3750 static bin_tree_t *
  3751 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
  3752 {
  3753   const bin_tree_t *node;
  3754   bin_tree_t *dup_root;
  3755   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
  3756 
  3757   for (node = root; ; )
  3758     {
  3759       /* Create a new tree and link it back to the current parent.  */
  3760       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
  3761       if (*p_new == NULL)
  3762         return NULL;
  3763       (*p_new)->parent = dup_node;
  3764       (*p_new)->token.duplicated = 1;
  3765       dup_node = *p_new;
  3766 
  3767       /* Go to the left node, or up and to the right.  */
  3768       if (node->left)
  3769         {
  3770           node = node->left;
  3771           p_new = &dup_node->left;
  3772         }
  3773       else
  3774         {
  3775           const bin_tree_t *prev = NULL;
  3776           while (node->right == prev || node->right == NULL)
  3777             {
  3778               prev = node;
  3779               node = node->parent;
  3780               dup_node = dup_node->parent;
  3781               if (!node)
  3782                 return dup_root;
  3783             }
  3784           node = node->right;
  3785           p_new = &dup_node->right;
  3786         }
  3787     }
  3788 }
/* [<][>][^][v][top][bottom][index][help] */
root/lib/regcomp.c

DEFINITIONS