lib/regex_internal.c

/* [<][>][^][v][top][bottom][index][help] */
This source file includes following definitions.
re_string_allocate
re_string_construct
re_string_realloc_buffers
re_string_construct_common
build_wcs_buffer
build_wcs_upper_buffer
re_string_skip_chars
build_upper_buffer
re_string_translate_buffer
re_string_reconstruct
re_string_peek_byte_case
re_string_fetch_byte_case
re_string_destruct
re_string_context_at
re_node_set_alloc
re_node_set_init_1
re_node_set_init_2
re_node_set_init_copy
re_node_set_add_intersect
re_node_set_init_union
re_node_set_merge
re_node_set_insert
re_node_set_insert_last
re_node_set_compare
re_node_set_contains
re_node_set_remove_at
re_dfa_add_node
calc_state_hash
re_acquire_state
re_acquire_state_context
register_state
free_state
create_ci_newstate
create_cd_newstate
     1 /* Extended regular expression matching and search library.
     2    Copyright (C) 2002-2023 Free Software Foundation, Inc.
     3    This file is part of the GNU C Library.
     4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
     5 
     6    The GNU C Library is free software; you can redistribute it and/or
     7    modify it under the terms of the GNU Lesser General Public
     8    License as published by the Free Software Foundation; either
     9    version 2.1 of the License, or (at your option) any later version.
    10 
    11    The GNU C Library is distributed in the hope that it will be useful,
    12    but WITHOUT ANY WARRANTY; without even the implied warranty of
    13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    14    Lesser General Public License for more details.
    15 
    16    You should have received a copy of the GNU Lesser General Public
    17    License along with the GNU C Library; if not, see
    18    <https://www.gnu.org/licenses/>.  */
    19 
    20 static void re_string_construct_common (const char *str, Idx len,
    21                                         re_string_t *pstr,
    22                                         RE_TRANSLATE_TYPE trans, bool icase,
    23                                         const re_dfa_t *dfa);
    24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
    25                                           const re_node_set *nodes,
    26                                           re_hashval_t hash);
    27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
    28                                           const re_node_set *nodes,
    29                                           unsigned int context,
    30                                           re_hashval_t hash);
    31 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
    32                                                 Idx new_buf_len);
    33 static void build_wcs_buffer (re_string_t *pstr);
    34 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
    35 static void build_upper_buffer (re_string_t *pstr);
    36 static void re_string_translate_buffer (re_string_t *pstr);
    37 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
    38                                           int eflags) __attribute__ ((pure));
    39 
    40 /* Functions for string operation.  */
    41 
    42 /* This function allocate the buffers.  It is necessary to call
    43    re_string_reconstruct before using the object.  */
    44 
    45 static reg_errcode_t
    46 __attribute_warn_unused_result__
    47 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
    48                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
    49 {
    50   reg_errcode_t ret;
    51   Idx init_buf_len;
    52 
    53   /* Ensure at least one character fits into the buffers.  */
    54   if (init_len < dfa->mb_cur_max)
    55     init_len = dfa->mb_cur_max;
    56   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
    57   re_string_construct_common (str, len, pstr, trans, icase, dfa);
    58 
    59   ret = re_string_realloc_buffers (pstr, init_buf_len);
    60   if (__glibc_unlikely (ret != REG_NOERROR))
    61     return ret;
    62 
    63   pstr->word_char = dfa->word_char;
    64   pstr->word_ops_used = dfa->word_ops_used;
    65   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
    66   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
    67   pstr->valid_raw_len = pstr->valid_len;
    68   return REG_NOERROR;
    69 }
    70 
    71 /* This function allocate the buffers, and initialize them.  */
    72 
    73 static reg_errcode_t
    74 __attribute_warn_unused_result__
    75 re_string_construct (re_string_t *pstr, const char *str, Idx len,
    76                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
    77 {
    78   reg_errcode_t ret;
    79   memset (pstr, '\0', sizeof (re_string_t));
    80   re_string_construct_common (str, len, pstr, trans, icase, dfa);
    81 
    82   if (len > 0)
    83     {
    84       ret = re_string_realloc_buffers (pstr, len + 1);
    85       if (__glibc_unlikely (ret != REG_NOERROR))
    86         return ret;
    87     }
    88   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
    89 
    90   if (icase)
    91     {
    92       if (dfa->mb_cur_max > 1)
    93         {
    94           while (1)
    95             {
    96               ret = build_wcs_upper_buffer (pstr);
    97               if (__glibc_unlikely (ret != REG_NOERROR))
    98                 return ret;
    99               if (pstr->valid_raw_len >= len)
   100                 break;
   101               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
   102                 break;
   103               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
   104               if (__glibc_unlikely (ret != REG_NOERROR))
   105                 return ret;
   106             }
   107         }
   108       else
   109         build_upper_buffer (pstr);
   110     }
   111   else
   112     {
   113       if (dfa->mb_cur_max > 1)
   114         build_wcs_buffer (pstr);
   115       else
   116         {
   117           if (trans != NULL)
   118             re_string_translate_buffer (pstr);
   119           else
   120             {
   121               pstr->valid_len = pstr->bufs_len;
   122               pstr->valid_raw_len = pstr->bufs_len;
   123             }
   124         }
   125     }
   126 
   127   return REG_NOERROR;
   128 }
   129 
   130 /* Helper functions for re_string_allocate, and re_string_construct.  */
   131 
   132 static reg_errcode_t
   133 __attribute_warn_unused_result__
   134 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
   135 {
   136   if (pstr->mb_cur_max > 1)
   137     {
   138       wint_t *new_wcs;
   139 
   140       /* Avoid overflow in realloc.  */
   141       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
   142       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
   143                             < new_buf_len))
   144         return REG_ESPACE;
   145 
   146       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
   147       if (__glibc_unlikely (new_wcs == NULL))
   148         return REG_ESPACE;
   149       pstr->wcs = new_wcs;
   150       if (pstr->offsets != NULL)
   151         {
   152           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
   153           if (__glibc_unlikely (new_offsets == NULL))
   154             return REG_ESPACE;
   155           pstr->offsets = new_offsets;
   156         }
   157     }
   158   if (pstr->mbs_allocated)
   159     {
   160       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
   161                                            new_buf_len);
   162       if (__glibc_unlikely (new_mbs == NULL))
   163         return REG_ESPACE;
   164       pstr->mbs = new_mbs;
   165     }
   166   pstr->bufs_len = new_buf_len;
   167   return REG_NOERROR;
   168 }
   169 
   170 
   171 static void
   172 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
   173                             RE_TRANSLATE_TYPE trans, bool icase,
   174                             const re_dfa_t *dfa)
   175 {
   176   pstr->raw_mbs = (const unsigned char *) str;
   177   pstr->len = len;
   178   pstr->raw_len = len;
   179   pstr->trans = trans;
   180   pstr->icase = icase;
   181   pstr->mbs_allocated = (trans != NULL || icase);
   182   pstr->mb_cur_max = dfa->mb_cur_max;
   183   pstr->is_utf8 = dfa->is_utf8;
   184   pstr->map_notascii = dfa->map_notascii;
   185   pstr->stop = pstr->len;
   186   pstr->raw_stop = pstr->stop;
   187 }
   188 
   189 
   190 /* Build wide character buffer PSTR->WCS.
   191    If the byte sequence of the string are:
   192      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
   193    Then wide character buffer will be:
   194      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
   195    We use WEOF for padding, they indicate that the position isn't
   196    a first byte of a multibyte character.
   197 
   198    Note that this function assumes PSTR->VALID_LEN elements are already
   199    built and starts from PSTR->VALID_LEN.  */
   200 
   201 static void
   202 build_wcs_buffer (re_string_t *pstr)
   203 {
   204 #ifdef _LIBC
   205   unsigned char buf[MB_LEN_MAX];
   206   DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
   207 #else
   208   unsigned char buf[64];
   209 #endif
   210   mbstate_t prev_st;
   211   Idx byte_idx, end_idx, remain_len;
   212   size_t mbclen;
   213 
   214   /* Build the buffers from pstr->valid_len to either pstr->len or
   215      pstr->bufs_len.  */
   216   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
   217   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
   218     {
   219       wchar_t wc;
   220       const char *p;
   221 
   222       remain_len = end_idx - byte_idx;
   223       prev_st = pstr->cur_state;
   224       /* Apply the translation if we need.  */
   225       if (__glibc_unlikely (pstr->trans != NULL))
   226         {
   227           int i, ch;
   228 
   229           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
   230             {
   231               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
   232               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
   233             }
   234           p = (const char *) buf;
   235         }
   236       else
   237         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
   238       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
   239       if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
   240                             || (mbclen == (size_t) -2
   241                                 && pstr->bufs_len >= pstr->len)))
   242         {
   243           /* We treat these cases as a singlebyte character.  */
   244           mbclen = 1;
   245           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
   246           if (__glibc_unlikely (pstr->trans != NULL))
   247             wc = pstr->trans[wc];
   248           pstr->cur_state = prev_st;
   249         }
   250       else if (__glibc_unlikely (mbclen == (size_t) -2))
   251         {
   252           /* The buffer doesn't have enough space, finish to build.  */
   253           pstr->cur_state = prev_st;
   254           break;
   255         }
   256 
   257       /* Write wide character and padding.  */
   258       pstr->wcs[byte_idx++] = wc;
   259       /* Write paddings.  */
   260       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
   261         pstr->wcs[byte_idx++] = WEOF;
   262     }
   263   pstr->valid_len = byte_idx;
   264   pstr->valid_raw_len = byte_idx;
   265 }
   266 
   267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
   268    but for REG_ICASE.  */
   269 
   270 static reg_errcode_t
   271 __attribute_warn_unused_result__
   272 build_wcs_upper_buffer (re_string_t *pstr)
   273 {
   274   mbstate_t prev_st;
   275   Idx src_idx, byte_idx, end_idx, remain_len;
   276   size_t mbclen;
   277 #ifdef _LIBC
   278   char buf[MB_LEN_MAX];
   279   DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
   280 #else
   281   char buf[64];
   282 #endif
   283 
   284   byte_idx = pstr->valid_len;
   285   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
   286 
   287   /* The following optimization assumes that ASCII characters can be
   288      mapped to wide characters with a simple cast.  */
   289   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
   290     {
   291       while (byte_idx < end_idx)
   292         {
   293           wchar_t wc;
   294           unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
   295 
   296           if (isascii (ch) && mbsinit (&pstr->cur_state))
   297             {
   298               /* The next step uses the assumption that wchar_t is encoded
   299                  ASCII-safe: all ASCII values can be converted like this.  */
   300               wchar_t wcu = __towupper (ch);
   301               if (isascii (wcu))
   302                 {
   303                   pstr->mbs[byte_idx] = wcu;
   304                   pstr->wcs[byte_idx] = wcu;
   305                   byte_idx++;
   306                   continue;
   307                 }
   308             }
   309 
   310           remain_len = end_idx - byte_idx;
   311           prev_st = pstr->cur_state;
   312           mbclen = __mbrtowc (&wc,
   313                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
   314                                + byte_idx), remain_len, &pstr->cur_state);
   315           if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
   316             {
   317               wchar_t wcu = __towupper (wc);
   318               if (wcu != wc)
   319                 {
   320                   size_t mbcdlen;
   321 
   322                   mbcdlen = __wcrtomb (buf, wcu, &prev_st);
   323                   if (__glibc_likely (mbclen == mbcdlen))
   324                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
   325                   else
   326                     {
   327                       src_idx = byte_idx;
   328                       goto offsets_needed;
   329                     }
   330                 }
   331               else
   332                 memcpy (pstr->mbs + byte_idx,
   333                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
   334               pstr->wcs[byte_idx++] = wcu;
   335               /* Write paddings.  */
   336               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
   337                 pstr->wcs[byte_idx++] = WEOF;
   338             }
   339           else if (mbclen == (size_t) -1 || mbclen == 0
   340                    || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
   341             {
   342               /* It is an invalid character, an incomplete character
   343                  at the end of the string, or '\0'.  Just use the byte.  */
   344               pstr->mbs[byte_idx] = ch;
   345               /* And also cast it to wide char.  */
   346               pstr->wcs[byte_idx++] = (wchar_t) ch;
   347               if (__glibc_unlikely (mbclen == (size_t) -1))
   348                 pstr->cur_state = prev_st;
   349             }
   350           else
   351             {
   352               /* The buffer doesn't have enough space, finish to build.  */
   353               pstr->cur_state = prev_st;
   354               break;
   355             }
   356         }
   357       pstr->valid_len = byte_idx;
   358       pstr->valid_raw_len = byte_idx;
   359       return REG_NOERROR;
   360     }
   361   else
   362     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
   363       {
   364         wchar_t wc;
   365         const char *p;
   366       offsets_needed:
   367         remain_len = end_idx - byte_idx;
   368         prev_st = pstr->cur_state;
   369         if (__glibc_unlikely (pstr->trans != NULL))
   370           {
   371             int i, ch;
   372 
   373             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
   374               {
   375                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
   376                 buf[i] = pstr->trans[ch];
   377               }
   378             p = (const char *) buf;
   379           }
   380         else
   381           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
   382         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
   383         if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
   384           {
   385             wchar_t wcu = __towupper (wc);
   386             if (wcu != wc)
   387               {
   388                 size_t mbcdlen;
   389 
   390                 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
   391                 if (__glibc_likely (mbclen == mbcdlen))
   392                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
   393                 else if (mbcdlen != (size_t) -1)
   394                   {
   395                     size_t i;
   396 
   397                     if (byte_idx + mbcdlen > pstr->bufs_len)
   398                       {
   399                         pstr->cur_state = prev_st;
   400                         break;
   401                       }
   402 
   403                     if (pstr->offsets == NULL)
   404                       {
   405                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
   406 
   407                         if (pstr->offsets == NULL)
   408                           return REG_ESPACE;
   409                       }
   410                     if (!pstr->offsets_needed)
   411                       {
   412                         for (i = 0; i < (size_t) byte_idx; ++i)
   413                           pstr->offsets[i] = i;
   414                         pstr->offsets_needed = 1;
   415                       }
   416 
   417                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
   418                     pstr->wcs[byte_idx] = wcu;
   419                     pstr->offsets[byte_idx] = src_idx;
   420                     for (i = 1; i < mbcdlen; ++i)
   421                       {
   422                         pstr->offsets[byte_idx + i]
   423                           = src_idx + (i < mbclen ? i : mbclen - 1);
   424                         pstr->wcs[byte_idx + i] = WEOF;
   425                       }
   426                     pstr->len += mbcdlen - mbclen;
   427                     if (pstr->raw_stop > src_idx)
   428                       pstr->stop += mbcdlen - mbclen;
   429                     end_idx = (pstr->bufs_len > pstr->len)
   430                               ? pstr->len : pstr->bufs_len;
   431                     byte_idx += mbcdlen;
   432                     src_idx += mbclen;
   433                     continue;
   434                   }
   435                 else
   436                   memcpy (pstr->mbs + byte_idx, p, mbclen);
   437               }
   438             else
   439               memcpy (pstr->mbs + byte_idx, p, mbclen);
   440 
   441             if (__glibc_unlikely (pstr->offsets_needed != 0))
   442               {
   443                 size_t i;
   444                 for (i = 0; i < mbclen; ++i)
   445                   pstr->offsets[byte_idx + i] = src_idx + i;
   446               }
   447             src_idx += mbclen;
   448 
   449             pstr->wcs[byte_idx++] = wcu;
   450             /* Write paddings.  */
   451             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
   452               pstr->wcs[byte_idx++] = WEOF;
   453           }
   454         else if (mbclen == (size_t) -1 || mbclen == 0
   455                  || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
   456           {
   457             /* It is an invalid character or '\0'.  Just use the byte.  */
   458             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
   459 
   460             if (__glibc_unlikely (pstr->trans != NULL))
   461               ch = pstr->trans [ch];
   462             pstr->mbs[byte_idx] = ch;
   463 
   464             if (__glibc_unlikely (pstr->offsets_needed != 0))
   465               pstr->offsets[byte_idx] = src_idx;
   466             ++src_idx;
   467 
   468             /* And also cast it to wide char.  */
   469             pstr->wcs[byte_idx++] = (wchar_t) ch;
   470             if (__glibc_unlikely (mbclen == (size_t) -1))
   471               pstr->cur_state = prev_st;
   472           }
   473         else
   474           {
   475             /* The buffer doesn't have enough space, finish to build.  */
   476             pstr->cur_state = prev_st;
   477             break;
   478           }
   479       }
   480   pstr->valid_len = byte_idx;
   481   pstr->valid_raw_len = src_idx;
   482   return REG_NOERROR;
   483 }
   484 
   485 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
   486    Return the index.  */
   487 
   488 static Idx
   489 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
   490 {
   491   mbstate_t prev_st;
   492   Idx rawbuf_idx;
   493   size_t mbclen;
   494   wint_t wc = WEOF;
   495 
   496   /* Skip the characters which are not necessary to check.  */
   497   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
   498        rawbuf_idx < new_raw_idx;)
   499     {
   500       wchar_t wc2;
   501       Idx remain_len = pstr->raw_len - rawbuf_idx;
   502       prev_st = pstr->cur_state;
   503       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
   504                           remain_len, &pstr->cur_state);
   505       if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
   506                             || mbclen == 0))
   507         {
   508           /* We treat these cases as a single byte character.  */
   509           if (mbclen == 0 || remain_len == 0)
   510             wc = L'\0';
   511           else
   512             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
   513           mbclen = 1;
   514           pstr->cur_state = prev_st;
   515         }
   516       else
   517         wc = wc2;
   518       /* Then proceed the next character.  */
   519       rawbuf_idx += mbclen;
   520     }
   521   *last_wc = wc;
   522   return rawbuf_idx;
   523 }
   524 
   525 /* Build the buffer PSTR->MBS, and apply the translation if we need.
   526    This function is used in case of REG_ICASE.  */
   527 
   528 static void
   529 build_upper_buffer (re_string_t *pstr)
   530 {
   531   Idx char_idx, end_idx;
   532   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
   533 
   534   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
   535     {
   536       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
   537       if (__glibc_unlikely (pstr->trans != NULL))
   538         ch = pstr->trans[ch];
   539       pstr->mbs[char_idx] = toupper (ch);
   540     }
   541   pstr->valid_len = char_idx;
   542   pstr->valid_raw_len = char_idx;
   543 }
   544 
   545 /* Apply TRANS to the buffer in PSTR.  */
   546 
   547 static void
   548 re_string_translate_buffer (re_string_t *pstr)
   549 {
   550   Idx buf_idx, end_idx;
   551   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
   552 
   553   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
   554     {
   555       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
   556       pstr->mbs[buf_idx] = pstr->trans[ch];
   557     }
   558 
   559   pstr->valid_len = buf_idx;
   560   pstr->valid_raw_len = buf_idx;
   561 }
   562 
   563 /* This function re-construct the buffers.
   564    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
   565    convert to upper case in case of REG_ICASE, apply translation.  */
   566 
   567 static reg_errcode_t
   568 __attribute_warn_unused_result__
   569 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
   570 {
   571   Idx offset;
   572 
   573   if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
   574     offset = idx - pstr->raw_mbs_idx;
   575   else
   576     {
   577       /* Reset buffer.  */
   578       if (pstr->mb_cur_max > 1)
   579         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
   580       pstr->len = pstr->raw_len;
   581       pstr->stop = pstr->raw_stop;
   582       pstr->valid_len = 0;
   583       pstr->raw_mbs_idx = 0;
   584       pstr->valid_raw_len = 0;
   585       pstr->offsets_needed = 0;
   586       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
   587                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
   588       if (!pstr->mbs_allocated)
   589         pstr->mbs = (unsigned char *) pstr->raw_mbs;
   590       offset = idx;
   591     }
   592 
   593   if (__glibc_likely (offset != 0))
   594     {
   595       /* Should the already checked characters be kept?  */
   596       if (__glibc_likely (offset < pstr->valid_raw_len))
   597         {
   598           /* Yes, move them to the front of the buffer.  */
   599           if (__glibc_unlikely (pstr->offsets_needed))
   600             {
   601               Idx low = 0, high = pstr->valid_len, mid;
   602               do
   603                 {
   604                   mid = (high + low) / 2;
   605                   if (pstr->offsets[mid] > offset)
   606                     high = mid;
   607                   else if (pstr->offsets[mid] < offset)
   608                     low = mid + 1;
   609                   else
   610                     break;
   611                 }
   612               while (low < high);
   613               if (pstr->offsets[mid] < offset)
   614                 ++mid;
   615               pstr->tip_context = re_string_context_at (pstr, mid - 1,
   616                                                         eflags);
   617               /* This can be quite complicated, so handle specially
   618                  only the common and easy case where the character with
   619                  different length representation of lower and upper
   620                  case is present at or after offset.  */
   621               if (pstr->valid_len > offset
   622                   && mid == offset && pstr->offsets[mid] == offset)
   623                 {
   624                   memmove (pstr->wcs, pstr->wcs + offset,
   625                            (pstr->valid_len - offset) * sizeof (wint_t));
   626                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
   627                   pstr->valid_len -= offset;
   628                   pstr->valid_raw_len -= offset;
   629                   for (low = 0; low < pstr->valid_len; low++)
   630                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
   631                 }
   632               else
   633                 {
   634                   /* Otherwise, just find out how long the partial multibyte
   635                      character at offset is and fill it with WEOF/255.  */
   636                   pstr->len = pstr->raw_len - idx + offset;
   637                   pstr->stop = pstr->raw_stop - idx + offset;
   638                   pstr->offsets_needed = 0;
   639                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
   640                     --mid;
   641                   while (mid < pstr->valid_len)
   642                     if (pstr->wcs[mid] != WEOF)
   643                       break;
   644                     else
   645                       ++mid;
   646                   if (mid == pstr->valid_len)
   647                     pstr->valid_len = 0;
   648                   else
   649                     {
   650                       pstr->valid_len = pstr->offsets[mid] - offset;
   651                       if (pstr->valid_len)
   652                         {
   653                           for (low = 0; low < pstr->valid_len; ++low)
   654                             pstr->wcs[low] = WEOF;
   655                           memset (pstr->mbs, 255, pstr->valid_len);
   656                         }
   657                     }
   658                   pstr->valid_raw_len = pstr->valid_len;
   659                 }
   660             }
   661           else
   662             {
   663               pstr->tip_context = re_string_context_at (pstr, offset - 1,
   664                                                         eflags);
   665               if (pstr->mb_cur_max > 1)
   666                 memmove (pstr->wcs, pstr->wcs + offset,
   667                          (pstr->valid_len - offset) * sizeof (wint_t));
   668               if (__glibc_unlikely (pstr->mbs_allocated))
   669                 memmove (pstr->mbs, pstr->mbs + offset,
   670                          pstr->valid_len - offset);
   671               pstr->valid_len -= offset;
   672               pstr->valid_raw_len -= offset;
   673               DEBUG_ASSERT (pstr->valid_len > 0);
   674             }
   675         }
   676       else
   677         {
   678           /* No, skip all characters until IDX.  */
   679           Idx prev_valid_len = pstr->valid_len;
   680 
   681           if (__glibc_unlikely (pstr->offsets_needed))
   682             {
   683               pstr->len = pstr->raw_len - idx + offset;
   684               pstr->stop = pstr->raw_stop - idx + offset;
   685               pstr->offsets_needed = 0;
   686             }
   687           pstr->valid_len = 0;
   688           if (pstr->mb_cur_max > 1)
   689             {
   690               Idx wcs_idx;
   691               wint_t wc = WEOF;
   692 
   693               if (pstr->is_utf8)
   694                 {
   695                   const unsigned char *raw, *p, *end;
   696 
   697                   /* Special case UTF-8.  Multi-byte chars start with any
   698                      byte other than 0x80 - 0xbf.  */
   699                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
   700                   end = raw + (offset - pstr->mb_cur_max);
   701                   if (end < pstr->raw_mbs)
   702                     end = pstr->raw_mbs;
   703                   p = raw + offset - 1;
   704 #ifdef _LIBC
   705                   /* We know the wchar_t encoding is UCS4, so for the simple
   706                      case, ASCII characters, skip the conversion step.  */
   707                   if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
   708                     {
   709                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
   710                       /* pstr->valid_len = 0; */
   711                       wc = (wchar_t) *p;
   712                     }
   713                   else
   714 #endif
   715                     for (; p >= end; --p)
   716                       if ((*p & 0xc0) != 0x80)
   717                         {
   718                           mbstate_t cur_state;
   719                           wchar_t wc2;
   720                           Idx mlen = raw + pstr->len - p;
   721                           unsigned char buf[6];
   722                           size_t mbclen;
   723 
   724                           const unsigned char *pp = p;
   725                           if (__glibc_unlikely (pstr->trans != NULL))
   726                             {
   727                               int i = mlen < 6 ? mlen : 6;
   728                               while (--i >= 0)
   729                                 buf[i] = pstr->trans[p[i]];
   730                               pp = buf;
   731                             }
   732                           /* XXX Don't use mbrtowc, we know which conversion
   733                              to use (UTF-8 -> UCS4).  */
   734                           memset (&cur_state, 0, sizeof (cur_state));
   735                           mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
   736                                               &cur_state);
   737                           if (raw + offset - p <= mbclen
   738                               && mbclen < (size_t) -2)
   739                             {
   740                               memset (&pstr->cur_state, '\0',
   741                                       sizeof (mbstate_t));
   742                               pstr->valid_len = mbclen - (raw + offset - p);
   743                               wc = wc2;
   744                             }
   745                           break;
   746                         }
   747                 }
   748 
   749               if (wc == WEOF)
   750                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
   751               if (wc == WEOF)
   752                 pstr->tip_context
   753                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
   754               else
   755                 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
   756                                       && IS_WIDE_WORD_CHAR (wc))
   757                                      ? CONTEXT_WORD
   758                                      : ((IS_WIDE_NEWLINE (wc)
   759                                          && pstr->newline_anchor)
   760                                         ? CONTEXT_NEWLINE : 0));
   761               if (__glibc_unlikely (pstr->valid_len))
   762                 {
   763                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
   764                     pstr->wcs[wcs_idx] = WEOF;
   765                   if (pstr->mbs_allocated)
   766                     memset (pstr->mbs, 255, pstr->valid_len);
   767                 }
   768               pstr->valid_raw_len = pstr->valid_len;
   769             }
   770           else
   771             {
   772               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
   773               pstr->valid_raw_len = 0;
   774               if (pstr->trans)
   775                 c = pstr->trans[c];
   776               pstr->tip_context = (bitset_contain (pstr->word_char, c)
   777                                    ? CONTEXT_WORD
   778                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
   779                                       ? CONTEXT_NEWLINE : 0));
   780             }
   781         }
   782       if (!__glibc_unlikely (pstr->mbs_allocated))
   783         pstr->mbs += offset;
   784     }
   785   pstr->raw_mbs_idx = idx;
   786   pstr->len -= offset;
   787   pstr->stop -= offset;
   788 
   789   /* Then build the buffers.  */
   790   if (pstr->mb_cur_max > 1)
   791     {
   792       if (pstr->icase)
   793         {
   794           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
   795           if (__glibc_unlikely (ret != REG_NOERROR))
   796             return ret;
   797         }
   798       else
   799         build_wcs_buffer (pstr);
   800     }
   801   else
   802     if (__glibc_unlikely (pstr->mbs_allocated))
   803       {
   804         if (pstr->icase)
   805           build_upper_buffer (pstr);
   806         else if (pstr->trans != NULL)
   807           re_string_translate_buffer (pstr);
   808       }
   809     else
   810       pstr->valid_len = pstr->len;
   811 
   812   pstr->cur_idx = 0;
   813   return REG_NOERROR;
   814 }
   815 
   816 static unsigned char
   817 __attribute__ ((pure))
   818 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
   819 {
   820   int ch;
   821   Idx off;
   822 
   823   /* Handle the common (easiest) cases first.  */
   824   if (__glibc_likely (!pstr->mbs_allocated))
   825     return re_string_peek_byte (pstr, idx);
   826 
   827   if (pstr->mb_cur_max > 1
   828       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
   829     return re_string_peek_byte (pstr, idx);
   830 
   831   off = pstr->cur_idx + idx;
   832   if (pstr->offsets_needed)
   833     off = pstr->offsets[off];
   834 
   835   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
   836 
   837   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
   838      this function returns CAPITAL LETTER I instead of first byte of
   839      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
   840      since peek_byte_case doesn't advance cur_idx in any way.  */
   841   if (pstr->offsets_needed && !isascii (ch))
   842     return re_string_peek_byte (pstr, idx);
   843 
   844   return ch;
   845 }
   846 
   847 static unsigned char
   848 re_string_fetch_byte_case (re_string_t *pstr)
   849 {
   850   if (__glibc_likely (!pstr->mbs_allocated))
   851     return re_string_fetch_byte (pstr);
   852 
   853   if (pstr->offsets_needed)
   854     {
   855       Idx off;
   856       int ch;
   857 
   858       /* For tr_TR.UTF-8 [[:islower:]] there is
   859          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
   860          in that case the whole multi-byte character and return
   861          the original letter.  On the other side, with
   862          [[: DOTLESS SMALL LETTER I return [[:I, as doing
   863          anything else would complicate things too much.  */
   864 
   865       if (!re_string_first_byte (pstr, pstr->cur_idx))
   866         return re_string_fetch_byte (pstr);
   867 
   868       off = pstr->offsets[pstr->cur_idx];
   869       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
   870 
   871       if (! isascii (ch))
   872         return re_string_fetch_byte (pstr);
   873 
   874       re_string_skip_bytes (pstr,
   875                             re_string_char_size_at (pstr, pstr->cur_idx));
   876       return ch;
   877     }
   878 
   879   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
   880 }
   881 
   882 static void
   883 re_string_destruct (re_string_t *pstr)
   884 {
   885   re_free (pstr->wcs);
   886   re_free (pstr->offsets);
   887   if (pstr->mbs_allocated)
   888     re_free (pstr->mbs);
   889 }
   890 
   891 /* Return the context at IDX in INPUT.  */
   892 
   893 static unsigned int
   894 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
   895 {
   896   int c;
   897   if (__glibc_unlikely (idx < 0))
   898     /* In this case, we use the value stored in input->tip_context,
   899        since we can't know the character in input->mbs[-1] here.  */
   900     return input->tip_context;
   901   if (__glibc_unlikely (idx == input->len))
   902     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
   903             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
   904   if (input->mb_cur_max > 1)
   905     {
   906       wint_t wc;
   907       Idx wc_idx = idx;
   908       while(input->wcs[wc_idx] == WEOF)
   909         {
   910           DEBUG_ASSERT (wc_idx >= 0);
   911           --wc_idx;
   912           if (wc_idx < 0)
   913             return input->tip_context;
   914         }
   915       wc = input->wcs[wc_idx];
   916       if (__glibc_unlikely (input->word_ops_used != 0)
   917           && IS_WIDE_WORD_CHAR (wc))
   918         return CONTEXT_WORD;
   919       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
   920               ? CONTEXT_NEWLINE : 0);
   921     }
   922   else
   923     {
   924       c = re_string_byte_at (input, idx);
   925       if (bitset_contain (input->word_char, c))
   926         return CONTEXT_WORD;
   927       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
   928     }
   929 }
   930 
   931 /* Functions for set operation.  */
   932 
   933 static reg_errcode_t
   934 __attribute_warn_unused_result__
   935 re_node_set_alloc (re_node_set *set, Idx size)
   936 {
   937   set->alloc = size;
   938   set->nelem = 0;
   939   set->elems = re_malloc (Idx, size);
   940   if (__glibc_unlikely (set->elems == NULL)
   941       && (MALLOC_0_IS_NONNULL || size != 0))
   942     return REG_ESPACE;
   943   return REG_NOERROR;
   944 }
   945 
   946 static reg_errcode_t
   947 __attribute_warn_unused_result__
   948 re_node_set_init_1 (re_node_set *set, Idx elem)
   949 {
   950   set->alloc = 1;
   951   set->nelem = 1;
   952   set->elems = re_malloc (Idx, 1);
   953   if (__glibc_unlikely (set->elems == NULL))
   954     {
   955       set->alloc = set->nelem = 0;
   956       return REG_ESPACE;
   957     }
   958   set->elems[0] = elem;
   959   return REG_NOERROR;
   960 }
   961 
   962 static reg_errcode_t
   963 __attribute_warn_unused_result__
   964 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
   965 {
   966   set->alloc = 2;
   967   set->elems = re_malloc (Idx, 2);
   968   if (__glibc_unlikely (set->elems == NULL))
   969     return REG_ESPACE;
   970   if (elem1 == elem2)
   971     {
   972       set->nelem = 1;
   973       set->elems[0] = elem1;
   974     }
   975   else
   976     {
   977       set->nelem = 2;
   978       if (elem1 < elem2)
   979         {
   980           set->elems[0] = elem1;
   981           set->elems[1] = elem2;
   982         }
   983       else
   984         {
   985           set->elems[0] = elem2;
   986           set->elems[1] = elem1;
   987         }
   988     }
   989   return REG_NOERROR;
   990 }
   991 
   992 static reg_errcode_t
   993 __attribute_warn_unused_result__
   994 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
   995 {
   996   dest->nelem = src->nelem;
   997   if (src->nelem > 0)
   998     {
   999       dest->alloc = dest->nelem;
  1000       dest->elems = re_malloc (Idx, dest->alloc);
  1001       if (__glibc_unlikely (dest->elems == NULL))
  1002         {
  1003           dest->alloc = dest->nelem = 0;
  1004           return REG_ESPACE;
  1005         }
  1006       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
  1007     }
  1008   else
  1009     re_node_set_init_empty (dest);
  1010   return REG_NOERROR;
  1011 }
  1012 
  1013 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
  1014    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
  1015    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
  1016 
  1017 static reg_errcode_t
  1018 __attribute_warn_unused_result__
  1019 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
  1020                            const re_node_set *src2)
  1021 {
  1022   Idx i1, i2, is, id, delta, sbase;
  1023   if (src1->nelem == 0 || src2->nelem == 0)
  1024     return REG_NOERROR;
  1025 
  1026   /* We need dest->nelem + 2 * elems_in_intersection; this is a
  1027      conservative estimate.  */
  1028   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
  1029     {
  1030       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
  1031       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
  1032       if (__glibc_unlikely (new_elems == NULL))
  1033         return REG_ESPACE;
  1034       dest->elems = new_elems;
  1035       dest->alloc = new_alloc;
  1036     }
  1037 
  1038   /* Find the items in the intersection of SRC1 and SRC2, and copy
  1039      into the top of DEST those that are not already in DEST itself.  */
  1040   sbase = dest->nelem + src1->nelem + src2->nelem;
  1041   i1 = src1->nelem - 1;
  1042   i2 = src2->nelem - 1;
  1043   id = dest->nelem - 1;
  1044   for (;;)
  1045     {
  1046       if (src1->elems[i1] == src2->elems[i2])
  1047         {
  1048           /* Try to find the item in DEST.  Maybe we could binary search?  */
  1049           while (id >= 0 && dest->elems[id] > src1->elems[i1])
  1050             --id;
  1051 
  1052           if (id < 0 || dest->elems[id] != src1->elems[i1])
  1053             dest->elems[--sbase] = src1->elems[i1];
  1054 
  1055           if (--i1 < 0 || --i2 < 0)
  1056             break;
  1057         }
  1058 
  1059       /* Lower the highest of the two items.  */
  1060       else if (src1->elems[i1] < src2->elems[i2])
  1061         {
  1062           if (--i2 < 0)
  1063             break;
  1064         }
  1065       else
  1066         {
  1067           if (--i1 < 0)
  1068             break;
  1069         }
  1070     }
  1071 
  1072   id = dest->nelem - 1;
  1073   is = dest->nelem + src1->nelem + src2->nelem - 1;
  1074   delta = is - sbase + 1;
  1075 
  1076   /* Now copy.  When DELTA becomes zero, the remaining
  1077      DEST elements are already in place; this is more or
  1078      less the same loop that is in re_node_set_merge.  */
  1079   dest->nelem += delta;
  1080   if (delta > 0 && id >= 0)
  1081     for (;;)
  1082       {
  1083         if (dest->elems[is] > dest->elems[id])
  1084           {
  1085             /* Copy from the top.  */
  1086             dest->elems[id + delta--] = dest->elems[is--];
  1087             if (delta == 0)
  1088               break;
  1089           }
  1090         else
  1091           {
  1092             /* Slide from the bottom.  */
  1093             dest->elems[id + delta] = dest->elems[id];
  1094             if (--id < 0)
  1095               break;
  1096           }
  1097       }
  1098 
  1099   /* Copy remaining SRC elements.  */
  1100   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
  1101 
  1102   return REG_NOERROR;
  1103 }
  1104 
  1105 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
  1106    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
  1107 
  1108 static reg_errcode_t
  1109 __attribute_warn_unused_result__
  1110 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
  1111                         const re_node_set *src2)
  1112 {
  1113   Idx i1, i2, id;
  1114   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
  1115     {
  1116       dest->alloc = src1->nelem + src2->nelem;
  1117       dest->elems = re_malloc (Idx, dest->alloc);
  1118       if (__glibc_unlikely (dest->elems == NULL))
  1119         return REG_ESPACE;
  1120     }
  1121   else
  1122     {
  1123       if (src1 != NULL && src1->nelem > 0)
  1124         return re_node_set_init_copy (dest, src1);
  1125       else if (src2 != NULL && src2->nelem > 0)
  1126         return re_node_set_init_copy (dest, src2);
  1127       else
  1128         re_node_set_init_empty (dest);
  1129       return REG_NOERROR;
  1130     }
  1131   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
  1132     {
  1133       if (src1->elems[i1] > src2->elems[i2])
  1134         {
  1135           dest->elems[id++] = src2->elems[i2++];
  1136           continue;
  1137         }
  1138       if (src1->elems[i1] == src2->elems[i2])
  1139         ++i2;
  1140       dest->elems[id++] = src1->elems[i1++];
  1141     }
  1142   if (i1 < src1->nelem)
  1143     {
  1144       memcpy (dest->elems + id, src1->elems + i1,
  1145              (src1->nelem - i1) * sizeof (Idx));
  1146       id += src1->nelem - i1;
  1147     }
  1148   else if (i2 < src2->nelem)
  1149     {
  1150       memcpy (dest->elems + id, src2->elems + i2,
  1151              (src2->nelem - i2) * sizeof (Idx));
  1152       id += src2->nelem - i2;
  1153     }
  1154   dest->nelem = id;
  1155   return REG_NOERROR;
  1156 }
  1157 
  1158 /* Calculate the union set of the sets DEST and SRC. And store it to
  1159    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
  1160 
  1161 static reg_errcode_t
  1162 __attribute_warn_unused_result__
  1163 re_node_set_merge (re_node_set *dest, const re_node_set *src)
  1164 {
  1165   Idx is, id, sbase, delta;
  1166   if (src == NULL || src->nelem == 0)
  1167     return REG_NOERROR;
  1168   if (dest->alloc < 2 * src->nelem + dest->nelem)
  1169     {
  1170       Idx new_alloc = 2 * (src->nelem + dest->alloc);
  1171       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
  1172       if (__glibc_unlikely (new_buffer == NULL))
  1173         return REG_ESPACE;
  1174       dest->elems = new_buffer;
  1175       dest->alloc = new_alloc;
  1176     }
  1177 
  1178   if (__glibc_unlikely (dest->nelem == 0))
  1179     {
  1180       /* Although we already guaranteed above that dest->alloc != 0 and
  1181          therefore dest->elems != NULL, add a debug assertion to pacify
  1182          GCC 11.2.1's -fanalyzer.  */
  1183       DEBUG_ASSERT (dest->elems);
  1184       dest->nelem = src->nelem;
  1185       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
  1186       return REG_NOERROR;
  1187     }
  1188 
  1189   /* Copy into the top of DEST the items of SRC that are not
  1190      found in DEST.  Maybe we could binary search in DEST?  */
  1191   for (sbase = dest->nelem + 2 * src->nelem,
  1192        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
  1193     {
  1194       if (dest->elems[id] == src->elems[is])
  1195         is--, id--;
  1196       else if (dest->elems[id] < src->elems[is])
  1197         dest->elems[--sbase] = src->elems[is--];
  1198       else /* if (dest->elems[id] > src->elems[is]) */
  1199         --id;
  1200     }
  1201 
  1202   if (is >= 0)
  1203     {
  1204       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
  1205       sbase -= is + 1;
  1206       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
  1207     }
  1208 
  1209   id = dest->nelem - 1;
  1210   is = dest->nelem + 2 * src->nelem - 1;
  1211   delta = is - sbase + 1;
  1212   if (delta == 0)
  1213     return REG_NOERROR;
  1214 
  1215   /* Now copy.  When DELTA becomes zero, the remaining
  1216      DEST elements are already in place.  */
  1217   dest->nelem += delta;
  1218   for (;;)
  1219     {
  1220       if (dest->elems[is] > dest->elems[id])
  1221         {
  1222           /* Copy from the top.  */
  1223           dest->elems[id + delta--] = dest->elems[is--];
  1224           if (delta == 0)
  1225             break;
  1226         }
  1227       else
  1228         {
  1229           /* Slide from the bottom.  */
  1230           dest->elems[id + delta] = dest->elems[id];
  1231           if (--id < 0)
  1232             {
  1233               /* Copy remaining SRC elements.  */
  1234               memcpy (dest->elems, dest->elems + sbase,
  1235                       delta * sizeof (Idx));
  1236               break;
  1237             }
  1238         }
  1239     }
  1240 
  1241   return REG_NOERROR;
  1242 }
  1243 
  1244 /* Insert the new element ELEM to the re_node_set* SET.
  1245    SET should not already have ELEM.
  1246    Return true if successful.  */
  1247 
  1248 static bool
  1249 __attribute_warn_unused_result__
  1250 re_node_set_insert (re_node_set *set, Idx elem)
  1251 {
  1252   Idx idx;
  1253   /* In case the set is empty.  */
  1254   if (set->alloc == 0)
  1255     return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
  1256 
  1257   if (__glibc_unlikely (set->nelem) == 0)
  1258     {
  1259       /* Although we already guaranteed above that set->alloc != 0 and
  1260          therefore set->elems != NULL, add a debug assertion to pacify
  1261          GCC 11.2 -fanalyzer.  */
  1262       DEBUG_ASSERT (set->elems);
  1263       set->elems[0] = elem;
  1264       ++set->nelem;
  1265       return true;
  1266     }
  1267 
  1268   /* Realloc if we need.  */
  1269   if (set->alloc == set->nelem)
  1270     {
  1271       Idx *new_elems;
  1272       set->alloc = set->alloc * 2;
  1273       new_elems = re_realloc (set->elems, Idx, set->alloc);
  1274       if (__glibc_unlikely (new_elems == NULL))
  1275         return false;
  1276       set->elems = new_elems;
  1277     }
  1278 
  1279   /* Move the elements which follows the new element.  Test the
  1280      first element separately to skip a check in the inner loop.  */
  1281   if (elem < set->elems[0])
  1282     {
  1283       for (idx = set->nelem; idx > 0; idx--)
  1284         set->elems[idx] = set->elems[idx - 1];
  1285     }
  1286   else
  1287     {
  1288       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
  1289         set->elems[idx] = set->elems[idx - 1];
  1290       DEBUG_ASSERT (set->elems[idx - 1] < elem);
  1291     }
  1292 
  1293   /* Insert the new element.  */
  1294   set->elems[idx] = elem;
  1295   ++set->nelem;
  1296   return true;
  1297 }
  1298 
  1299 /* Insert the new element ELEM to the re_node_set* SET.
  1300    SET should not already have any element greater than or equal to ELEM.
  1301    Return true if successful.  */
  1302 
  1303 static bool
  1304 __attribute_warn_unused_result__
  1305 re_node_set_insert_last (re_node_set *set, Idx elem)
  1306 {
  1307   /* Realloc if we need.  */
  1308   if (set->alloc == set->nelem)
  1309     {
  1310       Idx *new_elems;
  1311       set->alloc = (set->alloc + 1) * 2;
  1312       new_elems = re_realloc (set->elems, Idx, set->alloc);
  1313       if (__glibc_unlikely (new_elems == NULL))
  1314         return false;
  1315       set->elems = new_elems;
  1316     }
  1317 
  1318   /* Insert the new element.  */
  1319   set->elems[set->nelem++] = elem;
  1320   return true;
  1321 }
  1322 
  1323 /* Compare two node sets SET1 and SET2.
  1324    Return true if SET1 and SET2 are equivalent.  */
  1325 
  1326 static bool
  1327 __attribute__ ((pure))
  1328 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
  1329 {
  1330   Idx i;
  1331   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
  1332     return false;
  1333   for (i = set1->nelem ; --i >= 0 ; )
  1334     if (set1->elems[i] != set2->elems[i])
  1335       return false;
  1336   return true;
  1337 }
  1338 
  1339 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
  1340 
  1341 static Idx
  1342 __attribute__ ((pure))
  1343 re_node_set_contains (const re_node_set *set, Idx elem)
  1344 {
  1345   __re_size_t idx, right, mid;
  1346   if (set->nelem <= 0)
  1347     return 0;
  1348 
  1349   /* Binary search the element.  */
  1350   idx = 0;
  1351   right = set->nelem - 1;
  1352   while (idx < right)
  1353     {
  1354       mid = (idx + right) / 2;
  1355       if (set->elems[mid] < elem)
  1356         idx = mid + 1;
  1357       else
  1358         right = mid;
  1359     }
  1360   return set->elems[idx] == elem ? idx + 1 : 0;
  1361 }
  1362 
  1363 static void
  1364 re_node_set_remove_at (re_node_set *set, Idx idx)
  1365 {
  1366   if (idx < 0 || idx >= set->nelem)
  1367     return;
  1368   --set->nelem;
  1369   for (; idx < set->nelem; idx++)
  1370     set->elems[idx] = set->elems[idx + 1];
  1371 }
  1372 
  1373 
  1374 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
  1375    Or return -1 if an error occurred.  */
  1376 
  1377 static Idx
  1378 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
  1379 {
  1380   if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
  1381     {
  1382       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
  1383       Idx *new_nexts, *new_indices;
  1384       re_node_set *new_edests, *new_eclosures;
  1385       re_token_t *new_nodes;
  1386 
  1387       /* Avoid overflows in realloc.  */
  1388       const size_t max_object_size = MAX (sizeof (re_token_t),
  1389                                           MAX (sizeof (re_node_set),
  1390                                                sizeof (Idx)));
  1391       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
  1392                             < new_nodes_alloc))
  1393         return -1;
  1394 
  1395       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
  1396       if (__glibc_unlikely (new_nodes == NULL))
  1397         return -1;
  1398       dfa->nodes = new_nodes;
  1399       dfa->nodes_alloc = new_nodes_alloc;
  1400       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
  1401       if (new_nexts != NULL)
  1402         dfa->nexts = new_nexts;
  1403       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
  1404       if (new_indices != NULL)
  1405         dfa->org_indices = new_indices;
  1406       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
  1407       if (new_edests != NULL)
  1408         dfa->edests = new_edests;
  1409       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
  1410       if (new_eclosures != NULL)
  1411         dfa->eclosures = new_eclosures;
  1412       if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
  1413                             || new_edests == NULL || new_eclosures == NULL))
  1414         return -1;
  1415     }
  1416   dfa->nodes[dfa->nodes_len] = token;
  1417   dfa->nodes[dfa->nodes_len].constraint = 0;
  1418   dfa->nodes[dfa->nodes_len].accept_mb =
  1419     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
  1420      || token.type == COMPLEX_BRACKET);
  1421   dfa->nexts[dfa->nodes_len] = -1;
  1422   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
  1423   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
  1424   return dfa->nodes_len++;
  1425 }
  1426 
  1427 static re_hashval_t
  1428 calc_state_hash (const re_node_set *nodes, unsigned int context)
  1429 {
  1430   re_hashval_t hash = nodes->nelem + context;
  1431   Idx i;
  1432   for (i = 0 ; i < nodes->nelem ; i++)
  1433     hash += nodes->elems[i];
  1434   return hash;
  1435 }
  1436 
  1437 /* Search for the state whose node_set is equivalent to NODES.
  1438    Return the pointer to the state, if we found it in the DFA.
  1439    Otherwise create the new one and return it.  In case of an error
  1440    return NULL and set the error code in ERR.
  1441    Note: - We assume NULL as the invalid state, then it is possible that
  1442            return value is NULL and ERR is REG_NOERROR.
  1443          - We never return non-NULL value in case of any errors, it is for
  1444            optimization.  */
  1445 
  1446 static re_dfastate_t *
  1447 __attribute_warn_unused_result__
  1448 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
  1449                   const re_node_set *nodes)
  1450 {
  1451   re_hashval_t hash;
  1452   re_dfastate_t *new_state;
  1453   struct re_state_table_entry *spot;
  1454   Idx i;
  1455 #if defined GCC_LINT || defined lint
  1456   /* Suppress bogus uninitialized-variable warnings.  */
  1457   *err = REG_NOERROR;
  1458 #endif
  1459   if (__glibc_unlikely (nodes->nelem == 0))
  1460     {
  1461       *err = REG_NOERROR;
  1462       return NULL;
  1463     }
  1464   hash = calc_state_hash (nodes, 0);
  1465   spot = dfa->state_table + (hash & dfa->state_hash_mask);
  1466 
  1467   for (i = 0 ; i < spot->num ; i++)
  1468     {
  1469       re_dfastate_t *state = spot->array[i];
  1470       if (hash != state->hash)
  1471         continue;
  1472       if (re_node_set_compare (&state->nodes, nodes))
  1473         return state;
  1474     }
  1475 
  1476   /* There are no appropriate state in the dfa, create the new one.  */
  1477   new_state = create_ci_newstate (dfa, nodes, hash);
  1478   if (__glibc_unlikely (new_state == NULL))
  1479     *err = REG_ESPACE;
  1480 
  1481   return new_state;
  1482 }
  1483 
  1484 /* Search for the state whose node_set is equivalent to NODES and
  1485    whose context is equivalent to CONTEXT.
  1486    Return the pointer to the state, if we found it in the DFA.
  1487    Otherwise create the new one and return it.  In case of an error
  1488    return NULL and set the error code in ERR.
  1489    Note: - We assume NULL as the invalid state, then it is possible that
  1490            return value is NULL and ERR is REG_NOERROR.
  1491          - We never return non-NULL value in case of any errors, it is for
  1492            optimization.  */
  1493 
  1494 static re_dfastate_t *
  1495 __attribute_warn_unused_result__
  1496 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
  1497                           const re_node_set *nodes, unsigned int context)
  1498 {
  1499   re_hashval_t hash;
  1500   re_dfastate_t *new_state;
  1501   struct re_state_table_entry *spot;
  1502   Idx i;
  1503 #if defined GCC_LINT || defined lint
  1504   /* Suppress bogus uninitialized-variable warnings.  */
  1505   *err = REG_NOERROR;
  1506 #endif
  1507   if (nodes->nelem == 0)
  1508     {
  1509       *err = REG_NOERROR;
  1510       return NULL;
  1511     }
  1512   hash = calc_state_hash (nodes, context);
  1513   spot = dfa->state_table + (hash & dfa->state_hash_mask);
  1514 
  1515   for (i = 0 ; i < spot->num ; i++)
  1516     {
  1517       re_dfastate_t *state = spot->array[i];
  1518       if (state->hash == hash
  1519           && state->context == context
  1520           && re_node_set_compare (state->entrance_nodes, nodes))
  1521         return state;
  1522     }
  1523   /* There are no appropriate state in 'dfa', create the new one.  */
  1524   new_state = create_cd_newstate (dfa, nodes, context, hash);
  1525   if (__glibc_unlikely (new_state == NULL))
  1526     *err = REG_ESPACE;
  1527 
  1528   return new_state;
  1529 }
  1530 
  1531 /* Finish initialization of the new state NEWSTATE, and using its hash value
  1532    HASH put in the appropriate bucket of DFA's state table.  Return value
  1533    indicates the error code if failed.  */
  1534 
  1535 static reg_errcode_t
  1536 __attribute_warn_unused_result__
  1537 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
  1538                 re_hashval_t hash)
  1539 {
  1540   struct re_state_table_entry *spot;
  1541   reg_errcode_t err;
  1542   Idx i;
  1543 
  1544   newstate->hash = hash;
  1545   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
  1546   if (__glibc_unlikely (err != REG_NOERROR))
  1547     return REG_ESPACE;
  1548   for (i = 0; i < newstate->nodes.nelem; i++)
  1549     {
  1550       Idx elem = newstate->nodes.elems[i];
  1551       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
  1552         if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
  1553           return REG_ESPACE;
  1554     }
  1555 
  1556   spot = dfa->state_table + (hash & dfa->state_hash_mask);
  1557   if (__glibc_unlikely (spot->alloc <= spot->num))
  1558     {
  1559       Idx new_alloc = 2 * spot->num + 2;
  1560       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
  1561                                               new_alloc);
  1562       if (__glibc_unlikely (new_array == NULL))
  1563         return REG_ESPACE;
  1564       spot->array = new_array;
  1565       spot->alloc = new_alloc;
  1566     }
  1567   spot->array[spot->num++] = newstate;
  1568   return REG_NOERROR;
  1569 }
  1570 
  1571 static void
  1572 free_state (re_dfastate_t *state)
  1573 {
  1574   re_node_set_free (&state->non_eps_nodes);
  1575   re_node_set_free (&state->inveclosure);
  1576   if (state->entrance_nodes != &state->nodes)
  1577     {
  1578       re_node_set_free (state->entrance_nodes);
  1579       re_free (state->entrance_nodes);
  1580     }
  1581   re_node_set_free (&state->nodes);
  1582   re_free (state->word_trtable);
  1583   re_free (state->trtable);
  1584   re_free (state);
  1585 }
  1586 
  1587 /* Create the new state which is independent of contexts.
  1588    Return the new state if succeeded, otherwise return NULL.  */
  1589 
  1590 static re_dfastate_t *
  1591 __attribute_warn_unused_result__
  1592 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
  1593                     re_hashval_t hash)
  1594 {
  1595   Idx i;
  1596   reg_errcode_t err;
  1597   re_dfastate_t *newstate;
  1598 
  1599   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
  1600   if (__glibc_unlikely (newstate == NULL))
  1601     return NULL;
  1602   err = re_node_set_init_copy (&newstate->nodes, nodes);
  1603   if (__glibc_unlikely (err != REG_NOERROR))
  1604     {
  1605       re_free (newstate);
  1606       return NULL;
  1607     }
  1608 
  1609   newstate->entrance_nodes = &newstate->nodes;
  1610   for (i = 0 ; i < nodes->nelem ; i++)
  1611     {
  1612       re_token_t *node = dfa->nodes + nodes->elems[i];
  1613       re_token_type_t type = node->type;
  1614       if (type == CHARACTER && !node->constraint)
  1615         continue;
  1616       newstate->accept_mb |= node->accept_mb;
  1617 
  1618       /* If the state has the halt node, the state is a halt state.  */
  1619       if (type == END_OF_RE)
  1620         newstate->halt = 1;
  1621       else if (type == OP_BACK_REF)
  1622         newstate->has_backref = 1;
  1623       else if (type == ANCHOR || node->constraint)
  1624         newstate->has_constraint = 1;
  1625     }
  1626   err = register_state (dfa, newstate, hash);
  1627   if (__glibc_unlikely (err != REG_NOERROR))
  1628     {
  1629       free_state (newstate);
  1630       newstate = NULL;
  1631     }
  1632   return newstate;
  1633 }
  1634 
  1635 /* Create the new state which is depend on the context CONTEXT.
  1636    Return the new state if succeeded, otherwise return NULL.  */
  1637 
  1638 static re_dfastate_t *
  1639 __attribute_warn_unused_result__
  1640 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
  1641                     unsigned int context, re_hashval_t hash)
  1642 {
  1643   Idx i, nctx_nodes = 0;
  1644   reg_errcode_t err;
  1645   re_dfastate_t *newstate;
  1646 
  1647   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
  1648   if (__glibc_unlikely (newstate == NULL))
  1649     return NULL;
  1650   err = re_node_set_init_copy (&newstate->nodes, nodes);
  1651   if (__glibc_unlikely (err != REG_NOERROR))
  1652     {
  1653       re_free (newstate);
  1654       return NULL;
  1655     }
  1656 
  1657   newstate->context = context;
  1658   newstate->entrance_nodes = &newstate->nodes;
  1659 
  1660   for (i = 0 ; i < nodes->nelem ; i++)
  1661     {
  1662       re_token_t *node = dfa->nodes + nodes->elems[i];
  1663       re_token_type_t type = node->type;
  1664       unsigned int constraint = node->constraint;
  1665 
  1666       if (type == CHARACTER && !constraint)
  1667         continue;
  1668       newstate->accept_mb |= node->accept_mb;
  1669 
  1670       /* If the state has the halt node, the state is a halt state.  */
  1671       if (type == END_OF_RE)
  1672         newstate->halt = 1;
  1673       else if (type == OP_BACK_REF)
  1674         newstate->has_backref = 1;
  1675 
  1676       if (constraint)
  1677         {
  1678           if (newstate->entrance_nodes == &newstate->nodes)
  1679             {
  1680               re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
  1681               if (__glibc_unlikely (entrance_nodes == NULL))
  1682                 {
  1683                   free_state (newstate);
  1684                   return NULL;
  1685                 }
  1686               newstate->entrance_nodes = entrance_nodes;
  1687               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
  1688                   != REG_NOERROR)
  1689                 {
  1690                   free_state (newstate);
  1691                   return NULL;
  1692                 }
  1693               nctx_nodes = 0;
  1694               newstate->has_constraint = 1;
  1695             }
  1696 
  1697           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
  1698             {
  1699               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
  1700               ++nctx_nodes;
  1701             }
  1702         }
  1703     }
  1704   err = register_state (dfa, newstate, hash);
  1705   if (__glibc_unlikely (err != REG_NOERROR))
  1706     {
  1707       free_state (newstate);
  1708       newstate = NULL;
  1709     }
  1710   return  newstate;
  1711 }
/* [<][>][^][v][top][bottom][index][help] */
root/lib/regex_internal.c

DEFINITIONS