1 /* Byte-wise substring search, using the Two-Way algorithm. 2 Copyright (C) 2008-2023 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Written by Eric Blake <ebb9@byu.net>, 2008. 5 6 This file is free software: you can redistribute it and/or modify 7 it under the terms of the GNU Lesser General Public License as 8 published by the Free Software Foundation; either version 2.1 of the 9 License, or (at your option) any later version. 10 11 This file is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public License 17 along with this program. If not, see <https://www.gnu.org/licenses/>. */ 18 19 /* Before including this file, you need to include <config.h> and 20 <string.h>, and define: 21 RETURN_TYPE A macro that expands to the return type. 22 AVAILABLE(h, h_l, j, n_l) 23 A macro that returns nonzero if there are 24 at least N_L bytes left starting at H[J]. 25 H is 'unsigned char *', H_L, J, and N_L 26 are 'size_t'; H_L is an lvalue. For 27 NUL-terminated searches, H_L can be 28 modified each iteration to avoid having 29 to compute the end of H up front. 30 31 For case-insensitivity, you may optionally define: 32 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L 33 characters of P1 and P2 are equal. 34 CANON_ELEMENT(c) A macro that canonicalizes an element right after 35 it has been fetched from one of the two strings. 36 The argument is an 'unsigned char'; the result 37 must be an 'unsigned char' as well. 38 39 This file undefines the macros documented above, and defines 40 LONG_NEEDLE_THRESHOLD. 41 */ 42 43 #include <limits.h> 44 #include <stdint.h> 45 46 /* We use the Two-Way string matching algorithm (also known as 47 Chrochemore-Perrin), which guarantees linear complexity with 48 constant space. Additionally, for long needles, we also use a bad 49 character shift table similar to the Boyer-Moore algorithm to 50 achieve improved (potentially sub-linear) performance. 51 52 See https://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260, 53 https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm, 54 https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf 55 */ 56 57 /* Point at which computing a bad-byte shift table is likely to be 58 worthwhile. Small needles should not compute a table, since it 59 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a 60 speedup no greater than a factor of NEEDLE_LEN. The larger the 61 needle, the better the potential performance gain. On the other 62 hand, on non-POSIX systems with CHAR_BIT larger than eight, the 63 memory required for the table is prohibitive. */ 64 #if CHAR_BIT < 10 65 # define LONG_NEEDLE_THRESHOLD 32U 66 #else 67 # define LONG_NEEDLE_THRESHOLD SIZE_MAX 68 #endif 69 70 #ifndef MAX 71 # define MAX(a, b) ((a < b) ? (b) : (a)) 72 #endif 73 74 #ifndef CANON_ELEMENT 75 # define CANON_ELEMENT(c) c 76 #endif 77 #ifndef CMP_FUNC 78 # define CMP_FUNC memcmp 79 #endif 80 81 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN. 82 Return the index of the first byte in the right half, and set 83 *PERIOD to the global period of the right half. 84 85 The global period of a string is the smallest index (possibly its 86 length) at which all remaining bytes in the string are repetitions 87 of the prefix (the last repetition may be a subset of the prefix). 88 89 When NEEDLE is factored into two halves, a local period is the 90 length of the smallest word that shares a suffix with the left half 91 and shares a prefix with the right half. All factorizations of a 92 non-empty NEEDLE have a local period of at least 1 and no greater 93 than NEEDLE_LEN. 94 95 A critical factorization has the property that the local period 96 equals the global period. All strings have at least one critical 97 factorization with the left half smaller than the global period. 98 And while some strings have more than one critical factorization, 99 it is provable that with an ordered alphabet, at least one of the 100 critical factorizations corresponds to a maximal suffix. 101 102 Given an ordered alphabet, a critical factorization can be computed 103 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the 104 shorter of two ordered maximal suffixes. The ordered maximal 105 suffixes are determined by lexicographic comparison while tracking 106 periodicity. */ 107 static size_t 108 critical_factorization (const unsigned char *needle, size_t needle_len, 109 size_t *period) 110 { 111 /* Index of last byte of left half, or SIZE_MAX. */ 112 size_t max_suffix, max_suffix_rev; 113 size_t j; /* Index into NEEDLE for current candidate suffix. */ 114 size_t k; /* Offset into current period. */ 115 size_t p; /* Intermediate period. */ 116 unsigned char a, b; /* Current comparison bytes. */ 117 118 /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered 119 out 0-length needles. */ 120 if (needle_len < 3) 121 { 122 *period = 1; 123 return needle_len - 1; 124 } 125 126 /* Invariants: 127 0 <= j < NEEDLE_LEN - 1 128 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) 129 min(max_suffix, max_suffix_rev) < global period of NEEDLE 130 1 <= p <= global period of NEEDLE 131 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] 132 1 <= k <= p 133 */ 134 135 /* Perform lexicographic search. */ 136 max_suffix = SIZE_MAX; 137 j = 0; 138 k = p = 1; 139 while (j + k < needle_len) 140 { 141 a = CANON_ELEMENT (needle[j + k]); 142 b = CANON_ELEMENT (needle[max_suffix + k]); 143 if (a < b) 144 { 145 /* Suffix is smaller, period is entire prefix so far. */ 146 j += k; 147 k = 1; 148 p = j - max_suffix; 149 } 150 else if (a == b) 151 { 152 /* Advance through repetition of the current period. */ 153 if (k != p) 154 ++k; 155 else 156 { 157 j += p; 158 k = 1; 159 } 160 } 161 else /* b < a */ 162 { 163 /* Suffix is larger, start over from current location. */ 164 max_suffix = j++; 165 k = p = 1; 166 } 167 } 168 *period = p; 169 170 /* Perform reverse lexicographic search. */ 171 max_suffix_rev = SIZE_MAX; 172 j = 0; 173 k = p = 1; 174 while (j + k < needle_len) 175 { 176 a = CANON_ELEMENT (needle[j + k]); 177 b = CANON_ELEMENT (needle[max_suffix_rev + k]); 178 if (b < a) 179 { 180 /* Suffix is smaller, period is entire prefix so far. */ 181 j += k; 182 k = 1; 183 p = j - max_suffix_rev; 184 } 185 else if (a == b) 186 { 187 /* Advance through repetition of the current period. */ 188 if (k != p) 189 ++k; 190 else 191 { 192 j += p; 193 k = 1; 194 } 195 } 196 else /* a < b */ 197 { 198 /* Suffix is larger, start over from current location. */ 199 max_suffix_rev = j++; 200 k = p = 1; 201 } 202 } 203 204 /* Choose the shorter suffix. Return the index of the first byte of 205 the right half, rather than the last byte of the left half. 206 207 For some examples, 'banana' has two critical factorizations, both 208 exposed by the two lexicographic extreme suffixes of 'anana' and 209 'nana', where both suffixes have a period of 2. On the other 210 hand, with 'aab' and 'bba', both strings have a single critical 211 factorization of the last byte, with the suffix having a period 212 of 1. While the maximal lexicographic suffix of 'aab' is 'b', 213 the maximal lexicographic suffix of 'bba' is 'ba', which is not a 214 critical factorization. Conversely, the maximal reverse 215 lexicographic suffix of 'a' works for 'bba', but not 'ab' for 216 'aab'. The shorter suffix of the two will always be a critical 217 factorization. */ 218 if (max_suffix_rev + 1 < max_suffix + 1) 219 return max_suffix + 1; 220 *period = p; 221 return max_suffix_rev + 1; 222 } 223 224 /* Return the first location of non-empty NEEDLE within HAYSTACK, or 225 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This 226 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. 227 Performance is guaranteed to be linear, with an initialization cost 228 of 2 * NEEDLE_LEN comparisons. 229 230 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at 231 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. 232 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * 233 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ 234 static RETURN_TYPE _GL_ATTRIBUTE_PURE 235 two_way_short_needle (const unsigned char *haystack, size_t haystack_len, 236 const unsigned char *needle, size_t needle_len) 237 { 238 size_t i; /* Index into current byte of NEEDLE. */ 239 size_t j; /* Index into current window of HAYSTACK. */ 240 size_t period; /* The period of the right half of needle. */ 241 size_t suffix; /* The index of the right half of needle. */ 242 243 /* Factor the needle into two halves, such that the left half is 244 smaller than the global period, and the right half is 245 periodic (with a period as large as NEEDLE_LEN - suffix). */ 246 suffix = critical_factorization (needle, needle_len, &period); 247 248 /* Perform the search. Each iteration compares the right half 249 first. */ 250 if (CMP_FUNC (needle, needle + period, suffix) == 0) 251 { 252 /* Entire needle is periodic; a mismatch in the left half can 253 only advance by the period, so use memory to avoid rescanning 254 known occurrences of the period in the right half. */ 255 size_t memory = 0; 256 j = 0; 257 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 258 { 259 /* Scan for matches in right half. */ 260 i = MAX (suffix, memory); 261 while (i < needle_len && (CANON_ELEMENT (needle[i]) 262 == CANON_ELEMENT (haystack[i + j]))) 263 ++i; 264 if (needle_len <= i) 265 { 266 /* Scan for matches in left half. */ 267 i = suffix - 1; 268 while (memory < i + 1 && (CANON_ELEMENT (needle[i]) 269 == CANON_ELEMENT (haystack[i + j]))) 270 --i; 271 if (i + 1 < memory + 1) 272 return (RETURN_TYPE) (haystack + j); 273 /* No match, so remember how many repetitions of period 274 on the right half were scanned. */ 275 j += period; 276 memory = needle_len - period; 277 } 278 else 279 { 280 j += i - suffix + 1; 281 memory = 0; 282 } 283 } 284 } 285 else 286 { 287 /* The two halves of needle are distinct; no extra memory is 288 required, and any mismatch results in a maximal shift. */ 289 period = MAX (suffix, needle_len - suffix) + 1; 290 j = 0; 291 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 292 { 293 /* Scan for matches in right half. */ 294 i = suffix; 295 while (i < needle_len && (CANON_ELEMENT (needle[i]) 296 == CANON_ELEMENT (haystack[i + j]))) 297 ++i; 298 if (needle_len <= i) 299 { 300 /* Scan for matches in left half. */ 301 i = suffix - 1; 302 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) 303 == CANON_ELEMENT (haystack[i + j]))) 304 --i; 305 if (i == SIZE_MAX) 306 return (RETURN_TYPE) (haystack + j); 307 j += period; 308 } 309 else 310 j += i - suffix + 1; 311 } 312 } 313 return NULL; 314 } 315 316 /* Return the first location of non-empty NEEDLE within HAYSTACK, or 317 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This 318 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN. 319 Performance is guaranteed to be linear, with an initialization cost 320 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations. 321 322 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at 323 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, 324 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. 325 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * 326 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and 327 sublinear performance is not possible. */ 328 static RETURN_TYPE _GL_ATTRIBUTE_PURE 329 two_way_long_needle (const unsigned char *haystack, size_t haystack_len, 330 const unsigned char *needle, size_t needle_len) 331 { 332 size_t i; /* Index into current byte of NEEDLE. */ 333 size_t j; /* Index into current window of HAYSTACK. */ 334 size_t period; /* The period of the right half of needle. */ 335 size_t suffix; /* The index of the right half of needle. */ 336 size_t shift_table[1U << CHAR_BIT]; /* See below. */ 337 338 /* Factor the needle into two halves, such that the left half is 339 smaller than the global period, and the right half is 340 periodic (with a period as large as NEEDLE_LEN - suffix). */ 341 suffix = critical_factorization (needle, needle_len, &period); 342 343 /* Populate shift_table. For each possible byte value c, 344 shift_table[c] is the distance from the last occurrence of c to 345 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. 346 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ 347 for (i = 0; i < 1U << CHAR_BIT; i++) 348 shift_table[i] = needle_len; 349 for (i = 0; i < needle_len; i++) 350 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; 351 352 /* Perform the search. Each iteration compares the right half 353 first. */ 354 if (CMP_FUNC (needle, needle + period, suffix) == 0) 355 { 356 /* Entire needle is periodic; a mismatch in the left half can 357 only advance by the period, so use memory to avoid rescanning 358 known occurrences of the period in the right half. */ 359 size_t memory = 0; 360 size_t shift; 361 j = 0; 362 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 363 { 364 /* Check the last byte first; if it does not match, then 365 shift to the next possible match location. */ 366 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; 367 if (0 < shift) 368 { 369 if (memory && shift < period) 370 { 371 /* Since needle is periodic, but the last period has 372 a byte out of place, there can be no match until 373 after the mismatch. */ 374 shift = needle_len - period; 375 } 376 memory = 0; 377 j += shift; 378 continue; 379 } 380 /* Scan for matches in right half. The last byte has 381 already been matched, by virtue of the shift table. */ 382 i = MAX (suffix, memory); 383 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) 384 == CANON_ELEMENT (haystack[i + j]))) 385 ++i; 386 if (needle_len - 1 <= i) 387 { 388 /* Scan for matches in left half. */ 389 i = suffix - 1; 390 while (memory < i + 1 && (CANON_ELEMENT (needle[i]) 391 == CANON_ELEMENT (haystack[i + j]))) 392 --i; 393 if (i + 1 < memory + 1) 394 return (RETURN_TYPE) (haystack + j); 395 /* No match, so remember how many repetitions of period 396 on the right half were scanned. */ 397 j += period; 398 memory = needle_len - period; 399 } 400 else 401 { 402 j += i - suffix + 1; 403 memory = 0; 404 } 405 } 406 } 407 else 408 { 409 /* The two halves of needle are distinct; no extra memory is 410 required, and any mismatch results in a maximal shift. */ 411 size_t shift; 412 period = MAX (suffix, needle_len - suffix) + 1; 413 j = 0; 414 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 415 { 416 /* Check the last byte first; if it does not match, then 417 shift to the next possible match location. */ 418 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; 419 if (0 < shift) 420 { 421 j += shift; 422 continue; 423 } 424 /* Scan for matches in right half. The last byte has 425 already been matched, by virtue of the shift table. */ 426 i = suffix; 427 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) 428 == CANON_ELEMENT (haystack[i + j]))) 429 ++i; 430 if (needle_len - 1 <= i) 431 { 432 /* Scan for matches in left half. */ 433 i = suffix - 1; 434 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) 435 == CANON_ELEMENT (haystack[i + j]))) 436 --i; 437 if (i == SIZE_MAX) 438 return (RETURN_TYPE) (haystack + j); 439 j += period; 440 } 441 else 442 j += i - suffix + 1; 443 } 444 } 445 return NULL; 446 } 447 448 #undef AVAILABLE 449 #undef CANON_ELEMENT 450 #undef CMP_FUNC 451 #undef MAX 452 #undef RETURN_TYPE