1 /* Emacs regular expression matching and search
2
3 Copyright (C) 1993-2023 Free Software Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 /* TODO:
19 - structure the opcode space into opcode+flag.
20 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
21 need to modify the compiled regexp so that re_search can be reentrant.
22 - get rid of on_failure_jump_smart by doing the optimization in re_comp
23 rather than at run-time, so that re_search can be reentrant.
24 */
25
26 #include <config.h>
27
28 #include "regex-emacs.h"
29
30 #include <stdlib.h>
31
32 #include "character.h"
33 #include "buffer.h"
34 #include "syntax.h"
35 #include "category.h"
36 #include "dispextern.h"
37
38 /* Maximum number of duplicates an interval can allow. Some systems
39 define this in other header files, but we want our value, so remove
40 any previous define. Repeat counts are stored in opcodes as 2-byte
41 unsigned integers. */
42 #ifdef RE_DUP_MAX
43 # undef RE_DUP_MAX
44 #endif
45 #define RE_DUP_MAX (0xffff)
46
47 /* Make syntax table lookup grant data in gl_state. */
48 #define SYNTAX(c) syntax_property (c, 1)
49
50 /* Explicit syntax lookup using the buffer-local table. */
51 #define BUFFER_SYNTAX(c) syntax_property (c, 0)
52
53 #define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
54 #define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
55 #define RE_STRING_CHAR(p, multibyte) \
56 (multibyte ? STRING_CHAR (p) : *(p))
57 #define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
58 (multibyte ? string_char_and_length (p, &(len)) : ((len) = 1, *(p)))
59
60 #define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
61
62 #define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
63
64 /* Set C a (possibly converted to multibyte) character before P. P
65 points into a string which is the virtual concatenation of STR1
66 (which ends at END1) or STR2 (which ends at END2). */
67 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
68 do { \
69 if (target_multibyte) \
70 { \
71 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
72 re_char *dlimit = (p) > (str2) && (p) <= (end2) ? (str2) : (str1); \
73 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)) \
74 continue; \
75 c = STRING_CHAR (dtemp); \
76 } \
77 else \
78 { \
79 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
80 (c) = RE_CHAR_TO_MULTIBYTE (c); \
81 } \
82 } while (false)
83
84 /* Set C a (possibly converted to multibyte) character at P, and set
85 LEN to the byte length of that character. */
86 #define GET_CHAR_AFTER(c, p, len) \
87 do { \
88 if (target_multibyte) \
89 (c) = string_char_and_length (p, &(len)); \
90 else \
91 { \
92 (c) = *p; \
93 len = 1; \
94 (c) = RE_CHAR_TO_MULTIBYTE (c); \
95 } \
96 } while (false)
97
98 /* 1 if C is an ASCII character. */
99 #define IS_REAL_ASCII(c) ((c) < 0200)
100
101 /* 1 if C is a unibyte character. */
102 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
103
104 /* The Emacs definitions should not be directly affected by locales. */
105
106 /* In Emacs, these are only used for single-byte characters. */
107 #define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
108 #define ISCNTRL(c) ((c) < ' ')
109 #define ISXDIGIT(c) (0 <= char_hexdigit (c))
110
111 /* The rest must handle multibyte characters. */
112
113 #define ISBLANK(c) (IS_REAL_ASCII (c) \
114 ? ((c) == ' ' || (c) == '\t') \
115 : blankp (c))
116
117 #define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
118 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240) \
119 : graphicp (c))
120
121 #define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
122 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
123 : printablep (c))
124
125 #define ISALNUM(c) (IS_REAL_ASCII (c) \
126 ? (((c) >= 'a' && (c) <= 'z') \
127 || ((c) >= 'A' && (c) <= 'Z') \
128 || ((c) >= '0' && (c) <= '9')) \
129 : alphanumericp (c))
130
131 #define ISALPHA(c) (IS_REAL_ASCII (c) \
132 ? (((c) >= 'a' && (c) <= 'z') \
133 || ((c) >= 'A' && (c) <= 'Z')) \
134 : alphabeticp (c))
135
136 #define ISLOWER(c) lowercasep (c)
137
138 #define ISUPPER(c) uppercasep (c)
139
140 /* The following predicates use the buffer-local syntax table and
141 ignore syntax properties, for consistency with the up-front
142 assumptions made at compile time. */
143
144 #define ISPUNCT(c) (IS_REAL_ASCII (c) \
145 ? ((c) > ' ' && (c) < 0177 \
146 && !(((c) >= 'a' && (c) <= 'z') \
147 || ((c) >= 'A' && (c) <= 'Z') \
148 || ((c) >= '0' && (c) <= '9'))) \
149 : BUFFER_SYNTAX (c) != Sword)
150
151 #define ISSPACE(c) (BUFFER_SYNTAX (c) == Swhitespace)
152
153 #define ISWORD(c) (BUFFER_SYNTAX (c) == Sword)
154
155 /* Use alloca instead of malloc. This is because using malloc in
156 re_search* or re_match* could cause memory leaks when C-g is used
157 in Emacs (note that SAFE_ALLOCA could also call malloc, but does so
158 via 'record_xmalloc' which uses 'unwind_protect' to ensure the
159 memory is freed even in case of non-local exits); also, malloc is
160 slower and causes storage fragmentation. On the other hand, malloc
161 is more portable, and easier to debug.
162
163 Because we sometimes use alloca, some routines have to be macros,
164 not functions -- 'alloca'-allocated space disappears at the end of the
165 function it is called in. */
166
167 /* This may be adjusted in main(), if the stack is successfully grown. */
168 ptrdiff_t emacs_re_safe_alloca = MAX_ALLOCA;
169 /* Like USE_SAFE_ALLOCA, but use emacs_re_safe_alloca. */
170 #define REGEX_USE_SAFE_ALLOCA \
171 USE_SAFE_ALLOCA; sa_avail = emacs_re_safe_alloca
172
173 /* Assumes a 'char *destination' variable. */
174 #define REGEX_REALLOCATE(source, osize, nsize) \
175 (destination = SAFE_ALLOCA (nsize), \
176 memcpy (destination, source, osize))
177
178 /* True if 'size1' is non-NULL and PTR is pointing anywhere inside
179 'string1' or just past its end. This works if PTR is NULL, which is
180 a good thing. */
181 #define FIRST_STRING_P(ptr) \
182 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
183
184 #define BYTEWIDTH 8 /* In bits. */
185
186 /* Type of source-pattern and string chars. */
187 typedef const unsigned char re_char;
188
189 static void re_compile_fastmap (struct re_pattern_buffer *);
190 static ptrdiff_t re_match_2_internal (struct re_pattern_buffer *bufp,
191 re_char *string1, ptrdiff_t size1,
192 re_char *string2, ptrdiff_t size2,
193 ptrdiff_t pos,
194 struct re_registers *regs,
195 ptrdiff_t stop);
196
197 /* These are the command codes that appear in compiled regular
198 expressions. Some opcodes are followed by argument bytes. A
199 command code can specify any interpretation whatsoever for its
200 arguments. Zero bytes may appear in the compiled regular expression. */
201
202 typedef enum
203 {
204 no_op = 0,
205
206 /* Succeed right away--no more backtracking. */
207 succeed,
208
209 /* Followed by one byte giving n, then by n literal bytes. */
210 exactn,
211
212 /* Matches any (more or less) character. */
213 anychar,
214
215 /* Matches any one char belonging to specified set. First
216 following byte is number of bitmap bytes. Then come bytes
217 for a bitmap saying which chars are in. Bits in each byte
218 are ordered low-bit-first. A character is in the set if its
219 bit is 1. A character too large to have a bit in the map is
220 automatically not in the set.
221
222 If the length byte has the 0x80 bit set, then that stuff
223 is followed by a range table:
224 2 bytes of flags for character sets (low 8 bits, high 8 bits)
225 See RANGE_TABLE_WORK_BITS below.
226 2 bytes, the number of pairs that follow (up to 32767)
227 pairs, each 2 multibyte characters,
228 each multibyte character represented as 3 bytes. */
229 charset,
230
231 /* Same parameters as charset, but match any character that is
232 not one of those specified. */
233 charset_not,
234
235 /* Start remembering the text that is matched, for storing in a
236 register. Followed by one byte with the register number, in
237 the range 0 to one less than the pattern buffer's re_nsub
238 field. */
239 start_memory,
240
241 /* Stop remembering the text that is matched and store it in a
242 memory register. Followed by one byte with the register
243 number, in the range 0 to one less than 're_nsub' in the
244 pattern buffer. */
245 stop_memory,
246
247 /* Match a duplicate of something remembered. Followed by one
248 byte containing the register number. */
249 duplicate,
250
251 /* Fail unless at beginning of line. */
252 begline,
253
254 /* Fail unless at end of line. */
255 endline,
256
257 /* Succeeds if at beginning of buffer. */
258 begbuf,
259
260 /* Analogously, for end of buffer/string. */
261 endbuf,
262
263 /* Followed by two byte relative address to which to jump. */
264 jump,
265
266 /* Followed by two-byte relative address of place to resume at
267 in case of failure. */
268 on_failure_jump,
269
270 /* Like on_failure_jump, but pushes a placeholder instead of the
271 current string position when executed. */
272 on_failure_keep_string_jump,
273
274 /* Just like 'on_failure_jump', except that it checks that we
275 don't get stuck in an infinite loop (matching an empty string
276 indefinitely). */
277 on_failure_jump_loop,
278
279 /* Just like 'on_failure_jump_loop', except that it checks for
280 a different kind of loop (the kind that shows up with non-greedy
281 operators). This operation has to be immediately preceded
282 by a 'no_op'. */
283 on_failure_jump_nastyloop,
284
285 /* A smart 'on_failure_jump' used for greedy * and + operators.
286 It analyzes the loop before which it is put and if the
287 loop does not require backtracking, it changes itself to
288 'on_failure_keep_string_jump' and short-circuits the loop,
289 else it just defaults to changing itself into 'on_failure_jump'.
290 It assumes that it is pointing to just past a 'jump'. */
291 on_failure_jump_smart,
292
293 /* Followed by two-byte relative address and two-byte number n.
294 After matching N times, jump to the address upon failure.
295 Does not work if N starts at 0: use on_failure_jump_loop
296 instead. */
297 succeed_n,
298
299 /* Followed by two-byte relative address, and two-byte number n.
300 Jump to the address N times, then fail. */
301 jump_n,
302
303 /* Set the following two-byte relative address to the
304 subsequent two-byte number. The address *includes* the two
305 bytes of number. */
306 set_number_at,
307
308 wordbeg, /* Succeeds if at word beginning. */
309 wordend, /* Succeeds if at word end. */
310
311 wordbound, /* Succeeds if at a word boundary. */
312 notwordbound, /* Succeeds if not at a word boundary. */
313
314 symbeg, /* Succeeds if at symbol beginning. */
315 symend, /* Succeeds if at symbol end. */
316
317 /* Matches any character whose syntax is specified. Followed by
318 a byte which contains a syntax code, e.g., Sword. */
319 syntaxspec,
320
321 /* Matches any character whose syntax is not that specified. */
322 notsyntaxspec,
323
324 at_dot, /* Succeeds if at point. */
325
326 /* Matches any character whose category-set contains the specified
327 category. The operator is followed by a byte which contains a
328 category code (mnemonic ASCII character). */
329 categoryspec,
330
331 /* Matches any character whose category-set does not contain the
332 specified category. The operator is followed by a byte which
333 contains the category code (mnemonic ASCII character). */
334 notcategoryspec
335 } re_opcode_t;
336
337 /* Common operations on the compiled pattern. */
338
339 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
340
341 #define STORE_NUMBER(destination, number) \
342 do { \
343 (destination)[0] = (number) & 0377; \
344 (destination)[1] = (number) >> 8; \
345 } while (false)
346
347 /* Same as STORE_NUMBER, except increment DESTINATION to
348 the byte after where the number is stored. Therefore, DESTINATION
349 must be an lvalue. */
350
351 #define STORE_NUMBER_AND_INCR(destination, number) \
352 do { \
353 STORE_NUMBER (destination, number); \
354 (destination) += 2; \
355 } while (false)
356
357 /* Put into DESTINATION a number stored in two contiguous bytes starting
358 at SOURCE. */
359
360 #define EXTRACT_NUMBER(destination, source) \
361 ((destination) = extract_number (source))
362
363 static int
364 extract_number (re_char *source)
365 {
366 signed char leading_byte = source[1];
367 return leading_byte * 256 + source[0];
368 }
369
370 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
371 SOURCE must be an lvalue. */
372
373 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
374 ((destination) = extract_number_and_incr (&source))
375
376 static int
377 extract_number_and_incr (re_char **source)
378 {
379 int num = extract_number (*source);
380 *source += 2;
381 return num;
382 }
383
384 /* Store a multibyte character in three contiguous bytes starting
385 DESTINATION, and increment DESTINATION to the byte after where the
386 character is stored. Therefore, DESTINATION must be an lvalue. */
387
388 #define STORE_CHARACTER_AND_INCR(destination, character) \
389 do { \
390 (destination)[0] = (character) & 0377; \
391 (destination)[1] = ((character) >> 8) & 0377; \
392 (destination)[2] = (character) >> 16; \
393 (destination) += 3; \
394 } while (false)
395
396 /* Put into DESTINATION a character stored in three contiguous bytes
397 starting at SOURCE. */
398
399 #define EXTRACT_CHARACTER(destination, source) \
400 do { \
401 (destination) = ((source)[0] \
402 | ((source)[1] << 8) \
403 | ((source)[2] << 16)); \
404 } while (false)
405
406
407 /* Macros for charset. */
408
409 /* Size of bitmap of charset P in bytes. P is a start of charset,
410 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
411 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
412
413 /* Nonzero if charset P has range table. */
414 #define CHARSET_RANGE_TABLE_EXISTS_P(p) (((p)[1] & 0x80) != 0)
415
416 /* Return the address of range table of charset P. But not the start
417 of table itself, but the before where the number of ranges is
418 stored. '2 +' means to skip re_opcode_t and size of bitmap,
419 and the 2 bytes of flags at the start of the range table. */
420 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
421
422 /* Extract the bit flags that start a range table. */
423 #define CHARSET_RANGE_TABLE_BITS(p) \
424 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
425 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
426
427 /* Return the address of end of RANGE_TABLE. COUNT is number of
428 ranges (which is a pair of (start, end)) in the RANGE_TABLE. '* 2'
429 is start of range and end of range. '* 3' is size of each start
430 and end. */
431 #define CHARSET_RANGE_TABLE_END(range_table, count) \
432 ((range_table) + (count) * 2 * 3)
433
434 /* If REGEX_EMACS_DEBUG is defined, print many voluminous messages
435 (if the variable regex_emacs_debug is positive). */
436
437 #ifdef REGEX_EMACS_DEBUG
438
439 /* Use standard I/O for debugging. */
440 # include "sysstdio.h"
441
442 static int regex_emacs_debug = -100000;
443
444 # define DEBUG_STATEMENT(e) e
445 # define DEBUG_PRINT(...) \
446 if (regex_emacs_debug > 0) fprintf (stderr, __VA_ARGS__)
447 # define DEBUG_COMPILES_ARGUMENTS
448 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
449 if (regex_emacs_debug > 0) print_partial_compiled_pattern (s, e)
450 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
451 if (regex_emacs_debug > 0) print_double_string (w, s1, sz1, s2, sz2)
452
453 static void
454 debug_putchar (int c)
455 {
456 if (c >= 32 && c <= 126)
457 putc (c, stderr);
458 else
459 {
460 unsigned int uc = c;
461 fprintf (stderr, "{%02x}", uc);
462 }
463 }
464
465 /* Print the fastmap in human-readable form. */
466
467 static void
468 print_fastmap (char *fastmap)
469 {
470 bool was_a_range = false;
471 int i = 0;
472
473 while (i < (1 << BYTEWIDTH))
474 {
475 if (fastmap[i++])
476 {
477 was_a_range = false;
478 debug_putchar (i - 1);
479 while (i < (1 << BYTEWIDTH) && fastmap[i])
480 {
481 was_a_range = true;
482 i++;
483 }
484 if (was_a_range)
485 {
486 debug_putchar ('-');
487 debug_putchar (i - 1);
488 }
489 }
490 }
491 putc ('\n', stderr);
492 }
493
494
495 /* Print a compiled pattern string in human-readable form, starting at
496 the START pointer into it and ending just before the pointer END. */
497
498 static void
499 print_partial_compiled_pattern (re_char *start, re_char *end)
500 {
501 int mcnt, mcnt2;
502 re_char *p = start;
503 re_char *pend = end;
504
505 if (start == NULL)
506 {
507 fputs ("(null)\n", stderr);
508 return;
509 }
510
511 /* Loop over pattern commands. */
512 while (p < pend)
513 {
514 fprintf (stderr, "%td:\t", p - start);
515
516 switch ((re_opcode_t) *p++)
517 {
518 case no_op:
519 fputs ("/no_op", stderr);
520 break;
521
522 case succeed:
523 fputs ("/succeed", stderr);
524 break;
525
526 case exactn:
527 mcnt = *p++;
528 fprintf (stderr, "/exactn/%d", mcnt);
529 do
530 {
531 debug_putchar ('/');
532 debug_putchar (*p++);
533 }
534 while (--mcnt);
535 break;
536
537 case start_memory:
538 fprintf (stderr, "/start_memory/%d", *p++);
539 break;
540
541 case stop_memory:
542 fprintf (stderr, "/stop_memory/%d", *p++);
543 break;
544
545 case duplicate:
546 fprintf (stderr, "/duplicate/%d", *p++);
547 break;
548
549 case anychar:
550 fputs ("/anychar", stderr);
551 break;
552
553 case charset:
554 case charset_not:
555 {
556 int c, last = -100;
557 bool in_range = false;
558 int length = CHARSET_BITMAP_SIZE (p - 1);
559 bool has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
560
561 fprintf (stderr, "/charset [%s",
562 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
563
564 if (p + (*p & 0x7f) >= pend)
565 fputs (" !extends past end of pattern! ", stderr);
566
567 for (c = 0; c < 256; c++)
568 if (c / 8 < length
569 && (p[1 + (c/8)] & (1 << (c % 8))))
570 {
571 /* Are we starting a range? */
572 if (last + 1 == c && ! in_range)
573 {
574 debug_putchar ('-');
575 in_range = true;
576 }
577 /* Have we broken a range? */
578 else if (last + 1 != c && in_range)
579 {
580 debug_putchar (last);
581 in_range = false;
582 }
583
584 if (! in_range)
585 debug_putchar (c);
586
587 last = c;
588 }
589
590 if (in_range)
591 debug_putchar (last);
592
593 debug_putchar (']');
594
595 p += 1 + length;
596
597 if (has_range_table)
598 {
599 int count;
600 fputs ("has-range-table", stderr);
601
602 /* ??? Should print the range table; for now, just skip it. */
603 p += 2; /* skip range table bits */
604 EXTRACT_NUMBER_AND_INCR (count, p);
605 p = CHARSET_RANGE_TABLE_END (p, count);
606 }
607 }
608 break;
609
610 case begline:
611 fputs ("/begline", stderr);
612 break;
613
614 case endline:
615 fputs ("/endline", stderr);
616 break;
617
618 case on_failure_jump:
619 EXTRACT_NUMBER_AND_INCR (mcnt, p);
620 fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start);
621 break;
622
623 case on_failure_keep_string_jump:
624 EXTRACT_NUMBER_AND_INCR (mcnt, p);
625 fprintf (stderr, "/on_failure_keep_string_jump to %td",
626 p + mcnt - start);
627 break;
628
629 case on_failure_jump_nastyloop:
630 EXTRACT_NUMBER_AND_INCR (mcnt, p);
631 fprintf (stderr, "/on_failure_jump_nastyloop to %td",
632 p + mcnt - start);
633 break;
634
635 case on_failure_jump_loop:
636 EXTRACT_NUMBER_AND_INCR (mcnt, p);
637 fprintf (stderr, "/on_failure_jump_loop to %td",
638 p + mcnt - start);
639 break;
640
641 case on_failure_jump_smart:
642 EXTRACT_NUMBER_AND_INCR (mcnt, p);
643 fprintf (stderr, "/on_failure_jump_smart to %td",
644 p + mcnt - start);
645 break;
646
647 case jump:
648 EXTRACT_NUMBER_AND_INCR (mcnt, p);
649 fprintf (stderr, "/jump to %td", p + mcnt - start);
650 break;
651
652 case succeed_n:
653 EXTRACT_NUMBER_AND_INCR (mcnt, p);
654 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
655 fprintf (stderr, "/succeed_n to %td, %d times",
656 p - 2 + mcnt - start, mcnt2);
657 break;
658
659 case jump_n:
660 EXTRACT_NUMBER_AND_INCR (mcnt, p);
661 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
662 fprintf (stderr, "/jump_n to %td, %d times",
663 p - 2 + mcnt - start, mcnt2);
664 break;
665
666 case set_number_at:
667 EXTRACT_NUMBER_AND_INCR (mcnt, p);
668 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
669 fprintf (stderr, "/set_number_at location %td to %d",
670 p - 2 + mcnt - start, mcnt2);
671 break;
672
673 case wordbound:
674 fputs ("/wordbound", stderr);
675 break;
676
677 case notwordbound:
678 fputs ("/notwordbound", stderr);
679 break;
680
681 case wordbeg:
682 fputs ("/wordbeg", stderr);
683 break;
684
685 case wordend:
686 fputs ("/wordend", stderr);
687 break;
688
689 case symbeg:
690 fputs ("/symbeg", stderr);
691 break;
692
693 case symend:
694 fputs ("/symend", stderr);
695 break;
696
697 case syntaxspec:
698 fputs ("/syntaxspec", stderr);
699 mcnt = *p++;
700 fprintf (stderr, "/%d", mcnt);
701 break;
702
703 case notsyntaxspec:
704 fputs ("/notsyntaxspec", stderr);
705 mcnt = *p++;
706 fprintf (stderr, "/%d", mcnt);
707 break;
708
709 case at_dot:
710 fputs ("/at_dot", stderr);
711 break;
712
713 case categoryspec:
714 fputs ("/categoryspec", stderr);
715 mcnt = *p++;
716 fprintf (stderr, "/%d", mcnt);
717 break;
718
719 case notcategoryspec:
720 fputs ("/notcategoryspec", stderr);
721 mcnt = *p++;
722 fprintf (stderr, "/%d", mcnt);
723 break;
724
725 case begbuf:
726 fputs ("/begbuf", stderr);
727 break;
728
729 case endbuf:
730 fputs ("/endbuf", stderr);
731 break;
732
733 default:
734 fprintf (stderr, "?%d", *(p-1));
735 }
736
737 putc ('\n', stderr);
738 }
739
740 fprintf (stderr, "%td:\tend of pattern.\n", p - start);
741 }
742
743
744 static void
745 print_compiled_pattern (struct re_pattern_buffer *bufp)
746 {
747 re_char *buffer = bufp->buffer;
748
749 print_partial_compiled_pattern (buffer, buffer + bufp->used);
750 fprintf (stderr, "%td bytes used/%td bytes allocated.\n",
751 bufp->used, bufp->allocated);
752
753 if (bufp->fastmap_accurate && bufp->fastmap)
754 {
755 fputs ("fastmap: ", stderr);
756 print_fastmap (bufp->fastmap);
757 }
758
759 fprintf (stderr, "re_nsub: %td\t", bufp->re_nsub);
760 fprintf (stderr, "regs_alloc: %d\t", bufp->regs_allocated);
761 fprintf (stderr, "can_be_null: %d\n", bufp->can_be_null);
762 /* Perhaps we should print the translate table? */
763 }
764
765
766 static void
767 print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
768 re_char *string2, ptrdiff_t size2)
769 {
770 if (where == NULL)
771 fputs ("(null)", stderr);
772 else
773 {
774 int i;
775 if (FIRST_STRING_P (where))
776 {
777 for (i = 0; i < string1 + size1 - where; i++)
778 debug_putchar (where[i]);
779 where = string2;
780 }
781
782 for (i = 0; i < string2 + size2 - where; i++)
783 debug_putchar (where[i]);
784 }
785 }
786
787 #else /* not REGEX_EMACS_DEBUG */
788
789 # define DEBUG_STATEMENT(e)
790 # define DEBUG_PRINT(...)
791 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
792 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
793
794 #endif /* not REGEX_EMACS_DEBUG */
795
796 typedef enum
797 {
798 REG_NOERROR = 0, /* Success. */
799 REG_NOMATCH, /* Didn't find a match (for regexec). */
800
801 /* POSIX regcomp return error codes. (In the order listed in the
802 standard.) An older version of this code supported the POSIX
803 API; this version continues to use these names internally. */
804 REG_BADPAT, /* Invalid pattern. */
805 REG_ECOLLATE, /* Not implemented. */
806 REG_ECTYPE, /* Invalid character class name. */
807 REG_EESCAPE, /* Trailing backslash. */
808 REG_ESUBREG, /* Invalid back reference. */
809 REG_EBRACK, /* Unmatched left bracket. */
810 REG_EPAREN, /* Parenthesis imbalance. */
811 REG_EBRACE, /* Unmatched \{. */
812 REG_BADBR, /* Invalid contents of \{\}. */
813 REG_ERANGE, /* Invalid range end. */
814 REG_ESPACE, /* Ran out of memory. */
815 REG_BADRPT, /* No preceding re for repetition op. */
816
817 /* Error codes we've added. */
818 REG_EEND, /* Premature end. */
819 REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */
820 REG_ERPAREN, /* Unmatched ) or \); not returned from regcomp. */
821 REG_ERANGEX, /* Range striding over charsets. */
822 REG_ESIZEBR /* n or m too big in \{n,m\} */
823 } reg_errcode_t;
824
825 static const char *re_error_msgid[] =
826 {
827 [REG_NOERROR] = "Success",
828 [REG_NOMATCH] = "No match",
829 [REG_BADPAT] = "Invalid regular expression",
830 [REG_ECOLLATE] = "Invalid collation character",
831 [REG_ECTYPE] = "Invalid character class name",
832 [REG_EESCAPE] = "Trailing backslash",
833 [REG_ESUBREG] = "Invalid back reference",
834 [REG_EBRACK] = "Unmatched [ or [^",
835 [REG_EPAREN] = "Unmatched ( or \\(",
836 [REG_EBRACE] = "Unmatched \\{",
837 [REG_BADBR] = "Invalid content of \\{\\}",
838 [REG_ERANGE] = "Invalid range end",
839 [REG_ESPACE] = "Memory exhausted",
840 [REG_BADRPT] = "Invalid preceding regular expression",
841 [REG_EEND] = "Premature end of regular expression",
842 [REG_ESIZE] = "Regular expression too big",
843 [REG_ERPAREN] = "Unmatched ) or \\)",
844 [REG_ERANGEX ] = "Range striding over charsets",
845 [REG_ESIZEBR ] = "Invalid content of \\{\\}",
846 };
847
848 /* For 'regs_allocated'. */
849 enum { REGS_UNALLOCATED, REGS_REALLOCATE, REGS_FIXED };
850
851 /* If 'regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
852 're_match_2' returns information about at least this many registers
853 the first time a 'regs' structure is passed. */
854 enum { RE_NREGS = 30 };
855
856 /* The searching and matching functions allocate memory for the
857 failure stack and registers. Otherwise searching and matching
858 routines would have much smaller memory resources at their
859 disposal, and therefore might fail to handle complex regexps. */
860
861 /* Failure stack declarations and macros; both re_compile_fastmap and
862 re_match_2 use a failure stack. These have to be macros because of
863 SAFE_ALLOCA. */
864
865
866 /* Approximate number of failure points for which to initially allocate space
867 when matching. If this number is exceeded, we allocate more
868 space, so it is not a hard limit. */
869 #define INIT_FAILURE_ALLOC 20
870
871 /* Roughly the maximum number of failure points on the stack. Would be
872 exactly that if failure always used TYPICAL_FAILURE_SIZE items.
873 This is a variable only so users of regex can assign to it; we never
874 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
875 before using it, so it should probably be a byte-count instead. */
876 /* Note that 4400 was enough to cause a crash on Alpha OSF/1,
877 whose default stack limit is 2mb. In order for a larger
878 value to work reliably, you have to try to make it accord
879 with the process stack limit. */
880 ptrdiff_t emacs_re_max_failures = 40000;
881
882 union fail_stack_elt
883 {
884 re_char *pointer;
885 intptr_t integer;
886 };
887
888 typedef union fail_stack_elt fail_stack_elt_t;
889
890 typedef struct
891 {
892 fail_stack_elt_t *stack;
893 ptrdiff_t size;
894 ptrdiff_t avail; /* Offset of next open position. */
895 ptrdiff_t frame; /* Offset of the cur constructed frame. */
896 } fail_stack_type;
897
898 #define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
899
900
901 /* Define macros to initialize and free the failure stack. */
902
903 #define INIT_FAIL_STACK() \
904 do { \
905 fail_stack.stack = \
906 SAFE_ALLOCA (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
907 * sizeof (fail_stack_elt_t)); \
908 fail_stack.size = INIT_FAILURE_ALLOC; \
909 fail_stack.avail = 0; \
910 fail_stack.frame = 0; \
911 } while (false)
912
913
914 /* Double the size of FAIL_STACK, up to a limit
915 which allows approximately 'emacs_re_max_failures' items.
916
917 Return 1 if succeeds, and 0 if either ran out of memory
918 allocating space for it or it was already too large.
919
920 REGEX_REALLOCATE requires 'destination' be declared. */
921
922 /* Factor to increase the failure stack size by.
923 This used to be 2, but 2 was too wasteful
924 because the old discarded stacks added up to as much space
925 were as ultimate, maximum-size stack. */
926 #define FAIL_STACK_GROWTH_FACTOR 4
927
928 #define GROW_FAIL_STACK(fail_stack) \
929 (((fail_stack).size >= emacs_re_max_failures * TYPICAL_FAILURE_SIZE) \
930 ? 0 \
931 : ((fail_stack).stack \
932 = REGEX_REALLOCATE ((fail_stack).stack, \
933 (fail_stack).avail * sizeof (fail_stack_elt_t), \
934 min (emacs_re_max_failures * TYPICAL_FAILURE_SIZE, \
935 ((fail_stack).size * FAIL_STACK_GROWTH_FACTOR)) \
936 * sizeof (fail_stack_elt_t)), \
937 ((fail_stack).size \
938 = (min (emacs_re_max_failures * TYPICAL_FAILURE_SIZE, \
939 ((fail_stack).size * FAIL_STACK_GROWTH_FACTOR)))), \
940 1))
941
942
943 /* Push a pointer value onto the failure stack.
944 Assumes the variable 'fail_stack'. Probably should only
945 be called from within 'PUSH_FAILURE_POINT'. */
946 #define PUSH_FAILURE_POINTER(item) \
947 fail_stack.stack[fail_stack.avail++].pointer = (item)
948
949 /* This pushes an integer-valued item onto the failure stack.
950 Assumes the variable 'fail_stack'. Probably should only
951 be called from within 'PUSH_FAILURE_POINT'. */
952 #define PUSH_FAILURE_INT(item) \
953 fail_stack.stack[fail_stack.avail++].integer = (item)
954
955 /* These POP... operations complement the PUSH... operations.
956 All assume that 'fail_stack' is nonempty. */
957 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
958 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
959
960 /* Individual items aside from the registers. */
961 #define NUM_NONREG_ITEMS 3
962
963 /* Used to examine the stack (to detect infinite loops). */
964 #define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
965 #define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
966 #define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
967 #define TOP_FAILURE_HANDLE() fail_stack.frame
968
969
970 #define ENSURE_FAIL_STACK(space) \
971 while (REMAINING_AVAIL_SLOTS <= space) { \
972 if (!GROW_FAIL_STACK (fail_stack)) \
973 { \
974 unbind_to (count, Qnil); \
975 SAFE_FREE (); \
976 return -2; \
977 } \
978 DEBUG_PRINT ("\n Doubled stack; size now: %td\n", fail_stack.size); \
979 DEBUG_PRINT (" slots available: %td\n", REMAINING_AVAIL_SLOTS);\
980 }
981
982 /* Push register NUM onto the stack. */
983 #define PUSH_FAILURE_REG(num) \
984 do { \
985 char *destination; \
986 intptr_t n = num; \
987 eassert (0 < n && n < num_regs); \
988 eassert (REG_UNSET (regstart[n]) <= REG_UNSET (regend[n])); \
989 ENSURE_FAIL_STACK(3); \
990 DEBUG_PRINT (" Push reg %"PRIdPTR" (spanning %p -> %p)\n", \
991 n, regstart[n], regend[n]); \
992 PUSH_FAILURE_POINTER (regstart[n]); \
993 PUSH_FAILURE_POINTER (regend[n]); \
994 PUSH_FAILURE_INT (n); \
995 } while (false)
996
997 /* Change the counter's value to VAL, but make sure that it will
998 be reset when backtracking. */
999 #define PUSH_NUMBER(ptr,val) \
1000 do { \
1001 char *destination; \
1002 int c; \
1003 ENSURE_FAIL_STACK(3); \
1004 EXTRACT_NUMBER (c, ptr); \
1005 DEBUG_PRINT (" Push number %p = %d -> %d\n", ptr, c, val); \
1006 PUSH_FAILURE_INT (c); \
1007 PUSH_FAILURE_POINTER (ptr); \
1008 PUSH_FAILURE_INT (-1); \
1009 STORE_NUMBER (ptr, val); \
1010 } while (false)
1011
1012 /* Pop a saved register off the stack. */
1013 #define POP_FAILURE_REG_OR_COUNT() \
1014 do { \
1015 intptr_t pfreg = POP_FAILURE_INT (); \
1016 if (pfreg == -1) \
1017 { \
1018 /* It's a counter. */ \
1019 /* Discard 'const', making re_search non-reentrant. */ \
1020 unsigned char *ptr = (unsigned char *) POP_FAILURE_POINTER (); \
1021 pfreg = POP_FAILURE_INT (); \
1022 STORE_NUMBER (ptr, pfreg); \
1023 DEBUG_PRINT (" Pop counter %p = %"PRIdPTR"\n", ptr, pfreg); \
1024 } \
1025 else \
1026 { \
1027 eassert (0 < pfreg && pfreg < num_regs); \
1028 regend[pfreg] = POP_FAILURE_POINTER (); \
1029 regstart[pfreg] = POP_FAILURE_POINTER (); \
1030 eassert (REG_UNSET (regstart[pfreg]) <= REG_UNSET (regend[pfreg])); \
1031 DEBUG_PRINT (" Pop reg %ld (spanning %p -> %p)\n", \
1032 pfreg, regstart[pfreg], regend[pfreg]); \
1033 } \
1034 } while (false)
1035
1036 /* Check that we are not stuck in an infinite loop. */
1037 #define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1038 do { \
1039 ptrdiff_t failure = TOP_FAILURE_HANDLE (); \
1040 /* Check for infinite matching loops */ \
1041 while (failure > 0 \
1042 && (FAILURE_STR (failure) == string_place \
1043 || FAILURE_STR (failure) == NULL)) \
1044 { \
1045 eassert (FAILURE_PAT (failure) >= bufp->buffer \
1046 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
1047 if (FAILURE_PAT (failure) == pat_cur) \
1048 { \
1049 cycle = true; \
1050 break; \
1051 } \
1052 DEBUG_PRINT (" Other pattern: %p\n", FAILURE_PAT (failure)); \
1053 failure = NEXT_FAILURE_HANDLE(failure); \
1054 } \
1055 DEBUG_PRINT (" Other string: %p\n", FAILURE_STR (failure)); \
1056 } while (false)
1057
1058 /* Push the information about the state we will need
1059 if we ever fail back to it.
1060
1061 Requires variables fail_stack, regstart, regend and
1062 num_regs be declared. GROW_FAIL_STACK requires 'destination' be
1063 declared.
1064
1065 Does 'return FAILURE_CODE' if runs out of memory. */
1066
1067 #define PUSH_FAILURE_POINT(pattern, string_place) \
1068 do { \
1069 char *destination; \
1070 DEBUG_STATEMENT (nfailure_points_pushed++); \
1071 DEBUG_PRINT ("\nPUSH_FAILURE_POINT:\n"); \
1072 DEBUG_PRINT (" Before push, next avail: %td\n", fail_stack.avail); \
1073 DEBUG_PRINT (" size: %td\n", fail_stack.size); \
1074 \
1075 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1076 \
1077 DEBUG_PRINT ("\n"); \
1078 \
1079 DEBUG_PRINT (" Push frame index: %td\n", fail_stack.frame); \
1080 PUSH_FAILURE_INT (fail_stack.frame); \
1081 \
1082 DEBUG_PRINT (" Push string %p: \"", string_place); \
1083 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1084 DEBUG_PRINT ("\"\n"); \
1085 PUSH_FAILURE_POINTER (string_place); \
1086 \
1087 DEBUG_PRINT (" Push pattern %p: ", pattern); \
1088 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1089 PUSH_FAILURE_POINTER (pattern); \
1090 \
1091 /* Close the frame by moving the frame pointer past it. */ \
1092 fail_stack.frame = fail_stack.avail; \
1093 } while (false)
1094
1095 /* Estimate the size of data pushed by a typical failure stack entry.
1096 An estimate is all we need, because all we use this for
1097 is to choose a limit for how big to make the failure stack. */
1098 /* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
1099 #define TYPICAL_FAILURE_SIZE 20
1100
1101 /* How many items can still be added to the stack without overflowing it. */
1102 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1103
1104
1105 /* Pop what PUSH_FAIL_STACK pushes.
1106
1107 Restore into the parameters, all of which should be lvalues:
1108 STR -- the saved data position.
1109 PAT -- the saved pattern position.
1110 REGSTART, REGEND -- arrays of string positions.
1111
1112 Also assume the variables FAIL_STACK and (if debugging) BUFP, PEND,
1113 STRING1, SIZE1, STRING2, and SIZE2. */
1114
1115 #define POP_FAILURE_POINT(str, pat) \
1116 do { \
1117 eassert (!FAIL_STACK_EMPTY ()); \
1118 \
1119 /* Remove failure points and point to how many regs pushed. */ \
1120 DEBUG_PRINT ("POP_FAILURE_POINT:\n"); \
1121 DEBUG_PRINT (" Before pop, next avail: %td\n", fail_stack.avail); \
1122 DEBUG_PRINT (" size: %td\n", fail_stack.size); \
1123 \
1124 /* Pop the saved registers. */ \
1125 while (fail_stack.frame < fail_stack.avail) \
1126 POP_FAILURE_REG_OR_COUNT (); \
1127 \
1128 pat = POP_FAILURE_POINTER (); \
1129 DEBUG_PRINT (" Popping pattern %p: ", pat); \
1130 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1131 \
1132 /* If the saved string location is NULL, it came from an \
1133 on_failure_keep_string_jump opcode, and we want to throw away the \
1134 saved NULL, thus retaining our current position in the string. */ \
1135 str = POP_FAILURE_POINTER (); \
1136 DEBUG_PRINT (" Popping string %p: \"", str); \
1137 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1138 DEBUG_PRINT ("\"\n"); \
1139 \
1140 fail_stack.frame = POP_FAILURE_INT (); \
1141 DEBUG_PRINT (" Popping frame index: %td\n", fail_stack.frame); \
1142 \
1143 eassert (fail_stack.avail >= 0); \
1144 eassert (fail_stack.frame <= fail_stack.avail); \
1145 \
1146 DEBUG_STATEMENT (nfailure_points_popped++); \
1147 } while (false) /* POP_FAILURE_POINT */
1148
1149
1150
1151 /* Registers are set to a sentinel when they haven't yet matched. */
1152 #define REG_UNSET(e) ((e) == NULL)
1153
1154 /* Subroutine declarations and macros for regex_compile. */
1155
1156 static reg_errcode_t regex_compile (re_char *pattern, ptrdiff_t size,
1157 bool posix_backtracking,
1158 const char *whitespace_regexp,
1159 struct re_pattern_buffer *bufp);
1160 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1161 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1162 static void insert_op1 (re_opcode_t op, unsigned char *loc,
1163 int arg, unsigned char *end);
1164 static void insert_op2 (re_opcode_t op, unsigned char *loc,
1165 int arg1, int arg2, unsigned char *end);
1166 static bool at_begline_loc_p (re_char *pattern, re_char *p);
1167 static bool at_endline_loc_p (re_char *p, re_char *pend);
1168 static re_char *skip_one_char (re_char *p);
1169 static int analyze_first (re_char *p, re_char *pend,
1170 char *fastmap, bool multibyte);
1171
1172 /* Fetch the next character in the uncompiled pattern, with no
1173 translation. */
1174 #define PATFETCH(c) \
1175 do { \
1176 int len; \
1177 if (p == pend) return REG_EEND; \
1178 c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte); \
1179 p += len; \
1180 } while (false)
1181
1182
1183 #define RE_TRANSLATE(TBL, C) char_table_translate (TBL, C)
1184 #define TRANSLATE(d) (!NILP (translate) ? RE_TRANSLATE (translate, d) : (d))
1185
1186 /* Macros for outputting the compiled pattern into 'buffer'. */
1187
1188 /* If the buffer isn't allocated when it comes in, use this. */
1189 #define INIT_BUF_SIZE 32
1190
1191 /* Ensure at least N more bytes of space in buffer. */
1192 #define GET_BUFFER_SPACE(n) \
1193 if (bufp->buffer + bufp->allocated - b < (n)) \
1194 EXTEND_BUFFER ((n) - (bufp->buffer + bufp->allocated - b))
1195
1196 /* Ensure one more byte of buffer space and then add C to it. */
1197 #define BUF_PUSH(c) \
1198 do { \
1199 GET_BUFFER_SPACE (1); \
1200 *b++ = (unsigned char) (c); \
1201 } while (false)
1202
1203
1204 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1205 #define BUF_PUSH_2(c1, c2) \
1206 do { \
1207 GET_BUFFER_SPACE (2); \
1208 *b++ = (unsigned char) (c1); \
1209 *b++ = (unsigned char) (c2); \
1210 } while (false)
1211
1212
1213 /* Store a jump with opcode OP at LOC to location TO. Store a
1214 relative address offset by the three bytes the jump itself occupies. */
1215 #define STORE_JUMP(op, loc, to) \
1216 store_op1 (op, loc, (to) - (loc) - 3)
1217
1218 /* Likewise, for a two-argument jump. */
1219 #define STORE_JUMP2(op, loc, to, arg) \
1220 store_op2 (op, loc, (to) - (loc) - 3, arg)
1221
1222 /* Like 'STORE_JUMP', but for inserting. Assume B is the buffer end. */
1223 #define INSERT_JUMP(op, loc, to) \
1224 insert_op1 (op, loc, (to) - (loc) - 3, b)
1225
1226 /* Like 'STORE_JUMP2', but for inserting. Assume B is the buffer end. */
1227 #define INSERT_JUMP2(op, loc, to, arg) \
1228 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1229
1230
1231 /* This is not an arbitrary limit: the arguments which represent offsets
1232 into the pattern are two bytes long. So if 2^15 bytes turns out to
1233 be too small, many things would have to change. */
1234 # define MAX_BUF_SIZE (1 << 15)
1235
1236 /* Extend the buffer by at least N bytes via realloc and
1237 reset the pointers that pointed into the old block to point to the
1238 correct places in the new one. If extending the buffer results in it
1239 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1240 #define EXTEND_BUFFER(n) \
1241 do { \
1242 ptrdiff_t requested_extension = n; \
1243 unsigned char *old_buffer = bufp->buffer; \
1244 if (MAX_BUF_SIZE - bufp->allocated < requested_extension) \
1245 return REG_ESIZE; \
1246 ptrdiff_t b_off = b - old_buffer; \
1247 ptrdiff_t begalt_off = begalt - old_buffer; \
1248 ptrdiff_t fixup_alt_jump_off = \
1249 fixup_alt_jump ? fixup_alt_jump - old_buffer : -1; \
1250 ptrdiff_t laststart_off = laststart ? laststart - old_buffer : -1; \
1251 ptrdiff_t pending_exact_off = \
1252 pending_exact ? pending_exact - old_buffer : -1; \
1253 bufp->buffer = xpalloc (bufp->buffer, &bufp->allocated, \
1254 requested_extension, MAX_BUF_SIZE, 1); \
1255 unsigned char *new_buffer = bufp->buffer; \
1256 b = new_buffer + b_off; \
1257 begalt = new_buffer + begalt_off; \
1258 if (0 <= fixup_alt_jump_off) \
1259 fixup_alt_jump = new_buffer + fixup_alt_jump_off; \
1260 if (0 <= laststart_off) \
1261 laststart = new_buffer + laststart_off; \
1262 if (0 <= pending_exact_off) \
1263 pending_exact = new_buffer + pending_exact_off; \
1264 } while (false)
1265
1266
1267 /* Since we have one byte reserved for the register number argument to
1268 {start,stop}_memory, the maximum number of groups we can report
1269 things about is what fits in that byte. */
1270 #define MAX_REGNUM 255
1271
1272 /* But patterns can have more than 'MAX_REGNUM' registers. Just
1273 ignore the excess. */
1274 typedef int regnum_t;
1275
1276
1277 /* Macros for the compile stack. */
1278
1279 typedef long pattern_offset_t;
1280 verify (LONG_MIN <= -(MAX_BUF_SIZE - 1) && MAX_BUF_SIZE - 1 <= LONG_MAX);
1281
1282 typedef struct
1283 {
1284 pattern_offset_t begalt_offset;
1285 pattern_offset_t fixup_alt_jump;
1286 pattern_offset_t laststart_offset;
1287 regnum_t regnum;
1288 } compile_stack_elt_t;
1289
1290
1291 typedef struct
1292 {
1293 compile_stack_elt_t *stack;
1294 ptrdiff_t size;
1295 ptrdiff_t avail; /* Offset of next open position. */
1296 } compile_stack_type;
1297
1298
1299 #define INIT_COMPILE_STACK_SIZE 32
1300
1301 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1302 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1303
1304 /* The next available element. */
1305 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1306
1307 /* Structure to manage work area for range table. */
1308 struct range_table_work_area
1309 {
1310 int *table; /* actual work area. */
1311 int allocated; /* allocated size for work area in bytes. */
1312 int used; /* actually used size in words. */
1313 int bits; /* flag to record character classes */
1314 };
1315
1316 /* Make sure that WORK_AREA can hold N more multibyte characters.
1317 If it can't get the space, it returns from the surrounding function. */
1318
1319 #define EXTEND_RANGE_TABLE(work_area, n) \
1320 do { \
1321 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1322 { \
1323 extend_range_table_work_area (&work_area); \
1324 if ((work_area).table == 0) \
1325 return (REG_ESPACE); \
1326 } \
1327 } while (false)
1328
1329 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1330 (work_area).bits |= (bit)
1331
1332 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1333 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1334 do { \
1335 EXTEND_RANGE_TABLE ((work_area), 2); \
1336 (work_area).table[(work_area).used++] = (range_start); \
1337 (work_area).table[(work_area).used++] = (range_end); \
1338 } while (false)
1339
1340 /* Free allocated memory for WORK_AREA. */
1341 #define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1342 do { \
1343 if ((work_area).table) \
1344 xfree ((work_area).table); \
1345 } while (false)
1346
1347 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) \
1348 ((work_area).used = 0, (work_area).bits = 0)
1349 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1350 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1351 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1352
1353 /* Bits used to implement the multibyte-part of the various character classes
1354 such as [:alnum:] in a charset's range table. The code currently assumes
1355 that only the low 16 bits are used. */
1356 #define BIT_WORD 0x1
1357 #define BIT_LOWER 0x2
1358 #define BIT_PUNCT 0x4
1359 #define BIT_SPACE 0x8
1360 #define BIT_UPPER 0x10
1361 #define BIT_MULTIBYTE 0x20
1362 #define BIT_ALPHA 0x40
1363 #define BIT_ALNUM 0x80
1364 #define BIT_GRAPH 0x100
1365 #define BIT_PRINT 0x200
1366 #define BIT_BLANK 0x400
1367
1368
1369 /* Set the bit for character C in a list. */
1370 #define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
1371
1372
1373 /* Store characters in the range FROM to TO in the bitmap at B (for
1374 ASCII and unibyte characters) and WORK_AREA (for multibyte
1375 characters) while translating them and paying attention to the
1376 continuity of translated characters.
1377
1378 Implementation note: It is better to implement these fairly big
1379 macros by a function, but it's not that easy because macros called
1380 in this macro assume various local variables already declared. */
1381
1382 /* Both FROM and TO are ASCII characters. */
1383
1384 #define SETUP_ASCII_RANGE(work_area, FROM, TO) \
1385 do { \
1386 int C0, C1; \
1387 \
1388 for (C0 = (FROM); C0 <= (TO); C0++) \
1389 { \
1390 C1 = TRANSLATE (C0); \
1391 if (! ASCII_CHAR_P (C1)) \
1392 { \
1393 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
1394 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
1395 C1 = C0; \
1396 } \
1397 SET_LIST_BIT (C1); \
1398 } \
1399 } while (false)
1400
1401
1402 /* Both FROM and TO are unibyte characters (0x80..0xFF). */
1403
1404 #define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
1405 do { \
1406 int C0, C1, C2, I; \
1407 int USED = RANGE_TABLE_WORK_USED (work_area); \
1408 \
1409 for (C0 = (FROM); C0 <= (TO); C0++) \
1410 { \
1411 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
1412 if (CHAR_BYTE8_P (C1)) \
1413 SET_LIST_BIT (C0); \
1414 else \
1415 { \
1416 C2 = TRANSLATE (C1); \
1417 if (C2 == C1 \
1418 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
1419 C1 = C0; \
1420 SET_LIST_BIT (C1); \
1421 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1422 { \
1423 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1424 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1425 \
1426 if (C2 >= from - 1 && C2 <= to + 1) \
1427 { \
1428 if (C2 == from - 1) \
1429 RANGE_TABLE_WORK_ELT (work_area, I)--; \
1430 else if (C2 == to + 1) \
1431 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
1432 break; \
1433 } \
1434 } \
1435 if (I < USED) \
1436 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
1437 } \
1438 } \
1439 } while (false)
1440
1441
1442 /* Both FROM and TO are multibyte characters. */
1443
1444 #define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
1445 do { \
1446 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
1447 \
1448 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
1449 for (C0 = (FROM); C0 <= (TO); C0++) \
1450 { \
1451 C1 = TRANSLATE (C0); \
1452 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
1453 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
1454 SET_LIST_BIT (C2); \
1455 if (C1 >= (FROM) && C1 <= (TO)) \
1456 continue; \
1457 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1458 { \
1459 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1460 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1461 \
1462 if (C1 >= from - 1 && C1 <= to + 1) \
1463 { \
1464 if (C1 == from - 1) \
1465 RANGE_TABLE_WORK_ELT (work_area, I)--; \
1466 else if (C1 == to + 1) \
1467 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
1468 break; \
1469 } \
1470 } \
1471 if (I < USED) \
1472 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
1473 } \
1474 } while (false)
1475
1476 /* Get the next unsigned number in the uncompiled pattern. */
1477 #define GET_INTERVAL_COUNT(num) \
1478 do { \
1479 if (p == pend) \
1480 FREE_STACK_RETURN (REG_EBRACE); \
1481 else \
1482 { \
1483 PATFETCH (c); \
1484 while ('0' <= c && c <= '9') \
1485 { \
1486 if (num < 0) \
1487 num = 0; \
1488 if (RE_DUP_MAX / 10 - (RE_DUP_MAX % 10 < c - '0') < num) \
1489 FREE_STACK_RETURN (REG_ESIZEBR); \
1490 num = num * 10 + c - '0'; \
1491 if (p == pend) \
1492 FREE_STACK_RETURN (REG_EBRACE); \
1493 PATFETCH (c); \
1494 } \
1495 } \
1496 } while (false)
1497
1498 /* Parse a character class, i.e. string such as "[:name:]". *strp
1499 points to the string to be parsed and limit is length, in bytes, of
1500 that string.
1501
1502 If *strp point to a string that begins with "[:name:]", where name is
1503 a non-empty sequence of lower case letters, *strp will be advanced past the
1504 closing square bracket and RECC_* constant which maps to the name will be
1505 returned. If name is not a valid character class name zero, or RECC_ERROR,
1506 is returned.
1507
1508 Otherwise, if *strp doesn't begin with "[:name:]", -1 is returned.
1509
1510 The function can be used on ASCII and multibyte (UTF-8-encoded) strings.
1511 */
1512 re_wctype_t
1513 re_wctype_parse (const unsigned char **strp, ptrdiff_t limit)
1514 {
1515 const char *beg = (const char *)*strp, *it;
1516
1517 if (limit < 4 || beg[0] != '[' || beg[1] != ':')
1518 return -1;
1519
1520 beg += 2; /* skip opening "[:" */
1521 limit -= 3; /* opening "[:" and half of closing ":]"; --limit handles rest */
1522 for (it = beg; it[0] != ':' || it[1] != ']'; ++it)
1523 if (!--limit)
1524 return -1;
1525
1526 *strp = (const unsigned char *)(it + 2);
1527
1528 /* Sort tests in the length=five case by frequency the classes to minimize
1529 number of times we fail the comparison. The frequencies of character class
1530 names used in Emacs sources as of 2016-07-27:
1531
1532 $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + |
1533 sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr
1534 213 [:alnum:]
1535 104 [:alpha:]
1536 62 [:space:]
1537 39 [:digit:]
1538 36 [:blank:]
1539 26 [:word:]
1540 26 [:upper:]
1541 21 [:lower:]
1542 10 [:xdigit:]
1543 10 [:punct:]
1544 10 [:ascii:]
1545 4 [:nonascii:]
1546 4 [:graph:]
1547 2 [:print:]
1548 2 [:cntrl:]
1549 1 [:ff:]
1550
1551 If you update this list, consider also updating chain of or'ed conditions
1552 in execute_charset function.
1553 */
1554
1555 switch (it - beg) {
1556 case 4:
1557 if (!memcmp (beg, "word", 4)) return RECC_WORD;
1558 break;
1559 case 5:
1560 if (!memcmp (beg, "alnum", 5)) return RECC_ALNUM;
1561 if (!memcmp (beg, "alpha", 5)) return RECC_ALPHA;
1562 if (!memcmp (beg, "space", 5)) return RECC_SPACE;
1563 if (!memcmp (beg, "digit", 5)) return RECC_DIGIT;
1564 if (!memcmp (beg, "blank", 5)) return RECC_BLANK;
1565 if (!memcmp (beg, "upper", 5)) return RECC_UPPER;
1566 if (!memcmp (beg, "lower", 5)) return RECC_LOWER;
1567 if (!memcmp (beg, "punct", 5)) return RECC_PUNCT;
1568 if (!memcmp (beg, "ascii", 5)) return RECC_ASCII;
1569 if (!memcmp (beg, "graph", 5)) return RECC_GRAPH;
1570 if (!memcmp (beg, "print", 5)) return RECC_PRINT;
1571 if (!memcmp (beg, "cntrl", 5)) return RECC_CNTRL;
1572 break;
1573 case 6:
1574 if (!memcmp (beg, "xdigit", 6)) return RECC_XDIGIT;
1575 break;
1576 case 7:
1577 if (!memcmp (beg, "unibyte", 7)) return RECC_UNIBYTE;
1578 break;
1579 case 8:
1580 if (!memcmp (beg, "nonascii", 8)) return RECC_NONASCII;
1581 break;
1582 case 9:
1583 if (!memcmp (beg, "multibyte", 9)) return RECC_MULTIBYTE;
1584 break;
1585 }
1586
1587 return RECC_ERROR;
1588 }
1589
1590 /* True if CH is in the char class CC. */
1591 bool
1592 re_iswctype (int ch, re_wctype_t cc)
1593 {
1594 switch (cc)
1595 {
1596 case RECC_ALNUM: return ISALNUM (ch) != 0;
1597 case RECC_ALPHA: return ISALPHA (ch) != 0;
1598 case RECC_BLANK: return ISBLANK (ch) != 0;
1599 case RECC_CNTRL: return ISCNTRL (ch) != 0;
1600 case RECC_DIGIT: return ISDIGIT (ch) != 0;
1601 case RECC_GRAPH: return ISGRAPH (ch) != 0;
1602 case RECC_LOWER: return ISLOWER (ch) != 0;
1603 case RECC_PRINT: return ISPRINT (ch) != 0;
1604 case RECC_PUNCT: return ISPUNCT (ch) != 0;
1605 case RECC_SPACE: return ISSPACE (ch) != 0;
1606 case RECC_UPPER: return ISUPPER (ch) != 0;
1607 case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
1608 case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
1609 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
1610 case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
1611 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
1612 case RECC_WORD: return ISWORD (ch) != 0;
1613 case RECC_ERROR: return false;
1614 default:
1615 abort ();
1616 }
1617 }
1618
1619 /* Return a bit-pattern to use in the range-table bits to match multibyte
1620 chars of class CC. */
1621 static int
1622 re_wctype_to_bit (re_wctype_t cc)
1623 {
1624 switch (cc)
1625 {
1626 case RECC_NONASCII:
1627 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
1628 case RECC_ALPHA: return BIT_ALPHA;
1629 case RECC_ALNUM: return BIT_ALNUM;
1630 case RECC_WORD: return BIT_WORD;
1631 case RECC_LOWER: return BIT_LOWER;
1632 case RECC_UPPER: return BIT_UPPER;
1633 case RECC_PUNCT: return BIT_PUNCT;
1634 case RECC_SPACE: return BIT_SPACE;
1635 case RECC_GRAPH: return BIT_GRAPH;
1636 case RECC_PRINT: return BIT_PRINT;
1637 case RECC_BLANK: return BIT_BLANK;
1638 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
1639 case RECC_UNIBYTE: case RECC_ERROR: return 0;
1640 default:
1641 abort ();
1642 }
1643 }
1644
1645 /* Filling in the work area of a range. */
1646
1647 /* Actually extend the space in WORK_AREA. */
1648
1649 static void
1650 extend_range_table_work_area (struct range_table_work_area *work_area)
1651 {
1652 work_area->allocated += 16 * sizeof (int);
1653 work_area->table = xrealloc (work_area->table, work_area->allocated);
1654 }
1655
1656 /* regex_compile and helpers. */
1657
1658 static bool group_in_compile_stack (compile_stack_type, regnum_t);
1659
1660 /* Insert the 'jump' from the end of last alternative to "here".
1661 The space for the jump has already been allocated. */
1662 #define FIXUP_ALT_JUMP() \
1663 do { \
1664 if (fixup_alt_jump) \
1665 STORE_JUMP (jump, fixup_alt_jump, b); \
1666 } while (false)
1667
1668
1669 /* Return, freeing storage we allocated. */
1670 #define FREE_STACK_RETURN(value) \
1671 do { \
1672 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
1673 xfree (compile_stack.stack); \
1674 return value; \
1675 } while (false)
1676
1677 /* Compile PATTERN (of length SIZE) according to SYNTAX.
1678 Return a nonzero error code on failure, or zero for success.
1679
1680 If WHITESPACE_REGEXP is given, use it instead of a space
1681 character in PATTERN.
1682
1683 Assume the 'allocated' (and perhaps 'buffer') and 'translate'
1684 fields are set in BUFP on entry.
1685
1686 If successful, put results in *BUFP (otherwise the
1687 contents of *BUFP are undefined):
1688 'buffer' is the compiled pattern;
1689 'syntax' is set to SYNTAX;
1690 'used' is set to the length of the compiled pattern;
1691 'fastmap_accurate' is false;
1692 're_nsub' is the number of subexpressions in PATTERN;
1693
1694 The 'fastmap' field is neither examined nor set. */
1695
1696 static reg_errcode_t
1697 regex_compile (re_char *pattern, ptrdiff_t size,
1698 bool posix_backtracking,
1699 const char *whitespace_regexp,
1700 struct re_pattern_buffer *bufp)
1701 {
1702 /* Fetch characters from PATTERN here. */
1703 int c, c1;
1704
1705 /* Points to the end of the buffer, where we should append. */
1706 unsigned char *b;
1707
1708 /* Keeps track of unclosed groups. */
1709 compile_stack_type compile_stack;
1710
1711 /* Points to the current (ending) position in the pattern. */
1712 re_char *p = pattern;
1713 re_char *pend = pattern + size;
1714
1715 /* How to translate the characters in the pattern. */
1716 Lisp_Object translate = bufp->translate;
1717
1718 /* Address of the count-byte of the most recently inserted 'exactn'
1719 command. This makes it possible to tell if a new exact-match
1720 character can be added to that command or if the character requires
1721 a new 'exactn' command. */
1722 unsigned char *pending_exact = 0;
1723
1724 /* Address of start of the most recently finished expression.
1725 This tells, e.g., postfix * where to find the start of its
1726 operand. Reset at the beginning of groups and alternatives,
1727 and after ^ and \` for dusty-deck compatibility. */
1728 unsigned char *laststart = 0;
1729
1730 /* Address of beginning of regexp, or inside of last group. */
1731 unsigned char *begalt;
1732
1733 /* Place in the uncompiled pattern (i.e., the {) to
1734 which to go back if the interval is invalid. */
1735 re_char *beg_interval;
1736
1737 /* Address of the place where a forward jump should go to the end of
1738 the containing expression. Each alternative of an 'or' -- except the
1739 last -- ends with a forward jump of this sort. */
1740 unsigned char *fixup_alt_jump = 0;
1741
1742 /* Work area for range table of charset. */
1743 struct range_table_work_area range_table_work;
1744
1745 /* If the regular expression is multibyte. */
1746 bool multibyte = RE_MULTIBYTE_P (bufp);
1747
1748 /* Nonzero if we have pushed down into a subpattern. */
1749 int in_subpattern = 0;
1750
1751 /* These hold the values of p, pattern, and pend from the main
1752 pattern when we have pushed into a subpattern. */
1753 re_char *main_p;
1754 re_char *main_pattern;
1755 re_char *main_pend;
1756
1757 #ifdef REGEX_EMACS_DEBUG
1758 regex_emacs_debug++;
1759 DEBUG_PRINT ("\nCompiling pattern: ");
1760 if (regex_emacs_debug > 0)
1761 {
1762 for (ptrdiff_t debug_count = 0; debug_count < size; debug_count++)
1763 debug_putchar (pattern[debug_count]);
1764 putc ('\n', stderr);
1765 }
1766 #endif
1767
1768 /* Initialize the compile stack. */
1769 compile_stack.stack = xmalloc (INIT_COMPILE_STACK_SIZE
1770 * sizeof *compile_stack.stack);
1771 __lsan_ignore_object (compile_stack.stack);
1772 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1773 compile_stack.avail = 0;
1774
1775 range_table_work.table = 0;
1776 range_table_work.allocated = 0;
1777
1778 /* Initialize the pattern buffer. */
1779 bufp->fastmap_accurate = false;
1780 bufp->used_syntax = false;
1781
1782 /* Set 'used' to zero, so that if we return an error, the pattern
1783 printer (for debugging) will think there's no pattern. We reset it
1784 at the end. */
1785 bufp->used = 0;
1786
1787 bufp->re_nsub = 0;
1788
1789 if (bufp->allocated == 0)
1790 {
1791 /* This loses if BUFP->buffer is bogus, but that is the user's
1792 responsibility. */
1793 bufp->buffer = xrealloc (bufp->buffer, INIT_BUF_SIZE);
1794 bufp->allocated = INIT_BUF_SIZE;
1795 }
1796
1797 begalt = b = bufp->buffer;
1798
1799 /* Loop through the uncompiled pattern until we're at the end. */
1800 while (1)
1801 {
1802 if (p == pend)
1803 {
1804 /* If this is the end of an included regexp,
1805 pop back to the main regexp and try again. */
1806 if (in_subpattern)
1807 {
1808 in_subpattern = 0;
1809 pattern = main_pattern;
1810 p = main_p;
1811 pend = main_pend;
1812 continue;
1813 }
1814 /* If this is the end of the main regexp, we are done. */
1815 break;
1816 }
1817
1818 PATFETCH (c);
1819
1820 switch (c)
1821 {
1822 case ' ':
1823 {
1824 re_char *p1 = p;
1825
1826 /* If there's no special whitespace regexp, treat
1827 spaces normally. And don't try to do this recursively. */
1828 if (!whitespace_regexp || in_subpattern)
1829 goto normal_char;
1830
1831 /* Peek past following spaces. */
1832 while (p1 != pend)
1833 {
1834 if (*p1 != ' ')
1835 break;
1836 p1++;
1837 }
1838 /* If the spaces are followed by a repetition op,
1839 treat them normally. */
1840 if (p1 != pend
1841 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
1842 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
1843 goto normal_char;
1844
1845 /* Replace the spaces with the whitespace regexp. */
1846 in_subpattern = 1;
1847 main_p = p1;
1848 main_pend = pend;
1849 main_pattern = pattern;
1850 p = pattern = (re_char *) whitespace_regexp;
1851 pend = p + strlen (whitespace_regexp);
1852 break;
1853 }
1854
1855 case '^':
1856 if (! (p == pattern + 1 || at_begline_loc_p (pattern, p)))
1857 goto normal_char;
1858 /* Special case for compatibility: postfix ops after ^ become
1859 literals. */
1860 laststart = 0;
1861 BUF_PUSH (begline);
1862 break;
1863
1864 case '$':
1865 if (! (p == pend || at_endline_loc_p (p, pend)))
1866 goto normal_char;
1867 laststart = b;
1868 BUF_PUSH (endline);
1869 break;
1870
1871
1872 case '+':
1873 case '?':
1874 case '*':
1875 /* If there is no previous pattern... */
1876 if (!laststart)
1877 goto normal_char;
1878
1879 {
1880 /* 1 means zero (many) matches is allowed. */
1881 bool zero_times_ok = false, many_times_ok = false;
1882 bool greedy = true;
1883
1884 /* If there is a sequence of repetition chars, collapse it
1885 down to just one (the right one). We can't combine
1886 interval operators with these because of, e.g., 'a{2}*',
1887 which should only match an even number of 'a's. */
1888
1889 for (;;)
1890 {
1891 if (c == '?' && (zero_times_ok || many_times_ok))
1892 greedy = false;
1893 else
1894 {
1895 zero_times_ok |= c != '+';
1896 many_times_ok |= c != '?';
1897 }
1898
1899 if (! (p < pend && (*p == '*' || *p == '+' || *p == '?')))
1900 break;
1901 /* If we get here, we found another repeat character. */
1902 c = *p++;
1903 }
1904
1905 /* Star, etc. applied to an empty pattern is equivalent
1906 to an empty pattern. */
1907 if (laststart == b)
1908 break;
1909
1910 /* Now we know whether or not zero matches is allowed
1911 and also whether or not two or more matches is allowed. */
1912 if (greedy)
1913 {
1914 if (many_times_ok)
1915 {
1916 bool simple = skip_one_char (laststart) == b;
1917 ptrdiff_t startoffset = 0;
1918 re_opcode_t ofj =
1919 /* Check if the loop can match the empty string. */
1920 (simple || !analyze_first (laststart, b, NULL, false))
1921 ? on_failure_jump : on_failure_jump_loop;
1922 eassert (skip_one_char (laststart) <= b);
1923
1924 if (!zero_times_ok && simple)
1925 { /* Since simple * loops can be made faster by using
1926 on_failure_keep_string_jump, we turn simple P+
1927 into PP* if P is simple. */
1928 unsigned char *p1, *p2;
1929 startoffset = b - laststart;
1930 GET_BUFFER_SPACE (startoffset);
1931 p1 = b; p2 = laststart;
1932 while (p2 < p1)
1933 *b++ = *p2++;
1934 zero_times_ok = 1;
1935 }
1936
1937 GET_BUFFER_SPACE (6);
1938 if (!zero_times_ok)
1939 /* A + loop. */
1940 STORE_JUMP (ofj, b, b + 6);
1941 else
1942 /* Simple * loops can use on_failure_keep_string_jump
1943 depending on what follows. But since we don't know
1944 that yet, we leave the decision up to
1945 on_failure_jump_smart. */
1946 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
1947 laststart + startoffset, b + 6);
1948 b += 3;
1949 STORE_JUMP (jump, b, laststart + startoffset);
1950 b += 3;
1951 }
1952 else
1953 {
1954 /* A simple ? pattern. */
1955 eassert (zero_times_ok);
1956 GET_BUFFER_SPACE (3);
1957 INSERT_JUMP (on_failure_jump, laststart, b + 3);
1958 b += 3;
1959 }
1960 }
1961 else /* not greedy */
1962 { /* I wish the greedy and non-greedy cases could be merged. */
1963
1964 GET_BUFFER_SPACE (7); /* We might use less. */
1965 if (many_times_ok)
1966 {
1967 bool emptyp = !!analyze_first (laststart, b, NULL, false);
1968
1969 /* The non-greedy multiple match looks like
1970 a repeat..until: we only need a conditional jump
1971 at the end of the loop. */
1972 if (emptyp) BUF_PUSH (no_op);
1973 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
1974 : on_failure_jump, b, laststart);
1975 b += 3;
1976 if (zero_times_ok)
1977 {
1978 /* The repeat...until naturally matches one or more.
1979 To also match zero times, we need to first jump to
1980 the end of the loop (its conditional jump). */
1981 INSERT_JUMP (jump, laststart, b);
1982 b += 3;
1983 }
1984 }
1985 else
1986 {
1987 /* non-greedy a?? */
1988 INSERT_JUMP (jump, laststart, b + 3);
1989 b += 3;
1990 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
1991 b += 3;
1992 }
1993 }
1994 }
1995 pending_exact = 0;
1996 break;
1997
1998
1999 case '.':
2000 laststart = b;
2001 BUF_PUSH (anychar);
2002 break;
2003
2004
2005 case '[':
2006 {
2007 re_char *p1;
2008
2009 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2010
2011 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2012
2013 /* Ensure that we have enough space to push a charset: the
2014 opcode, the length count, and the bitset; 34 bytes in all. */
2015 GET_BUFFER_SPACE (34);
2016
2017 laststart = b;
2018
2019 /* Test '*p == '^' twice, instead of using an if
2020 statement, so we need only one BUF_PUSH. */
2021 BUF_PUSH (*p == '^' ? charset_not : charset);
2022 if (*p == '^')
2023 p++;
2024
2025 /* Remember the first position in the bracket expression. */
2026 p1 = p;
2027
2028 /* Push the number of bytes in the bitmap. */
2029 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2030
2031 /* Clear the whole map. */
2032 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2033
2034 /* Read in characters and ranges, setting map bits. */
2035 for (;;)
2036 {
2037 const unsigned char *p2 = p;
2038 int ch;
2039
2040 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2041
2042 /* See if we're at the beginning of a possible character
2043 class. */
2044 re_wctype_t cc = re_wctype_parse (&p, pend - p);
2045 if (cc != -1)
2046 {
2047 if (cc == 0)
2048 FREE_STACK_RETURN (REG_ECTYPE);
2049
2050 if (p == pend)
2051 FREE_STACK_RETURN (REG_EBRACK);
2052
2053 /* Most character classes in a multibyte match just set
2054 a flag. Exceptions are is_blank, is_digit, is_cntrl, and
2055 is_xdigit, since they can only match ASCII characters.
2056 We don't need to handle them for multibyte. */
2057
2058 for (c = 0; c < 0x80; ++c)
2059 if (re_iswctype (c, cc))
2060 {
2061 SET_LIST_BIT (c);
2062 c1 = TRANSLATE (c);
2063 if (c1 == c)
2064 continue;
2065 if (ASCII_CHAR_P (c1))
2066 SET_LIST_BIT (c1);
2067 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2068 SET_LIST_BIT (c1);
2069 }
2070 SET_RANGE_TABLE_WORK_AREA_BIT
2071 (range_table_work, re_wctype_to_bit (cc));
2072
2073 /* In most cases the matching rule for char classes only
2074 uses the syntax table for multibyte chars, so that the
2075 content of the syntax-table is not hardcoded in the
2076 range_table. SPACE and WORD are the two exceptions. */
2077 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2078 bufp->used_syntax = true;
2079
2080 /* Repeat the loop. */
2081 continue;
2082 }
2083
2084 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2085 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2086 So the translation is done later in a loop. Example:
2087 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
2088 PATFETCH (c);
2089
2090 /* Could be the end of the bracket expression. If it's
2091 not (i.e., when the bracket expression is '[]' so
2092 far), the ']' character bit gets set way below. */
2093 if (c == ']' && p2 != p1)
2094 break;
2095
2096 if (p < pend && p[0] == '-' && p[1] != ']')
2097 {
2098
2099 /* Discard the '-'. */
2100 PATFETCH (c1);
2101
2102 /* Fetch the character which ends the range. */
2103 PATFETCH (c1);
2104
2105 if (CHAR_BYTE8_P (c1)
2106 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
2107 /* Treat the range from a multibyte character to
2108 raw-byte character as empty. */
2109 c = c1 + 1;
2110 }
2111 else
2112 /* Range from C to C. */
2113 c1 = c;
2114
2115 if (c <= c1)
2116 {
2117 if (c < 128)
2118 {
2119 ch = min (127, c1);
2120 SETUP_ASCII_RANGE (range_table_work, c, ch);
2121 c = ch + 1;
2122 if (CHAR_BYTE8_P (c1))
2123 c = BYTE8_TO_CHAR (128);
2124 }
2125 if (c <= c1)
2126 {
2127 if (CHAR_BYTE8_P (c))
2128 {
2129 c = CHAR_TO_BYTE8 (c);
2130 c1 = CHAR_TO_BYTE8 (c1);
2131 for (; c <= c1; c++)
2132 SET_LIST_BIT (c);
2133 }
2134 else if (multibyte)
2135 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
2136 else
2137 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
2138 }
2139 }
2140 }
2141
2142 /* Discard any (non)matching list bytes that are all 0 at the
2143 end of the map. Decrease the map-length byte too. */
2144 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2145 b[-1]--;
2146 b += b[-1];
2147
2148 /* Build real range table from work area. */
2149 if (RANGE_TABLE_WORK_USED (range_table_work)
2150 || RANGE_TABLE_WORK_BITS (range_table_work))
2151 {
2152 int i;
2153 int used = RANGE_TABLE_WORK_USED (range_table_work);
2154
2155 /* Allocate space for COUNT + RANGE_TABLE. Needs two
2156 bytes for flags, two for COUNT, and three bytes for
2157 each character. */
2158 GET_BUFFER_SPACE (4 + used * 3);
2159
2160 /* Indicate the existence of range table. */
2161 laststart[1] |= 0x80;
2162
2163 /* Store the character class flag bits into the range table. */
2164 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2165 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2166
2167 STORE_NUMBER_AND_INCR (b, used / 2);
2168 for (i = 0; i < used; i++)
2169 STORE_CHARACTER_AND_INCR
2170 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2171 }
2172 }
2173 break;
2174
2175
2176 case '\\':
2177 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2178
2179 /* Do not translate the character after the \, so that we can
2180 distinguish, e.g., \B from \b, even if we normally would
2181 translate, e.g., B to b. */
2182 PATFETCH (c);
2183
2184 switch (c)
2185 {
2186 case '(':
2187 {
2188 bool shy = false;
2189 regnum_t regnum = 0;
2190 if (p+1 < pend)
2191 {
2192 /* Look for a special (?...) construct */
2193 if (*p == '?')
2194 {
2195 PATFETCH (c); /* Gobble up the '?'. */
2196 while (!shy)
2197 {
2198 PATFETCH (c);
2199 switch (c)
2200 {
2201 case ':': shy = true; break;
2202 case '0':
2203 /* An explicitly specified regnum must start
2204 with non-0. */
2205 if (regnum == 0)
2206 FREE_STACK_RETURN (REG_BADPAT);
2207 FALLTHROUGH;
2208 case '1': case '2': case '3': case '4':
2209 case '5': case '6': case '7': case '8': case '9':
2210 if (ckd_mul (®num, regnum, 10)
2211 || ckd_add (®num, regnum, c - '0'))
2212 FREE_STACK_RETURN (REG_ESIZE);
2213 break;
2214 default:
2215 /* Only (?:...) is supported right now. */
2216 FREE_STACK_RETURN (REG_BADPAT);
2217 }
2218 }
2219 }
2220 }
2221
2222 if (!shy)
2223 regnum = ++bufp->re_nsub;
2224 else if (regnum)
2225 { /* It's actually not shy, but explicitly numbered. */
2226 shy = false;
2227 if (regnum > bufp->re_nsub)
2228 bufp->re_nsub = regnum;
2229 else if (regnum > bufp->re_nsub
2230 /* Ideally, we'd want to check that the specified
2231 group can't have matched (i.e. all subgroups
2232 using the same regnum are in other branches of
2233 OR patterns), but we don't currently keep track
2234 of enough info to do that easily. */
2235 || group_in_compile_stack (compile_stack, regnum))
2236 FREE_STACK_RETURN (REG_BADPAT);
2237 }
2238 else
2239 /* It's really shy. */
2240 regnum = - bufp->re_nsub;
2241
2242 if (COMPILE_STACK_FULL)
2243 compile_stack.stack
2244 = xpalloc (compile_stack.stack, &compile_stack.size,
2245 1, -1, sizeof *compile_stack.stack);
2246
2247 /* These are the values to restore when we hit end of this
2248 group. They are all relative offsets, so that if the
2249 whole pattern moves because of realloc, they will still
2250 be valid. */
2251 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2252 COMPILE_STACK_TOP.fixup_alt_jump
2253 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2254 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2255 COMPILE_STACK_TOP.regnum = regnum;
2256
2257 /* Do not push a start_memory for groups beyond the last one
2258 we can represent in the compiled pattern. */
2259 if (regnum <= MAX_REGNUM && regnum > 0)
2260 BUF_PUSH_2 (start_memory, regnum);
2261
2262 compile_stack.avail++;
2263
2264 fixup_alt_jump = 0;
2265 laststart = 0;
2266 begalt = b;
2267 /* If we've reached MAX_REGNUM groups, then this open
2268 won't actually generate any code, so we'll have to
2269 clear pending_exact explicitly. */
2270 pending_exact = 0;
2271 break;
2272 }
2273
2274 case ')':
2275 if (COMPILE_STACK_EMPTY)
2276 FREE_STACK_RETURN (REG_ERPAREN);
2277
2278 FIXUP_ALT_JUMP ();
2279
2280 /* See similar code for backslashed left paren above. */
2281 if (COMPILE_STACK_EMPTY)
2282 FREE_STACK_RETURN (REG_ERPAREN);
2283
2284 /* Since we just checked for an empty stack above, this
2285 "can't happen". */
2286 eassert (compile_stack.avail != 0);
2287 {
2288 /* We don't just want to restore into 'regnum', because
2289 later groups should continue to be numbered higher,
2290 as in '(ab)c(de)' -- the second group is #2. */
2291 regnum_t regnum;
2292
2293 compile_stack.avail--;
2294 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2295 fixup_alt_jump
2296 = COMPILE_STACK_TOP.fixup_alt_jump
2297 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2298 : 0;
2299 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2300 regnum = COMPILE_STACK_TOP.regnum;
2301 /* If we've reached MAX_REGNUM groups, then this open
2302 won't actually generate any code, so we'll have to
2303 clear pending_exact explicitly. */
2304 pending_exact = 0;
2305
2306 /* We're at the end of the group, so now we know how many
2307 groups were inside this one. */
2308 if (regnum <= MAX_REGNUM && regnum > 0)
2309 BUF_PUSH_2 (stop_memory, regnum);
2310 }
2311 break;
2312
2313
2314 case '|': /* '\|'. */
2315 /* Insert before the previous alternative a jump which
2316 jumps to this alternative if the former fails. */
2317 GET_BUFFER_SPACE (3);
2318 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2319 pending_exact = 0;
2320 b += 3;
2321
2322 /* The alternative before this one has a jump after it
2323 which gets executed if it gets matched. Adjust that
2324 jump so it will jump to this alternative's analogous
2325 jump (put in below, which in turn will jump to the next
2326 (if any) alternative's such jump, etc.). The last such
2327 jump jumps to the correct final destination. A picture:
2328 _____ _____
2329 | | | |
2330 | v | v
2331 A | B | C
2332
2333 If we are at B, then fixup_alt_jump right now points to a
2334 three-byte space after A. We'll put in the jump, set
2335 fixup_alt_jump to right after B, and leave behind three
2336 bytes which we'll fill in when we get to after C. */
2337
2338 FIXUP_ALT_JUMP ();
2339
2340 /* Mark and leave space for a jump after this alternative,
2341 to be filled in later either by next alternative or
2342 when know we're at the end of a series of alternatives. */
2343 fixup_alt_jump = b;
2344 GET_BUFFER_SPACE (3);
2345 b += 3;
2346
2347 laststart = 0;
2348 begalt = b;
2349 break;
2350
2351
2352 case '{':
2353 {
2354 /* At least (most) this many matches must be made. */
2355 int lower_bound = 0, upper_bound = -1;
2356
2357 beg_interval = p;
2358
2359 GET_INTERVAL_COUNT (lower_bound);
2360
2361 if (c == ',')
2362 GET_INTERVAL_COUNT (upper_bound);
2363 else
2364 /* Interval such as '{1}' => match exactly once. */
2365 upper_bound = lower_bound;
2366
2367 if (lower_bound < 0
2368 || (0 <= upper_bound && upper_bound < lower_bound)
2369 || c != '\\')
2370 FREE_STACK_RETURN (REG_BADBR);
2371 if (p == pend)
2372 FREE_STACK_RETURN (REG_EESCAPE);
2373 if (*p++ != '}')
2374 FREE_STACK_RETURN (REG_BADBR);
2375
2376 /* We just parsed a valid interval. */
2377
2378 /* If it's invalid to have no preceding re. */
2379 if (!laststart)
2380 goto unfetch_interval;
2381
2382 if (upper_bound == 0)
2383 /* If the upper bound is zero, just drop the sub pattern
2384 altogether. */
2385 b = laststart;
2386 else if (lower_bound == 1 && upper_bound == 1)
2387 /* Just match it once: nothing to do here. */
2388 ;
2389
2390 /* Otherwise, we have a nontrivial interval. When
2391 we're all done, the pattern will look like:
2392 set_number_at <jump count> <upper bound>
2393 set_number_at <succeed_n count> <lower bound>
2394 succeed_n <after jump addr> <succeed_n count>
2395 <body of loop>
2396 jump_n <succeed_n addr> <jump count>
2397 (The upper bound and 'jump_n' are omitted if
2398 'upper_bound' is 1, though.) */
2399 else
2400 { /* If the upper bound is > 1, we need to insert
2401 more at the end of the loop. */
2402 int nbytes = upper_bound < 0 ? 3 : upper_bound > 1 ? 5 : 0;
2403 int startoffset = 0;
2404
2405 GET_BUFFER_SPACE (20); /* We might use less. */
2406
2407 if (lower_bound == 0)
2408 {
2409 /* A succeed_n that starts with 0 is really
2410 a simple on_failure_jump_loop. */
2411 INSERT_JUMP (on_failure_jump_loop, laststart,
2412 b + 3 + nbytes);
2413 b += 3;
2414 }
2415 else
2416 {
2417 /* Initialize lower bound of the 'succeed_n', even
2418 though it will be set during matching by its
2419 attendant 'set_number_at' (inserted next),
2420 because 're_compile_fastmap' needs to know.
2421 Jump to the 'jump_n' we might insert below. */
2422 INSERT_JUMP2 (succeed_n, laststart,
2423 b + 5 + nbytes,
2424 lower_bound);
2425 b += 5;
2426
2427 /* Code to initialize the lower bound. Insert
2428 before the 'succeed_n'. The '5' is the last two
2429 bytes of this 'set_number_at', plus 3 bytes of
2430 the following 'succeed_n'. */
2431 insert_op2 (set_number_at, laststart, 5,
2432 lower_bound, b);
2433 b += 5;
2434 startoffset += 5;
2435 }
2436
2437 if (upper_bound < 0)
2438 {
2439 /* A negative upper bound stands for infinity,
2440 in which case it degenerates to a plain jump. */
2441 STORE_JUMP (jump, b, laststart + startoffset);
2442 b += 3;
2443 }
2444 else if (upper_bound > 1)
2445 { /* More than one repetition is allowed, so
2446 append a backward jump to the 'succeed_n'
2447 that starts this interval.
2448
2449 When we've reached this during matching,
2450 we'll have matched the interval once, so
2451 jump back only 'upper_bound - 1' times. */
2452 STORE_JUMP2 (jump_n, b, laststart + startoffset,
2453 upper_bound - 1);
2454 b += 5;
2455
2456 /* The location we want to set is the second
2457 parameter of the 'jump_n'; that is 'b-2' as
2458 an absolute address. 'laststart' will be
2459 the 'set_number_at' we're about to insert;
2460 'laststart+3' the number to set, the source
2461 for the relative address. But we are
2462 inserting into the middle of the pattern --
2463 so everything is getting moved up by 5.
2464 Conclusion: (b - 2) - (laststart + 3) + 5,
2465 i.e., b - laststart.
2466
2467 Insert this at the beginning of the loop
2468 so that if we fail during matching, we'll
2469 reinitialize the bounds. */
2470 insert_op2 (set_number_at, laststart, b - laststart,
2471 upper_bound - 1, b);
2472 b += 5;
2473 }
2474 }
2475 pending_exact = 0;
2476 beg_interval = NULL;
2477 }
2478 break;
2479
2480 unfetch_interval:
2481 /* If an invalid interval, match the characters as literals. */
2482 eassert (beg_interval);
2483 p = beg_interval;
2484 beg_interval = NULL;
2485 eassert (p > pattern && p[-1] == '\\');
2486 c = '{';
2487 goto normal_char;
2488
2489 case '=':
2490 laststart = b;
2491 BUF_PUSH (at_dot);
2492 break;
2493
2494 case 's':
2495 laststart = b;
2496 PATFETCH (c);
2497 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2498 break;
2499
2500 case 'S':
2501 laststart = b;
2502 PATFETCH (c);
2503 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2504 break;
2505
2506 case 'c':
2507 laststart = b;
2508 PATFETCH (c);
2509 BUF_PUSH_2 (categoryspec, c);
2510 break;
2511
2512 case 'C':
2513 laststart = b;
2514 PATFETCH (c);
2515 BUF_PUSH_2 (notcategoryspec, c);
2516 break;
2517
2518 case 'w':
2519 laststart = b;
2520 BUF_PUSH_2 (syntaxspec, Sword);
2521 break;
2522
2523
2524 case 'W':
2525 laststart = b;
2526 BUF_PUSH_2 (notsyntaxspec, Sword);
2527 break;
2528
2529
2530 case '<':
2531 laststart = b;
2532 BUF_PUSH (wordbeg);
2533 break;
2534
2535 case '>':
2536 laststart = b;
2537 BUF_PUSH (wordend);
2538 break;
2539
2540 case '_':
2541 laststart = b;
2542 PATFETCH (c);
2543 if (c == '<')
2544 BUF_PUSH (symbeg);
2545 else if (c == '>')
2546 BUF_PUSH (symend);
2547 else
2548 FREE_STACK_RETURN (REG_BADPAT);
2549 break;
2550
2551 case 'b':
2552 laststart = b;
2553 BUF_PUSH (wordbound);
2554 break;
2555
2556 case 'B':
2557 laststart = b;
2558 BUF_PUSH (notwordbound);
2559 break;
2560
2561 case '`':
2562 /* Special case for compatibility: postfix ops after \` become
2563 literals, as for ^ (see above). */
2564 laststart = 0;
2565 BUF_PUSH (begbuf);
2566 break;
2567
2568 case '\'':
2569 laststart = b;
2570 BUF_PUSH (endbuf);
2571 break;
2572
2573 case '1': case '2': case '3': case '4': case '5':
2574 case '6': case '7': case '8': case '9':
2575 {
2576 regnum_t reg = c - '0';
2577
2578 if (reg > bufp->re_nsub || reg < 1
2579 /* Can't back reference to a subexp before its end. */
2580 || group_in_compile_stack (compile_stack, reg))
2581 FREE_STACK_RETURN (REG_ESUBREG);
2582
2583 laststart = b;
2584 BUF_PUSH_2 (duplicate, reg);
2585 }
2586 break;
2587
2588 default:
2589 /* You might think it would be useful for \ to mean
2590 not to translate; but if we don't translate it
2591 it will never match anything. */
2592 goto normal_char;
2593 }
2594 break;
2595
2596
2597 default:
2598 /* Expects the character in C. */
2599 normal_char:
2600 /* If no exactn currently being built. */
2601 if (!pending_exact
2602
2603 /* If last exactn not at current position. */
2604 || pending_exact + *pending_exact + 1 != b
2605
2606 /* Only one byte follows the exactn for the count. */
2607 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
2608
2609 /* If followed by a repetition operator. */
2610 || (p != pend
2611 && (*p == '*' || *p == '+' || *p == '?'))
2612 || (p + 1 < pend && p[0] == '\\' && p[1] == '{'))
2613 {
2614 /* Start building a new exactn. */
2615
2616 laststart = b;
2617
2618 BUF_PUSH_2 (exactn, 0);
2619 pending_exact = b - 1;
2620 }
2621
2622 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
2623 {
2624 int len;
2625
2626 if (multibyte)
2627 {
2628 c = TRANSLATE (c);
2629 len = CHAR_STRING (c, b);
2630 b += len;
2631 }
2632 else
2633 {
2634 c1 = RE_CHAR_TO_MULTIBYTE (c);
2635 if (! CHAR_BYTE8_P (c1))
2636 {
2637 int c2 = TRANSLATE (c1);
2638
2639 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
2640 c = c1;
2641 }
2642 *b++ = c;
2643 len = 1;
2644 }
2645 (*pending_exact) += len;
2646 }
2647
2648 break;
2649 } /* switch (c) */
2650 } /* while p != pend */
2651
2652
2653 /* Through the pattern now. */
2654
2655 FIXUP_ALT_JUMP ();
2656
2657 if (!COMPILE_STACK_EMPTY)
2658 FREE_STACK_RETURN (REG_EPAREN);
2659
2660 /* If we don't want backtracking, force success
2661 the first time we reach the end of the compiled pattern. */
2662 if (!posix_backtracking)
2663 BUF_PUSH (succeed);
2664
2665 /* Success; set the length of the buffer. */
2666 bufp->used = b - bufp->buffer;
2667
2668 #ifdef REGEX_EMACS_DEBUG
2669 if (regex_emacs_debug > 0)
2670 {
2671 re_compile_fastmap (bufp);
2672 DEBUG_PRINT ("\nCompiled pattern:\n");
2673 print_compiled_pattern (bufp);
2674 }
2675 regex_emacs_debug--;
2676 #endif
2677
2678 FREE_STACK_RETURN (REG_NOERROR);
2679
2680 } /* regex_compile */
2681
2682 /* Subroutines for 'regex_compile'. */
2683
2684 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2685
2686 static void
2687 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
2688 {
2689 *loc = (unsigned char) op;
2690 STORE_NUMBER (loc + 1, arg);
2691 }
2692
2693
2694 /* Like 'store_op1', but for two two-byte parameters ARG1 and ARG2. */
2695
2696 static void
2697 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
2698 {
2699 *loc = (unsigned char) op;
2700 STORE_NUMBER (loc + 1, arg1);
2701 STORE_NUMBER (loc + 3, arg2);
2702 }
2703
2704
2705 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2706 for OP followed by two-byte integer parameter ARG. */
2707
2708 static void
2709 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
2710 {
2711 register unsigned char *pfrom = end;
2712 register unsigned char *pto = end + 3;
2713
2714 while (pfrom != loc)
2715 *--pto = *--pfrom;
2716
2717 store_op1 (op, loc, arg);
2718 }
2719
2720
2721 /* Like 'insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2722
2723 static void
2724 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
2725 unsigned char *end)
2726 {
2727 register unsigned char *pfrom = end;
2728 register unsigned char *pto = end + 5;
2729
2730 while (pfrom != loc)
2731 *--pto = *--pfrom;
2732
2733 store_op2 (op, loc, arg1, arg2);
2734 }
2735
2736
2737 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2738 after an alternative or a begin-subexpression. Assume there is at
2739 least one character before the ^. */
2740
2741 static bool
2742 at_begline_loc_p (re_char *pattern, re_char *p)
2743 {
2744 re_char *prev = p - 2;
2745
2746 switch (*prev)
2747 {
2748 case '(': /* After a subexpression. */
2749 case '|': /* After an alternative. */
2750 break;
2751
2752 case ':': /* After a shy subexpression. */
2753 /* Skip over optional regnum. */
2754 while (prev > pattern && '0' <= prev[-1] && prev[-1] <= '9')
2755 --prev;
2756
2757 if (! (prev > pattern + 1 && prev[-1] == '?' && prev[-2] == '('))
2758 return false;
2759 prev -= 2;
2760 break;
2761
2762 default:
2763 return false;
2764 }
2765
2766 /* Count the number of preceding backslashes. */
2767 p = prev;
2768 while (prev > pattern && prev[-1] == '\\')
2769 --prev;
2770 return (p - prev) & 1;
2771 }
2772
2773
2774 /* The dual of at_begline_loc_p. This one is for $. Assume there is
2775 at least one character after the $, i.e., 'P < PEND'. */
2776
2777 static bool
2778 at_endline_loc_p (re_char *p, re_char *pend)
2779 {
2780 /* Before a subexpression or an alternative? */
2781 return *p == '\\' && p + 1 < pend && (p[1] == ')' || p[1] == '|');
2782 }
2783
2784
2785 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2786 false if it's not. */
2787
2788 static bool
2789 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
2790 {
2791 ptrdiff_t this_element;
2792
2793 for (this_element = compile_stack.avail - 1;
2794 this_element >= 0;
2795 this_element--)
2796 if (compile_stack.stack[this_element].regnum == regnum)
2797 return true;
2798
2799 return false;
2800 }
2801
2802 /* analyze_first.
2803 If fastmap is non-NULL, go through the pattern and fill fastmap
2804 with all the possible leading chars. If fastmap is NULL, don't
2805 bother filling it up (obviously) and only return whether the
2806 pattern could potentially match the empty string.
2807
2808 Return 1 if p..pend might match the empty string.
2809 Return 0 if p..pend matches at least one char.
2810 Return -1 if fastmap was not updated accurately. */
2811
2812 static int
2813 analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
2814 {
2815 int j, k;
2816 int nbits;
2817 bool not;
2818
2819 /* If all elements for base leading-codes in fastmap is set, this
2820 flag is set true. */
2821 bool match_any_multibyte_characters = false;
2822
2823 eassert (p);
2824
2825 /* The loop below works as follows:
2826 - It has a working-list kept in the PATTERN_STACK and which basically
2827 starts by only containing a pointer to the first operation.
2828 - If the opcode we're looking at is a match against some set of
2829 chars, then we add those chars to the fastmap and go on to the
2830 next work element from the worklist (done via 'break').
2831 - If the opcode is a control operator on the other hand, we either
2832 ignore it (if it's meaningless at this point, such as 'start_memory')
2833 or execute it (if it's a jump). If the jump has several destinations
2834 (i.e. 'on_failure_jump'), then we push the other destination onto the
2835 worklist.
2836 We guarantee termination by ignoring backward jumps (more or less),
2837 so that P is monotonically increasing. More to the point, we
2838 never set P (or push) anything '<= p1'. */
2839
2840 while (p < pend)
2841 {
2842 /* P1 is used as a marker of how far back a 'on_failure_jump'
2843 can go without being ignored. It is normally equal to P
2844 (which prevents any backward 'on_failure_jump') except right
2845 after a plain 'jump', to allow patterns such as:
2846 0: jump 10
2847 3..9: <body>
2848 10: on_failure_jump 3
2849 as used for the *? operator. */
2850 re_char *p1 = p;
2851
2852 switch (*p++)
2853 {
2854 case succeed:
2855 return 1;
2856
2857 case duplicate:
2858 /* If the first character has to match a backreference, that means
2859 that the group was empty (since it already matched). Since this
2860 is the only case that interests us here, we can assume that the
2861 backreference must match the empty string. */
2862 p++;
2863 continue;
2864
2865
2866 /* Following are the cases which match a character. These end
2867 with 'break'. */
2868
2869 case exactn:
2870 if (fastmap)
2871 {
2872 /* If multibyte is nonzero, the first byte of each
2873 character is an ASCII or a leading code. Otherwise,
2874 each byte is a character. Thus, this works in both
2875 cases. */
2876 fastmap[p[1]] = 1;
2877 if (multibyte)
2878 {
2879 /* Cover the case of matching a raw char in a
2880 multibyte regexp against unibyte. */
2881 if (CHAR_BYTE8_HEAD_P (p[1]))
2882 fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
2883 }
2884 else
2885 {
2886 /* For the case of matching this unibyte regex
2887 against multibyte, we must set a leading code of
2888 the corresponding multibyte character. */
2889 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
2890
2891 fastmap[CHAR_LEADING_CODE (c)] = 1;
2892 }
2893 }
2894 break;
2895
2896
2897 case anychar:
2898 /* We could put all the chars except for \n (and maybe \0)
2899 but we don't bother since it is generally not worth it. */
2900 if (!fastmap) break;
2901 return -1;
2902
2903
2904 case charset_not:
2905 if (!fastmap) break;
2906 {
2907 /* Chars beyond end of bitmap are possible matches. */
2908 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
2909 j < (1 << BYTEWIDTH); j++)
2910 fastmap[j] = 1;
2911 }
2912 FALLTHROUGH;
2913 case charset:
2914 if (!fastmap) break;
2915 not = (re_opcode_t) *(p - 1) == charset_not;
2916 nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
2917 p++;
2918 for (j = 0; j < nbits; j++)
2919 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
2920 fastmap[j] = 1;
2921
2922 /* To match raw bytes (in the 80..ff range) against multibyte
2923 strings, add their leading bytes to the fastmap. */
2924 for (j = 0x80; j < nbits; j++)
2925 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
2926 fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
2927
2928 if (/* Any leading code can possibly start a character
2929 which doesn't match the specified set of characters. */
2930 not
2931 ||
2932 /* If we can match a character class, we can match any
2933 multibyte characters. */
2934 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
2935 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
2936
2937 {
2938 if (match_any_multibyte_characters == false)
2939 {
2940 for (j = MIN_MULTIBYTE_LEADING_CODE;
2941 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
2942 fastmap[j] = 1;
2943 match_any_multibyte_characters = true;
2944 }
2945 }
2946
2947 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
2948 && match_any_multibyte_characters == false)
2949 {
2950 /* Set fastmap[I] to 1 where I is a leading code of each
2951 multibyte character in the range table. */
2952 int c, count;
2953 unsigned char lc1, lc2;
2954
2955 /* Make P points the range table. '+ 2' is to skip flag
2956 bits for a character class. */
2957 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
2958
2959 /* Extract the number of ranges in range table into COUNT. */
2960 EXTRACT_NUMBER_AND_INCR (count, p);
2961 for (; count > 0; count--, p += 3)
2962 {
2963 /* Extract the start and end of each range. */
2964 EXTRACT_CHARACTER (c, p);
2965 lc1 = CHAR_LEADING_CODE (c);
2966 p += 3;
2967 EXTRACT_CHARACTER (c, p);
2968 lc2 = CHAR_LEADING_CODE (c);
2969 for (j = lc1; j <= lc2; j++)
2970 fastmap[j] = 1;
2971 }
2972 }
2973 break;
2974
2975 case syntaxspec:
2976 case notsyntaxspec:
2977 if (!fastmap) break;
2978 /* This match depends on text properties. These end with
2979 aborting optimizations. */
2980 return -1;
2981
2982 case categoryspec:
2983 case notcategoryspec:
2984 if (!fastmap) break;
2985 not = (re_opcode_t)p[-1] == notcategoryspec;
2986 k = *p++;
2987 for (j = (1 << BYTEWIDTH); j >= 0; j--)
2988 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
2989 fastmap[j] = 1;
2990
2991 /* Any leading code can possibly start a character which
2992 has or doesn't has the specified category. */
2993 if (match_any_multibyte_characters == false)
2994 {
2995 for (j = MIN_MULTIBYTE_LEADING_CODE;
2996 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
2997 fastmap[j] = 1;
2998 match_any_multibyte_characters = true;
2999 }
3000 break;
3001
3002 /* All cases after this match the empty string. These end with
3003 'continue'. */
3004
3005 case at_dot:
3006 case no_op:
3007 case begline:
3008 case endline:
3009 case begbuf:
3010 case endbuf:
3011 case wordbound:
3012 case notwordbound:
3013 case wordbeg:
3014 case wordend:
3015 case symbeg:
3016 case symend:
3017 continue;
3018
3019
3020 case jump:
3021 EXTRACT_NUMBER_AND_INCR (j, p);
3022 if (j < 0)
3023 /* Backward jumps can only go back to code that we've already
3024 visited. 're_compile' should make sure this is true. */
3025 break;
3026 p += j;
3027 switch (*p)
3028 {
3029 case on_failure_jump:
3030 case on_failure_keep_string_jump:
3031 case on_failure_jump_loop:
3032 case on_failure_jump_nastyloop:
3033 case on_failure_jump_smart:
3034 p++;
3035 break;
3036 default:
3037 continue;
3038 };
3039 /* Keep P1 to allow the 'on_failure_jump' we are jumping to
3040 to jump back to "just after here". */
3041 FALLTHROUGH;
3042 case on_failure_jump:
3043 case on_failure_keep_string_jump:
3044 case on_failure_jump_nastyloop:
3045 case on_failure_jump_loop:
3046 case on_failure_jump_smart:
3047 EXTRACT_NUMBER_AND_INCR (j, p);
3048 if (p + j <= p1)
3049 ; /* Backward jump to be ignored. */
3050 else
3051 { /* We have to look down both arms.
3052 We first go down the "straight" path so as to minimize
3053 stack usage when going through alternatives. */
3054 int r = analyze_first (p, pend, fastmap, multibyte);
3055 if (r) return r;
3056 p += j;
3057 }
3058 continue;
3059
3060
3061 case jump_n:
3062 /* This code simply does not properly handle forward jump_n. */
3063 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); eassert (j < 0));
3064 p += 4;
3065 /* jump_n can either jump or fall through. The (backward) jump
3066 case has already been handled, so we only need to look at the
3067 fallthrough case. */
3068 continue;
3069
3070 case succeed_n:
3071 /* If N == 0, it should be an on_failure_jump_loop instead. */
3072 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j > 0));
3073 p += 4;
3074 /* We only care about one iteration of the loop, so we don't
3075 need to consider the case where this behaves like an
3076 on_failure_jump. */
3077 continue;
3078
3079
3080 case set_number_at:
3081 p += 4;
3082 continue;
3083
3084
3085 case start_memory:
3086 case stop_memory:
3087 p += 1;
3088 continue;
3089
3090
3091 default:
3092 abort (); /* We have listed all the cases. */
3093 } /* switch *p++ */
3094
3095 /* Getting here means we have found the possible starting
3096 characters for one path of the pattern -- and that the empty
3097 string does not match. We need not follow this path further. */
3098 return 0;
3099 } /* while p */
3100
3101 /* We reached the end without matching anything. */
3102 return 1;
3103
3104 } /* analyze_first */
3105
3106 /* Compute a fastmap for the compiled pattern in BUFP.
3107 A fastmap records which of the (1 << BYTEWIDTH) possible
3108 characters can start a string that matches the pattern. This fastmap
3109 is used by re_search to skip quickly over impossible starting points.
3110
3111 Character codes above (1 << BYTEWIDTH) are not represented in the
3112 fastmap, but the leading codes are represented. Thus, the fastmap
3113 indicates which character sets could start a match.
3114
3115 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3116 area as BUFP->fastmap.
3117
3118 Set the 'fastmap', 'fastmap_accurate', and 'can_be_null' fields in
3119 the pattern buffer. */
3120
3121 static void
3122 re_compile_fastmap (struct re_pattern_buffer *bufp)
3123 {
3124 char *fastmap = bufp->fastmap;
3125 int analysis;
3126
3127 eassert (fastmap && bufp->buffer);
3128
3129 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3130
3131 /* FIXME: Is the following assignment correct even when ANALYSIS < 0? */
3132 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3133
3134 analysis = analyze_first (bufp->buffer, bufp->buffer + bufp->used,
3135 fastmap, RE_MULTIBYTE_P (bufp));
3136 bufp->can_be_null = (analysis != 0);
3137 } /* re_compile_fastmap */
3138
3139 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3140 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3141 this memory for recording register information. STARTS and ENDS
3142 must be allocated using the malloc library routine, and must each
3143 be at least NUM_REGS * sizeof (ptrdiff_t) bytes long.
3144
3145 If NUM_REGS == 0, then subsequent matches should allocate their own
3146 register data.
3147
3148 Unless this function is called, the first search or match using
3149 PATTERN_BUFFER will allocate its own register data, without
3150 freeing the old data. */
3151
3152 void
3153 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
3154 ptrdiff_t num_regs, ptrdiff_t *starts, ptrdiff_t *ends)
3155 {
3156 if (num_regs)
3157 {
3158 bufp->regs_allocated = REGS_REALLOCATE;
3159 regs->num_regs = num_regs;
3160 regs->start = starts;
3161 regs->end = ends;
3162 }
3163 else
3164 {
3165 bufp->regs_allocated = REGS_UNALLOCATED;
3166 regs->num_regs = 0;
3167 regs->start = regs->end = 0;
3168 }
3169 }
3170
3171 /* Searching routines. */
3172
3173 /* Like re_search_2, below, but only one string is specified, and
3174 doesn't let you say where to stop matching. */
3175
3176 ptrdiff_t
3177 re_search (struct re_pattern_buffer *bufp, const char *string, ptrdiff_t size,
3178 ptrdiff_t startpos, ptrdiff_t range, struct re_registers *regs)
3179 {
3180 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3181 regs, size);
3182 }
3183
3184 /* Address of POS in the concatenation of virtual string. */
3185 #define POS_ADDR_VSTRING(POS) \
3186 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
3187
3188 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3189 virtual concatenation of STRING1 and STRING2, starting first at index
3190 STARTPOS, then at STARTPOS + 1, and so on.
3191
3192 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3193
3194 RANGE is how far to scan while trying to match. RANGE = 0 means try
3195 only at STARTPOS; in general, the last start tried is STARTPOS +
3196 RANGE.
3197
3198 In REGS, return the indices of the virtual concatenation of STRING1
3199 and STRING2 that matched the entire BUFP->buffer and its contained
3200 subexpressions.
3201
3202 Do not consider matching one past the index STOP in the virtual
3203 concatenation of STRING1 and STRING2.
3204
3205 Return either the position in the strings at which the match was
3206 found, -1 if no match, or -2 if error (such as failure
3207 stack overflow). */
3208
3209 ptrdiff_t
3210 re_search_2 (struct re_pattern_buffer *bufp, const char *str1, ptrdiff_t size1,
3211 const char *str2, ptrdiff_t size2,
3212 ptrdiff_t startpos, ptrdiff_t range,
3213 struct re_registers *regs, ptrdiff_t stop)
3214 {
3215 ptrdiff_t val;
3216 re_char *string1 = (re_char *) str1;
3217 re_char *string2 = (re_char *) str2;
3218 char *fastmap = bufp->fastmap;
3219 Lisp_Object translate = bufp->translate;
3220 ptrdiff_t total_size = size1 + size2;
3221 ptrdiff_t endpos = startpos + range;
3222 bool anchored_start;
3223 /* Nonzero if we are searching multibyte string. */
3224 bool multibyte = RE_TARGET_MULTIBYTE_P (bufp);
3225
3226 /* Check for out-of-range STARTPOS. */
3227 if (startpos < 0 || startpos > total_size)
3228 return -1;
3229
3230 /* Fix up RANGE if it might eventually take us outside
3231 the virtual concatenation of STRING1 and STRING2.
3232 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3233 if (endpos < 0)
3234 range = 0 - startpos;
3235 else if (endpos > total_size)
3236 range = total_size - startpos;
3237
3238 /* If the search isn't to be a backwards one, don't waste time in a
3239 search for a pattern anchored at beginning of buffer. */
3240 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3241 {
3242 if (startpos > 0)
3243 return -1;
3244 else
3245 range = 0;
3246 }
3247
3248 /* In a forward search for something that starts with \=.
3249 don't keep searching past point. */
3250 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3251 {
3252 range = PT_BYTE - BEGV_BYTE - startpos;
3253 if (range < 0)
3254 return -1;
3255 }
3256
3257 /* Update the fastmap now if not correct already. */
3258 if (fastmap && !bufp->fastmap_accurate)
3259 re_compile_fastmap (bufp);
3260
3261 /* See whether the pattern is anchored. */
3262 anchored_start = (bufp->buffer[0] == begline);
3263
3264 RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, startpos);
3265
3266 /* Loop through the string, looking for a place to start matching. */
3267 for (;;)
3268 {
3269 /* If the pattern is anchored,
3270 skip quickly past places we cannot match.
3271 Don't bother to treat startpos == 0 specially
3272 because that case doesn't repeat. */
3273 if (anchored_start && startpos > 0)
3274 {
3275 if (! ((startpos <= size1 ? string1[startpos - 1]
3276 : string2[startpos - size1 - 1])
3277 == '\n'))
3278 goto advance;
3279 }
3280
3281 /* If a fastmap is supplied, skip quickly over characters that
3282 cannot be the start of a match. If the pattern can match the
3283 null string, however, we don't need to skip characters; we want
3284 the first null string. */
3285 if (fastmap && startpos < total_size && !bufp->can_be_null)
3286 {
3287 re_char *d;
3288 int buf_ch;
3289
3290 d = POS_ADDR_VSTRING (startpos);
3291
3292 if (range > 0) /* Searching forwards. */
3293 {
3294 ptrdiff_t irange = range, lim = 0;
3295
3296 if (startpos < size1 && startpos + range >= size1)
3297 lim = range - (size1 - startpos);
3298
3299 /* Written out as an if-else to avoid testing 'translate'
3300 inside the loop. */
3301 if (!NILP (translate))
3302 {
3303 if (multibyte)
3304 while (range > lim)
3305 {
3306 int buf_charlen;
3307
3308 buf_ch = string_char_and_length (d, &buf_charlen);
3309 buf_ch = RE_TRANSLATE (translate, buf_ch);
3310 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
3311 break;
3312
3313 range -= buf_charlen;
3314 d += buf_charlen;
3315 }
3316 else
3317 while (range > lim)
3318 {
3319 buf_ch = *d;
3320 int ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
3321 int translated = RE_TRANSLATE (translate, ch);
3322 if (translated != ch
3323 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
3324 buf_ch = ch;
3325 if (fastmap[buf_ch])
3326 break;
3327 d++;
3328 range--;
3329 }
3330 }
3331 else
3332 {
3333 if (multibyte)
3334 while (range > lim)
3335 {
3336 int buf_charlen;
3337
3338 buf_ch = string_char_and_length (d, &buf_charlen);
3339 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
3340 break;
3341 range -= buf_charlen;
3342 d += buf_charlen;
3343 }
3344 else
3345 while (range > lim && !fastmap[*d])
3346 {
3347 d++;
3348 range--;
3349 }
3350 }
3351 startpos += irange - range;
3352 }
3353 else /* Searching backwards. */
3354 {
3355 if (multibyte)
3356 {
3357 buf_ch = STRING_CHAR (d);
3358 buf_ch = TRANSLATE (buf_ch);
3359 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
3360 goto advance;
3361 }
3362 else
3363 {
3364 buf_ch = *d;
3365 int ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
3366 int translated = TRANSLATE (ch);
3367 if (translated != ch
3368 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
3369 buf_ch = ch;
3370 if (! fastmap[TRANSLATE (buf_ch)])
3371 goto advance;
3372 }
3373 }
3374 }
3375
3376 /* If can't match the null string, and that's all we have left, fail. */
3377 if (range >= 0 && startpos == total_size && fastmap
3378 && !bufp->can_be_null)
3379 return -1;
3380
3381 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3382 startpos, regs, stop);
3383
3384 if (val >= 0)
3385 return startpos;
3386
3387 if (val == -2)
3388 return -2;
3389
3390 advance:
3391 if (!range)
3392 break;
3393 else if (range > 0)
3394 {
3395 /* Update STARTPOS to the next character boundary. */
3396 if (multibyte)
3397 {
3398 re_char *p = POS_ADDR_VSTRING (startpos);
3399 int len = BYTES_BY_CHAR_HEAD (*p);
3400
3401 range -= len;
3402 if (range < 0)
3403 break;
3404 startpos += len;
3405 }
3406 else
3407 {
3408 range--;
3409 startpos++;
3410 }
3411 }
3412 else
3413 {
3414 range++;
3415 startpos--;
3416
3417 /* Update STARTPOS to the previous character boundary. */
3418 if (multibyte)
3419 {
3420 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
3421 int len = raw_prev_char_len (p);
3422
3423 range += len - 1;
3424 if (range > 0)
3425 break;
3426 startpos -= len - 1;
3427 }
3428 }
3429 }
3430 return -1;
3431 } /* re_search_2 */
3432
3433 /* Declarations and macros for re_match_2. */
3434
3435 static bool bcmp_translate (re_char *, re_char *, ptrdiff_t,
3436 Lisp_Object, bool);
3437
3438 /* This converts PTR, a pointer into one of the search strings 'string1'
3439 and 'string2' into an offset from the beginning of that string. */
3440 #define POINTER_TO_OFFSET(ptr) \
3441 (FIRST_STRING_P (ptr) \
3442 ? (ptr) - string1 \
3443 : (ptr) - string2 + (ptrdiff_t) size1)
3444
3445 /* Call before fetching a character with *d. This switches over to
3446 string2 if necessary.
3447 `reset' is executed before backtracking if there are no more characters.
3448 Check re_match_2_internal for a discussion of why end_match_2 might
3449 not be within string2 (but be equal to end_match_1 instead). */
3450 #define PREFETCH(reset) \
3451 while (d == dend) \
3452 { \
3453 /* End of string2 => fail. */ \
3454 if (dend == end_match_2) \
3455 { \
3456 reset; \
3457 goto fail; \
3458 } \
3459 /* End of string1 => advance to string2. */ \
3460 d = string2; \
3461 dend = end_match_2; \
3462 }
3463
3464 /* Call before fetching a char with *d if you already checked other limits.
3465 This is meant for use in lookahead operations like wordend, etc..
3466 where we might need to look at parts of the string that might be
3467 outside of the LIMITs (i.e past 'stop'). */
3468 #define PREFETCH_NOLIMIT() \
3469 if (d == end1) \
3470 { \
3471 d = string2; \
3472 dend = end_match_2; \
3473 }
3474
3475 /* Test if at very beginning or at very end of the virtual concatenation
3476 of STRING1 and STRING2. If only one string, it's STRING2. */
3477 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3478 #define AT_STRINGS_END(d) ((d) == end2)
3479
3480 /* Disabled due to a compiler bug -- see comment at case wordbound */
3481
3482 /* The comment at case wordbound is following one, but we don't use
3483 AT_WORD_BOUNDARY anymore to support multibyte form.
3484
3485 The DEC Alpha C compiler 3.x generates incorrect code for the
3486 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
3487 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
3488 macro and introducing temporary variables works around the bug. */
3489
3490 #if 0
3491 /* Test if D points to a character which is word-constituent. We have
3492 two special cases to check for: if past the end of string1, look at
3493 the first character in string2; and if before the beginning of
3494 string2, look at the last character in string1. */
3495 #define WORDCHAR_P(d) \
3496 (SYNTAX ((d) == end1 ? *string2 \
3497 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3498 == Sword)
3499
3500 /* Test if the character before D and the one at D differ with respect
3501 to being word-constituent. */
3502 #define AT_WORD_BOUNDARY(d) \
3503 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3504 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3505 #endif
3506
3507
3508 /* Optimization routines. */
3509
3510 /* If the operation is a match against one or more chars,
3511 return a pointer to the next operation, else return NULL. */
3512 static re_char *
3513 skip_one_char (re_char *p)
3514 {
3515 switch (*p++)
3516 {
3517 case anychar:
3518 break;
3519
3520 case exactn:
3521 p += *p + 1;
3522 break;
3523
3524 case charset_not:
3525 case charset:
3526 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
3527 {
3528 int mcnt;
3529 p = CHARSET_RANGE_TABLE (p - 1);
3530 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3531 p = CHARSET_RANGE_TABLE_END (p, mcnt);
3532 }
3533 else
3534 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
3535 break;
3536
3537 case syntaxspec:
3538 case notsyntaxspec:
3539 case categoryspec:
3540 case notcategoryspec:
3541 p++;
3542 break;
3543
3544 default:
3545 p = NULL;
3546 }
3547 return p;
3548 }
3549
3550
3551 /* Jump over non-matching operations. */
3552 static re_char *
3553 skip_noops (re_char *p, re_char *pend)
3554 {
3555 int mcnt;
3556 while (p < pend)
3557 {
3558 switch (*p)
3559 {
3560 case start_memory:
3561 case stop_memory:
3562 p += 2; break;
3563 case no_op:
3564 p += 1; break;
3565 case jump:
3566 p += 1;
3567 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3568 p += mcnt;
3569 break;
3570 default:
3571 return p;
3572 }
3573 }
3574 eassert (p == pend);
3575 return p;
3576 }
3577
3578 /* Test if C matches charset op. *PP points to the charset or charset_not
3579 opcode. When the function finishes, *PP will be advanced past that opcode.
3580 C is character to test (possibly after translations) and CORIG is original
3581 character (i.e. without any translations). UNIBYTE denotes whether c is
3582 unibyte or multibyte character.
3583 CANON_TABLE is the canonicalisation table for case folding or Qnil. */
3584 static bool
3585 execute_charset (re_char **pp, int c, int corig, bool unibyte,
3586 Lisp_Object canon_table)
3587 {
3588 eassume (0 <= c && 0 <= corig);
3589 re_char *p = *pp, *rtp = NULL;
3590 bool not = (re_opcode_t) *p == charset_not;
3591
3592 if (CHARSET_RANGE_TABLE_EXISTS_P (p))
3593 {
3594 int count;
3595 rtp = CHARSET_RANGE_TABLE (p);
3596 EXTRACT_NUMBER_AND_INCR (count, rtp);
3597 *pp = CHARSET_RANGE_TABLE_END ((rtp), (count));
3598 }
3599 else
3600 *pp += 2 + CHARSET_BITMAP_SIZE (p);
3601
3602 if (unibyte && c < (1 << BYTEWIDTH))
3603 { /* Lookup bitmap. */
3604 /* Cast to 'unsigned' instead of 'unsigned char' in
3605 case the bit list is a full 32 bytes long. */
3606 if (c < (unsigned) (CHARSET_BITMAP_SIZE (p) * BYTEWIDTH)
3607 && p[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3608 return !not;
3609 }
3610 else if (rtp)
3611 {
3612 int class_bits = CHARSET_RANGE_TABLE_BITS (p);
3613 int range_start, range_end;
3614
3615 /* Sort tests by the most commonly used classes with some adjustment to which
3616 tests are easiest to perform. Take a look at comment in re_wctype_parse
3617 for table with frequencies of character class names. */
3618
3619 if ((class_bits & BIT_MULTIBYTE) ||
3620 (class_bits & BIT_ALNUM && ISALNUM (c)) ||
3621 (class_bits & BIT_ALPHA && ISALPHA (c)) ||
3622 (class_bits & BIT_SPACE && ISSPACE (c)) ||
3623 (class_bits & BIT_BLANK && ISBLANK (c)) ||
3624 (class_bits & BIT_WORD && ISWORD (c)) ||
3625 ((class_bits & BIT_UPPER) &&
3626 (ISUPPER (corig) || (!NILP (canon_table) && ISLOWER (corig)))) ||
3627 ((class_bits & BIT_LOWER) &&
3628 (ISLOWER (corig) || (!NILP (canon_table) && ISUPPER (corig)))) ||
3629 (class_bits & BIT_PUNCT && ISPUNCT (c)) ||
3630 (class_bits & BIT_GRAPH && ISGRAPH (c)) ||
3631 (class_bits & BIT_PRINT && ISPRINT (c)))
3632 return !not;
3633
3634 for (p = *pp; rtp < p; rtp += 2 * 3)
3635 {
3636 EXTRACT_CHARACTER (range_start, rtp);
3637 EXTRACT_CHARACTER (range_end, rtp + 3);
3638 if (range_start <= c && c <= range_end)
3639 return !not;
3640 }
3641 }
3642
3643 return not;
3644 }
3645
3646 /* True if "p1 matches something" implies "p2 fails". */
3647 static bool
3648 mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
3649 re_char *p2)
3650 {
3651 re_opcode_t op2;
3652 bool multibyte = RE_MULTIBYTE_P (bufp);
3653 unsigned char *pend = bufp->buffer + bufp->used;
3654 re_char *p2_orig = p2;
3655
3656 eassert (p1 >= bufp->buffer && p1 < pend
3657 && p2 >= bufp->buffer && p2 <= pend);
3658
3659 /* Skip over open/close-group commands.
3660 If what follows this loop is a ...+ construct,
3661 look at what begins its body, since we will have to
3662 match at least one of that. */
3663 p2 = skip_noops (p2, pend);
3664 /* The same skip can be done for p1, except that this function
3665 is only used in the case where p1 is a simple match operator. */
3666 /* p1 = skip_noops (p1, pend); */
3667
3668 eassert (p1 >= bufp->buffer && p1 < pend
3669 && p2 >= bufp->buffer && p2 <= pend);
3670
3671 op2 = p2 == pend ? succeed : *p2;
3672
3673 switch (op2)
3674 {
3675 case succeed:
3676 case endbuf:
3677 /* If we're at the end of the pattern, we can change. */
3678 if (skip_one_char (p1))
3679 {
3680 DEBUG_PRINT (" End of pattern: fast loop.\n");
3681 return true;
3682 }
3683 break;
3684
3685 case endline:
3686 case exactn:
3687 {
3688 int c
3689 = (re_opcode_t) *p2 == endline ? '\n'
3690 : RE_STRING_CHAR (p2 + 2, multibyte);
3691
3692 if ((re_opcode_t) *p1 == exactn)
3693 {
3694 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
3695 {
3696 DEBUG_PRINT (" '%c' != '%c' => fast loop.\n", c, p1[2]);
3697 return true;
3698 }
3699 }
3700
3701 else if ((re_opcode_t) *p1 == charset
3702 || (re_opcode_t) *p1 == charset_not)
3703 {
3704 if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
3705 Qnil))
3706 {
3707 DEBUG_PRINT (" No match => fast loop.\n");
3708 return true;
3709 }
3710 }
3711 else if ((re_opcode_t) *p1 == anychar
3712 && c == '\n')
3713 {
3714 DEBUG_PRINT (" . != \\n => fast loop.\n");
3715 return true;
3716 }
3717 }
3718 break;
3719
3720 case charset:
3721 {
3722 if ((re_opcode_t) *p1 == exactn)
3723 /* Reuse the code above. */
3724 return mutually_exclusive_p (bufp, p2, p1);
3725
3726 /* It is hard to list up all the character in charset
3727 P2 if it includes multibyte character. Give up in
3728 such case. */
3729 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
3730 {
3731 /* Now, we are sure that P2 has no range table.
3732 So, for the size of bitmap in P2, 'p2[1]' is
3733 enough. But P1 may have range table, so the
3734 size of bitmap table of P1 is extracted by
3735 using macro 'CHARSET_BITMAP_SIZE'.
3736
3737 In a multibyte case, we know that all the character
3738 listed in P2 is ASCII. In a unibyte case, P1 has only a
3739 bitmap table. So, in both cases, it is enough to test
3740 only the bitmap table of P1. */
3741
3742 if ((re_opcode_t) *p1 == charset)
3743 {
3744 int idx;
3745 /* We win if the charset inside the loop
3746 has no overlap with the one after the loop. */
3747 for (idx = 0;
3748 (idx < (int) p2[1]
3749 && idx < CHARSET_BITMAP_SIZE (p1));
3750 idx++)
3751 if ((p2[2 + idx] & p1[2 + idx]) != 0)
3752 break;
3753
3754 if (idx == p2[1]
3755 || idx == CHARSET_BITMAP_SIZE (p1))
3756 {
3757 DEBUG_PRINT (" No match => fast loop.\n");
3758 return true;
3759 }
3760 }
3761 else if ((re_opcode_t) *p1 == charset_not)
3762 {
3763 int idx;
3764 /* We win if the charset_not inside the loop lists
3765 every character listed in the charset after. */
3766 for (idx = 0; idx < (int) p2[1]; idx++)
3767 if (! (p2[2 + idx] == 0
3768 || (idx < CHARSET_BITMAP_SIZE (p1)
3769 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
3770 break;
3771
3772 if (idx == p2[1])
3773 {
3774 DEBUG_PRINT (" No match => fast loop.\n");
3775 return true;
3776 }
3777 }
3778 }
3779 }
3780 break;
3781
3782 case charset_not:
3783 switch (*p1)
3784 {
3785 case exactn:
3786 case charset:
3787 /* Reuse the code above. */
3788 return mutually_exclusive_p (bufp, p2, p1);
3789 case charset_not:
3790 /* When we have two charset_not, it's very unlikely that
3791 they don't overlap. The union of the two sets of excluded
3792 chars should cover all possible chars, which, as a matter of
3793 fact, is virtually impossible in multibyte buffers. */
3794 break;
3795 }
3796 break;
3797
3798 case wordend:
3799 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
3800 case symend:
3801 return ((re_opcode_t) *p1 == syntaxspec
3802 && (p1[1] == Ssymbol || p1[1] == Sword));
3803 case notsyntaxspec:
3804 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
3805
3806 case wordbeg:
3807 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
3808 case symbeg:
3809 return ((re_opcode_t) *p1 == notsyntaxspec
3810 && (p1[1] == Ssymbol || p1[1] == Sword));
3811 case syntaxspec:
3812 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
3813
3814 case wordbound:
3815 return (((re_opcode_t) *p1 == notsyntaxspec
3816 || (re_opcode_t) *p1 == syntaxspec)
3817 && p1[1] == Sword);
3818
3819 case categoryspec:
3820 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
3821 case notcategoryspec:
3822 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
3823
3824 case on_failure_jump_nastyloop:
3825 case on_failure_jump_smart:
3826 case on_failure_jump_loop:
3827 case on_failure_keep_string_jump:
3828 case on_failure_jump:
3829 {
3830 int mcnt;
3831 p2++;
3832 EXTRACT_NUMBER_AND_INCR (mcnt, p2);
3833 /* Don't just test `mcnt > 0` because non-greedy loops have
3834 their test at the end with an unconditional jump at the start. */
3835 if (p2 + mcnt > p2_orig) /* Ensure forward progress. */
3836 return (mutually_exclusive_p (bufp, p1, p2)
3837 && mutually_exclusive_p (bufp, p1, p2 + mcnt));
3838 break;
3839 }
3840
3841 default:
3842 ;
3843 }
3844
3845 /* Safe default. */
3846 return false;
3847 }
3848
3849
3850 /* Matching routines. */
3851
3852 /* re_match_2 matches the compiled pattern in BUFP against the
3853 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3854 and SIZE2, respectively). We start matching at POS, and stop
3855 matching at STOP.
3856
3857 If REGS is non-null, store offsets for the substring each group
3858 matched in REGS.
3859
3860 We return -1 if no match, -2 if an internal error (such as the
3861 failure stack overflowing). Otherwise, we return the length of the
3862 matched substring. */
3863
3864 ptrdiff_t
3865 re_match_2 (struct re_pattern_buffer *bufp,
3866 char const *string1, ptrdiff_t size1,
3867 char const *string2, ptrdiff_t size2,
3868 ptrdiff_t pos, struct re_registers *regs, ptrdiff_t stop)
3869 {
3870 ptrdiff_t result;
3871
3872 RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, pos);
3873
3874 result = re_match_2_internal (bufp, (re_char *) string1, size1,
3875 (re_char *) string2, size2,
3876 pos, regs, stop);
3877 return result;
3878 }
3879
3880 static void
3881 unwind_re_match (void *ptr)
3882 {
3883 struct buffer *b = (struct buffer *) ptr;
3884 b->text->inhibit_shrinking = 0;
3885 }
3886
3887 /* This is a separate function so that we can force an alloca cleanup
3888 afterwards. */
3889 static ptrdiff_t
3890 re_match_2_internal (struct re_pattern_buffer *bufp,
3891 re_char *string1, ptrdiff_t size1,
3892 re_char *string2, ptrdiff_t size2,
3893 ptrdiff_t pos, struct re_registers *regs, ptrdiff_t stop)
3894 {
3895 eassume (0 <= size1);
3896 eassume (0 <= size2);
3897 eassume (0 <= pos && pos <= stop && stop <= size1 + size2);
3898
3899 /* General temporaries. */
3900 int mcnt;
3901
3902 /* Just past the end of the corresponding string. */
3903 re_char *end1, *end2;
3904
3905 /* Pointers into string1 and string2, just past the last characters in
3906 each to consider matching. */
3907 re_char *end_match_1, *end_match_2;
3908
3909 /* Where we are in the data, and the end of the current string. */
3910 re_char *d, *dend;
3911
3912 /* Used sometimes to remember where we were before starting matching
3913 an operator so that we can go back in case of failure. This "atomic"
3914 behavior of matching opcodes is indispensable to the correctness
3915 of the on_failure_keep_string_jump optimization. */
3916 re_char *dfail;
3917
3918 /* Where we are in the pattern, and the end of the pattern. */
3919 re_char *p = bufp->buffer;
3920 re_char *pend = p + bufp->used;
3921
3922 /* We use this to map every character in the string. */
3923 Lisp_Object translate = bufp->translate;
3924
3925 /* True if BUFP is setup from a multibyte regex. */
3926 bool multibyte = RE_MULTIBYTE_P (bufp);
3927
3928 /* True if STRING1/STRING2 are multibyte. */
3929 bool target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
3930
3931 /* Failure point stack. Each place that can handle a failure further
3932 down the line pushes a failure point on this stack. It consists of
3933 regstart, and regend for all registers corresponding to
3934 the subexpressions we're currently inside, plus the number of such
3935 registers, and, finally, two char *'s. The first char * is where
3936 to resume scanning the pattern; the second one is where to resume
3937 scanning the strings. */
3938 fail_stack_type fail_stack;
3939 #ifdef DEBUG_COMPILES_ARGUMENTS
3940 ptrdiff_t nfailure_points_pushed = 0, nfailure_points_popped = 0;
3941 #endif
3942
3943 /* We fill all the registers internally, independent of what we
3944 return, for use in backreferences. The number here includes
3945 an element for register zero. */
3946 ptrdiff_t num_regs = bufp->re_nsub + 1;
3947 eassume (0 < num_regs);
3948
3949 /* Information on the contents of registers. These are pointers into
3950 the input strings; they record just what was matched (on this
3951 attempt) by a subexpression part of the pattern, that is, the
3952 regnum-th regstart pointer points to where in the pattern we began
3953 matching and the regnum-th regend points to right after where we
3954 stopped matching the regnum-th subexpression. */
3955 re_char **regstart UNINIT, **regend UNINIT;
3956
3957 /* The following record the register info as found in the above
3958 variables when we find a match better than any we've seen before.
3959 This happens as we backtrack through the failure points, which in
3960 turn happens only if we have not yet matched the entire string. */
3961 bool best_regs_set = false;
3962 re_char **best_regstart UNINIT, **best_regend UNINIT;
3963
3964 /* Logically, this is 'best_regend[0]'. But we don't want to have to
3965 allocate space for that if we're not allocating space for anything
3966 else (see below). Also, we never need info about register 0 for
3967 any of the other register vectors, and it seems rather a kludge to
3968 treat 'best_regend' differently from the rest. So we keep track of
3969 the end of the best match so far in a separate variable. We
3970 initialize this to NULL so that when we backtrack the first time
3971 and need to test it, it's not garbage. */
3972 re_char *match_end = NULL;
3973
3974 /* This keeps track of how many buffer/string positions we examined. */
3975 ptrdiff_t nchars = 0;
3976
3977 #ifdef DEBUG_COMPILES_ARGUMENTS
3978 /* Counts the total number of registers pushed. */
3979 ptrdiff_t num_regs_pushed = 0;
3980 #endif
3981
3982 DEBUG_PRINT ("\nEntering re_match_2.\n");
3983
3984 REGEX_USE_SAFE_ALLOCA;
3985
3986 INIT_FAIL_STACK ();
3987
3988 specpdl_ref count = SPECPDL_INDEX ();
3989
3990 /* Prevent shrinking and relocation of buffer text if GC happens
3991 while we are inside this function. The calls to
3992 UPDATE_SYNTAX_TABLE_* macros can call Lisp (via
3993 `internal--syntax-propertize`); these calls are careful to defend against
3994 buffer modifications, but even with no modifications, the buffer text may
3995 be relocated during GC by `compact_buffer` which would invalidate
3996 our C pointers to buffer text. */
3997 if (!current_buffer->text->inhibit_shrinking)
3998 {
3999 record_unwind_protect_ptr (unwind_re_match, current_buffer);
4000 current_buffer->text->inhibit_shrinking = 1;
4001 }
4002
4003 /* Do not bother to initialize all the register variables if there are
4004 no groups in the pattern, as it takes a fair amount of time. If
4005 there are groups, we include space for register 0 (the whole
4006 pattern) in REGSTART[0], even though we never use it, to avoid
4007 the undefined behavior of subtracting 1 from REGSTART. */
4008 ptrdiff_t re_nsub = num_regs - 1;
4009 if (0 < re_nsub)
4010 {
4011 regstart = SAFE_ALLOCA ((re_nsub * 4 + 1) * sizeof *regstart);
4012 regend = regstart + num_regs;
4013 best_regstart = regend + re_nsub;
4014 best_regend = best_regstart + re_nsub;
4015
4016 /* Initialize subexpression text positions to unset, to mark ones
4017 that no start_memory/stop_memory has been seen for. */
4018 for (re_char **apos = regstart + 1; apos < best_regstart + 1; apos++)
4019 *apos = NULL;
4020 }
4021
4022 /* We move 'string1' into 'string2' if the latter's empty -- but not if
4023 'string1' is null. */
4024 if (size2 == 0 && string1 != NULL)
4025 {
4026 string2 = string1;
4027 size2 = size1;
4028 string1 = 0;
4029 size1 = 0;
4030 }
4031 end1 = string1 + size1;
4032 end2 = string2 + size2;
4033
4034 /* P scans through the pattern as D scans through the data.
4035 DEND is the end of the input string that D points within.
4036 Advance D into the following input string whenever necessary, but
4037 this happens before fetching; therefore, at the beginning of the
4038 loop, D can be pointing at the end of a string, but it cannot
4039 equal STRING2. */
4040 if (pos >= size1)
4041 {
4042 /* Only match within string2. */
4043 d = string2 + pos - size1;
4044 dend = end_match_2 = string2 + stop - size1;
4045 end_match_1 = end1; /* Just to give it a value. */
4046 }
4047 else
4048 {
4049 if (stop < size1)
4050 {
4051 /* Only match within string1. */
4052 end_match_1 = string1 + stop;
4053 /* BEWARE!
4054 When we reach end_match_1, PREFETCH normally switches to string2.
4055 But in the present case, this means that just doing a PREFETCH
4056 makes us jump from 'stop' to 'gap' within the string.
4057 What we really want here is for the search to stop as
4058 soon as we hit end_match_1. That's why we set end_match_2
4059 to end_match_1 (since PREFETCH fails as soon as we hit
4060 end_match_2). */
4061 end_match_2 = end_match_1;
4062 }
4063 else
4064 { /* It's important to use this code when STOP == SIZE so that
4065 moving D from end1 to string2 will not prevent the D == DEND
4066 check from catching the end of string. */
4067 end_match_1 = end1;
4068 end_match_2 = string2 + stop - size1;
4069 }
4070 d = string1 + pos;
4071 dend = end_match_1;
4072 }
4073
4074 DEBUG_PRINT ("The compiled pattern is:\n");
4075 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4076 DEBUG_PRINT ("The string to match is: \"");
4077 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4078 DEBUG_PRINT ("\"\n");
4079
4080 /* This loops over pattern commands. It exits by returning from the
4081 function if the match is complete, or it drops through if the match
4082 fails at this starting point in the input data. */
4083 for (;;)
4084 {
4085 DEBUG_PRINT ("\n%p: ", p);
4086
4087 if (p == pend)
4088 {
4089 /* End of pattern means we might have succeeded. */
4090 DEBUG_PRINT ("end of pattern ... ");
4091
4092 /* If we haven't matched the entire string, and we want the
4093 longest match, try backtracking. */
4094 if (d != end_match_2)
4095 {
4096 /* True if this match is the best seen so far. */
4097 bool best_match_p;
4098
4099 {
4100 /* True if this match ends in the same string (string1
4101 or string2) as the best previous match. */
4102 bool same_str_p = (FIRST_STRING_P (match_end)
4103 == FIRST_STRING_P (d));
4104
4105 /* AIX compiler got confused when this was combined
4106 with the previous declaration. */
4107 if (same_str_p)
4108 best_match_p = d > match_end;
4109 else
4110 best_match_p = !FIRST_STRING_P (d);
4111 }
4112
4113 DEBUG_PRINT ("backtracking.\n");
4114
4115 if (!FAIL_STACK_EMPTY ())
4116 { /* More failure points to try. */
4117
4118 /* If exceeds best match so far, save it. */
4119 if (!best_regs_set || best_match_p)
4120 {
4121 best_regs_set = true;
4122 match_end = d;
4123
4124 DEBUG_PRINT ("\nSAVING match as best so far.\n");
4125
4126 for (ptrdiff_t reg = 1; reg < num_regs; reg++)
4127 {
4128 best_regstart[reg] = regstart[reg];
4129 best_regend[reg] = regend[reg];
4130 }
4131 }
4132 goto fail;
4133 }
4134
4135 /* If no failure points, don't restore garbage. And if
4136 last match is real best match, don't restore second
4137 best one. */
4138 else if (best_regs_set && !best_match_p)
4139 {
4140 restore_best_regs:
4141 /* Restore best match. It may happen that 'dend ==
4142 end_match_1' while the restored d is in string2.
4143 For example, the pattern 'x.*y.*z' against the
4144 strings 'x-' and 'y-z-', if the two strings are
4145 not consecutive in memory. */
4146 DEBUG_PRINT ("Restoring best registers.\n");
4147
4148 d = match_end;
4149 dend = ((d >= string1 && d <= end1)
4150 ? end_match_1 : end_match_2);
4151
4152 for (ptrdiff_t reg = 1; reg < num_regs; reg++)
4153 {
4154 regstart[reg] = best_regstart[reg];
4155 regend[reg] = best_regend[reg];
4156 eassert (REG_UNSET (regstart[reg])
4157 <= REG_UNSET (regend[reg]));
4158 }
4159 }
4160 } /* d != end_match_2 */
4161
4162 succeed_label:
4163 DEBUG_PRINT ("Accepting match.\n");
4164
4165 /* If caller wants register contents data back, do it. */
4166 if (regs)
4167 {
4168 /* Have the register data arrays been allocated? */
4169 if (bufp->regs_allocated == REGS_UNALLOCATED)
4170 { /* No. So allocate them with malloc. */
4171 ptrdiff_t n = max (RE_NREGS, num_regs);
4172 regs->start = xnmalloc (n, sizeof *regs->start);
4173 regs->end = xnmalloc (n, sizeof *regs->end);
4174 regs->num_regs = n;
4175 bufp->regs_allocated = REGS_REALLOCATE;
4176 }
4177 else if (bufp->regs_allocated == REGS_REALLOCATE)
4178 { /* Yes. If we need more elements than were already
4179 allocated, reallocate them. If we need fewer, just
4180 leave it alone. */
4181 ptrdiff_t n = regs->num_regs;
4182 if (n < num_regs)
4183 {
4184 n = max (n + (n >> 1), num_regs);
4185 regs->start
4186 = xnrealloc (regs->start, n, sizeof *regs->start);
4187 regs->end = xnrealloc (regs->end, n, sizeof *regs->end);
4188 regs->num_regs = n;
4189 }
4190 }
4191 else
4192 eassert (bufp->regs_allocated == REGS_FIXED);
4193
4194 /* Convert the pointer data in 'regstart' and 'regend' to
4195 indices. Register zero has to be set differently,
4196 since we haven't kept track of any info for it. */
4197 if (regs->num_regs > 0)
4198 {
4199 regs->start[0] = pos;
4200 regs->end[0] = POINTER_TO_OFFSET (d);
4201 }
4202
4203 for (ptrdiff_t reg = 1; reg < num_regs; reg++)
4204 {
4205 eassert (REG_UNSET (regstart[reg])
4206 <= REG_UNSET (regend[reg]));
4207 if (REG_UNSET (regend[reg]))
4208 regs->start[reg] = regs->end[reg] = -1;
4209 else
4210 {
4211 regs->start[reg] = POINTER_TO_OFFSET (regstart[reg]);
4212 regs->end[reg] = POINTER_TO_OFFSET (regend[reg]);
4213 }
4214 }
4215
4216 /* If the regs structure we return has more elements than
4217 were in the pattern, set the extra elements to -1. */
4218 for (ptrdiff_t reg = num_regs; reg < regs->num_regs; reg++)
4219 regs->start[reg] = regs->end[reg] = -1;
4220 }
4221
4222 DEBUG_PRINT ("%td failure points pushed, %td popped (%td remain).\n",
4223 nfailure_points_pushed, nfailure_points_popped,
4224 nfailure_points_pushed - nfailure_points_popped);
4225 DEBUG_PRINT ("%td registers pushed.\n", num_regs_pushed);
4226
4227 ptrdiff_t dcnt = POINTER_TO_OFFSET (d) - pos;
4228
4229 DEBUG_PRINT ("Returning %td from re_match_2.\n", dcnt);
4230
4231 unbind_to (count, Qnil);
4232 SAFE_FREE ();
4233 /* The factor of 50 below is a heuristic that needs to be tuned. It
4234 means we consider 50 buffer positions examined by this function
4235 roughly equivalent to the display engine iterating over a single
4236 buffer position. */
4237 if (max_redisplay_ticks > 0 && nchars > 0)
4238 update_redisplay_ticks (nchars / 50 + 1, NULL);
4239 return dcnt;
4240 }
4241
4242 /* Otherwise match next pattern command. */
4243 switch (*p++)
4244 {
4245 /* Ignore these. Used to ignore the n of succeed_n's which
4246 currently have n == 0. */
4247 case no_op:
4248 DEBUG_PRINT ("EXECUTING no_op.\n");
4249 break;
4250
4251 case succeed:
4252 DEBUG_PRINT ("EXECUTING succeed.\n");
4253 goto succeed_label;
4254
4255 /* Match the next n pattern characters exactly. The following
4256 byte in the pattern defines n, and the n bytes after that
4257 are the characters to match. */
4258 case exactn:
4259 mcnt = *p++;
4260 DEBUG_PRINT ("EXECUTING exactn %d.\n", mcnt);
4261
4262 /* Remember the start point to rollback upon failure. */
4263 dfail = d;
4264
4265 /* The cost of testing 'translate' is comparatively small. */
4266 if (target_multibyte)
4267 do
4268 {
4269 int pat_charlen, buf_charlen;
4270 int pat_ch, buf_ch;
4271
4272 PREFETCH (d = dfail);
4273 if (multibyte)
4274 pat_ch = string_char_and_length (p, &pat_charlen);
4275 else
4276 {
4277 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
4278 pat_charlen = 1;
4279 }
4280 buf_ch = string_char_and_length (d, &buf_charlen);
4281
4282 if (TRANSLATE (buf_ch) != pat_ch)
4283 {
4284 d = dfail;
4285 goto fail;
4286 }
4287
4288 p += pat_charlen;
4289 d += buf_charlen;
4290 mcnt -= pat_charlen;
4291 nchars++;
4292 }
4293 while (mcnt > 0);
4294 else
4295 do
4296 {
4297 int pat_charlen;
4298 int pat_ch, buf_ch;
4299
4300 PREFETCH (d = dfail);
4301 if (multibyte)
4302 {
4303 pat_ch = string_char_and_length (p, &pat_charlen);
4304 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
4305 }
4306 else
4307 {
4308 pat_ch = *p;
4309 pat_charlen = 1;
4310 }
4311 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
4312 if (! CHAR_BYTE8_P (buf_ch))
4313 {
4314 buf_ch = TRANSLATE (buf_ch);
4315 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
4316 if (buf_ch < 0)
4317 buf_ch = *d;
4318 }
4319 else
4320 buf_ch = *d;
4321 if (buf_ch != pat_ch)
4322 {
4323 d = dfail;
4324 goto fail;
4325 }
4326 p += pat_charlen;
4327 d++;
4328 mcnt -= pat_charlen;
4329 nchars++;
4330 }
4331 while (mcnt > 0);
4332
4333 break;
4334
4335
4336 /* Match any character except newline. */
4337 case anychar:
4338 {
4339 int buf_charlen;
4340 int buf_ch;
4341
4342 DEBUG_PRINT ("EXECUTING anychar.\n");
4343
4344 PREFETCH ();
4345 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
4346 target_multibyte);
4347 buf_ch = TRANSLATE (buf_ch);
4348 if (buf_ch == '\n')
4349 goto fail;
4350
4351 DEBUG_PRINT (" Matched \"%d\".\n", *d);
4352 d += buf_charlen;
4353 nchars++;
4354 }
4355 break;
4356
4357
4358 case charset:
4359 case charset_not:
4360 {
4361 /* Whether matching against a unibyte character. */
4362 bool unibyte_char = false;
4363
4364 DEBUG_PRINT ("EXECUTING charset%s.\n",
4365 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
4366
4367 PREFETCH ();
4368 int len;
4369 int corig = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
4370 int c = corig;
4371 if (target_multibyte)
4372 {
4373 int c1;
4374
4375 c = TRANSLATE (c);
4376 c1 = RE_CHAR_TO_UNIBYTE (c);
4377 if (c1 >= 0)
4378 {
4379 unibyte_char = true;
4380 c = c1;
4381 }
4382 }
4383 else
4384 {
4385 int c1 = RE_CHAR_TO_MULTIBYTE (c);
4386
4387 if (! CHAR_BYTE8_P (c1))
4388 {
4389 c1 = TRANSLATE (c1);
4390 c1 = RE_CHAR_TO_UNIBYTE (c1);
4391 if (c1 >= 0)
4392 {
4393 unibyte_char = true;
4394 c = c1;
4395 }
4396 }
4397 else
4398 unibyte_char = true;
4399 }
4400
4401 p -= 1;
4402 if (!execute_charset (&p, c, corig, unibyte_char, translate))
4403 goto fail;
4404
4405 d += len;
4406 nchars++;
4407 }
4408 break;
4409
4410
4411 /* The beginning of a group is represented by start_memory.
4412 The argument is the register number. The text
4413 matched within the group is recorded (in the internal
4414 registers data structure) under the register number. */
4415 case start_memory:
4416 DEBUG_PRINT ("EXECUTING start_memory %d:\n", *p);
4417 eassert (0 < *p && *p < num_regs);
4418
4419 /* In case we need to undo this operation (via backtracking). */
4420 PUSH_FAILURE_REG (*p);
4421
4422 regstart[*p] = d;
4423 DEBUG_PRINT (" regstart: %td\n", POINTER_TO_OFFSET (regstart[*p]));
4424
4425 /* Move past the register number and inner group count. */
4426 p += 1;
4427 break;
4428
4429
4430 /* The stop_memory opcode represents the end of a group. Its
4431 argument is the same as start_memory's: the register number. */
4432 case stop_memory:
4433 DEBUG_PRINT ("EXECUTING stop_memory %d:\n", *p);
4434
4435 eassert (0 < *p && *p < num_regs);
4436 eassert (!REG_UNSET (regstart[*p]));
4437 /* Strictly speaking, there should be code such as:
4438
4439 eassert (REG_UNSET (regend[*p]));
4440 PUSH_FAILURE_REGSTOP (*p);
4441
4442 But the only info to be pushed is regend[*p] and it is known to
4443 be UNSET, so there really isn't anything to push.
4444 Not pushing anything, on the other hand deprives us from the
4445 guarantee that regend[*p] is UNSET since undoing this operation
4446 will not reset its value properly. This is not important since
4447 the value will only be read on the next start_memory or at
4448 the very end and both events can only happen if this stop_memory
4449 is *not* undone. */
4450
4451 regend[*p] = d;
4452 DEBUG_PRINT (" regend: %td\n", POINTER_TO_OFFSET (regend[*p]));
4453
4454 /* Move past the register number and the inner group count. */
4455 p += 1;
4456 break;
4457
4458
4459 /* \<digit> has been turned into a 'duplicate' command which is
4460 followed by the numeric value of <digit> as the register number. */
4461 case duplicate:
4462 {
4463 re_char *d2, *dend2;
4464 int regno = *p++; /* Get which register to match against. */
4465 DEBUG_PRINT ("EXECUTING duplicate %d.\n", regno);
4466
4467 /* Can't back reference a group which we've never matched. */
4468 eassert (0 < regno && regno < num_regs);
4469 eassert (REG_UNSET (regstart[regno]) <= REG_UNSET (regend[regno]));
4470 if (REG_UNSET (regend[regno]))
4471 goto fail;
4472
4473 /* Where in input to try to start matching. */
4474 d2 = regstart[regno];
4475
4476 /* Remember the start point to rollback upon failure. */
4477 dfail = d;
4478
4479 /* Where to stop matching; if both the place to start and
4480 the place to stop matching are in the same string, then
4481 set to the place to stop, otherwise, for now have to use
4482 the end of the first string. */
4483
4484 dend2 = ((FIRST_STRING_P (regstart[regno])
4485 == FIRST_STRING_P (regend[regno]))
4486 ? regend[regno] : end_match_1);
4487 for (;;)
4488 {
4489 ptrdiff_t dcnt;
4490
4491 /* If necessary, advance to next segment in register
4492 contents. */
4493 while (d2 == dend2)
4494 {
4495 if (dend2 == end_match_2) break;
4496 if (dend2 == regend[regno]) break;
4497
4498 /* End of string1 => advance to string2. */
4499 d2 = string2;
4500 dend2 = regend[regno];
4501 }
4502 /* At end of register contents => success */
4503 if (d2 == dend2) break;
4504
4505 /* If necessary, advance to next segment in data. */
4506 PREFETCH (d = dfail);
4507
4508 /* How many characters left in this segment to match. */
4509 dcnt = dend - d;
4510
4511 /* Want how many consecutive characters we can match in
4512 one shot, so, if necessary, adjust the count. */
4513 if (dcnt > dend2 - d2)
4514 dcnt = dend2 - d2;
4515
4516 /* Compare that many; failure if mismatch, else move
4517 past them. */
4518 if (!NILP (translate)
4519 ? bcmp_translate (d, d2, dcnt, translate, target_multibyte)
4520 : memcmp (d, d2, dcnt))
4521 {
4522 d = dfail;
4523 goto fail;
4524 }
4525 d += dcnt, d2 += dcnt;
4526 nchars++;
4527 }
4528 }
4529 break;
4530
4531
4532 /* begline matches the empty string at the beginning of the string,
4533 and after newlines. */
4534 case begline:
4535 DEBUG_PRINT ("EXECUTING begline.\n");
4536
4537 if (AT_STRINGS_BEG (d))
4538 break;
4539 else
4540 {
4541 unsigned c;
4542 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
4543 if (c == '\n')
4544 break;
4545 }
4546 goto fail;
4547
4548
4549 /* endline is the dual of begline. */
4550 case endline:
4551 DEBUG_PRINT ("EXECUTING endline.\n");
4552
4553 if (AT_STRINGS_END (d))
4554 break;
4555 PREFETCH_NOLIMIT ();
4556 if (*d == '\n')
4557 break;
4558 goto fail;
4559
4560
4561 /* Match at the very beginning of the data. */
4562 case begbuf:
4563 DEBUG_PRINT ("EXECUTING begbuf.\n");
4564 if (AT_STRINGS_BEG (d))
4565 break;
4566 goto fail;
4567
4568
4569 /* Match at the very end of the data. */
4570 case endbuf:
4571 DEBUG_PRINT ("EXECUTING endbuf.\n");
4572 if (AT_STRINGS_END (d))
4573 break;
4574 goto fail;
4575
4576
4577 /* on_failure_keep_string_jump is used to optimize '.*\n'. It
4578 pushes NULL as the value for the string on the stack. Then
4579 'POP_FAILURE_POINT' will keep the current value for the
4580 string, instead of restoring it. To see why, consider
4581 matching 'foo\nbar' against '.*\n'. The .* matches the foo;
4582 then the . fails against the \n. But the next thing we want
4583 to do is match the \n against the \n; if we restored the
4584 string value, we would be back at the foo.
4585
4586 Because this is used only in specific cases, we don't need to
4587 check all the things that 'on_failure_jump' does, to make
4588 sure the right things get saved on the stack. Hence we don't
4589 share its code. The only reason to push anything on the
4590 stack at all is that otherwise we would have to change
4591 'anychar's code to do something besides goto fail in this
4592 case; that seems worse than this. */
4593 case on_failure_keep_string_jump:
4594 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4595 DEBUG_PRINT ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
4596 mcnt, p + mcnt);
4597
4598 PUSH_FAILURE_POINT (p - 3, NULL);
4599 break;
4600
4601 /* A nasty loop is introduced by the non-greedy *? and +?.
4602 With such loops, the stack only ever contains one failure point
4603 at a time, so that a plain on_failure_jump_loop kind of
4604 cycle detection cannot work. Worse yet, such a detection
4605 can not only fail to detect a cycle, but it can also wrongly
4606 detect a cycle (between different instantiations of the same
4607 loop).
4608 So the method used for those nasty loops is a little different:
4609 We use a special cycle-detection-stack-frame which is pushed
4610 when the on_failure_jump_nastyloop failure-point is *popped*.
4611 This special frame thus marks the beginning of one iteration
4612 through the loop and we can hence easily check right here
4613 whether something matched between the beginning and the end of
4614 the loop. */
4615 case on_failure_jump_nastyloop:
4616 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4617 DEBUG_PRINT ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
4618 mcnt, p + mcnt);
4619
4620 eassert ((re_opcode_t)p[-4] == no_op);
4621 {
4622 bool cycle = false;
4623 CHECK_INFINITE_LOOP (p - 4, d);
4624 if (!cycle)
4625 /* If there's a cycle, just continue without pushing
4626 this failure point. The failure point is the "try again"
4627 option, which shouldn't be tried.
4628 We want (x?)*?y\1z to match both xxyz and xxyxz. */
4629 PUSH_FAILURE_POINT (p - 3, d);
4630 }
4631 break;
4632
4633 /* Simple loop detecting on_failure_jump: just check on the
4634 failure stack if the same spot was already hit earlier. */
4635 case on_failure_jump_loop:
4636 on_failure:
4637 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4638 DEBUG_PRINT ("EXECUTING on_failure_jump_loop %d (to %p):\n",
4639 mcnt, p + mcnt);
4640 {
4641 bool cycle = false;
4642 CHECK_INFINITE_LOOP (p - 3, d);
4643 if (cycle)
4644 /* If there's a cycle, get out of the loop, as if the matching
4645 had failed. We used to just 'goto fail' here, but that was
4646 aborting the search a bit too early: we want to keep the
4647 empty-loop-match and keep matching after the loop.
4648 We want (x?)*y\1z to match both xxyz and xxyxz. */
4649 p += mcnt;
4650 else
4651 PUSH_FAILURE_POINT (p - 3, d);
4652 }
4653 break;
4654
4655
4656 /* Uses of on_failure_jump:
4657
4658 Each alternative starts with an on_failure_jump that points
4659 to the beginning of the next alternative. Each alternative
4660 except the last ends with a jump that in effect jumps past
4661 the rest of the alternatives. (They really jump to the
4662 ending jump of the following alternative, because tensioning
4663 these jumps is a hassle.)
4664
4665 Repeats start with an on_failure_jump that points past both
4666 the repetition text and either the following jump or
4667 pop_failure_jump back to this on_failure_jump. */
4668 case on_failure_jump:
4669 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4670 DEBUG_PRINT ("EXECUTING on_failure_jump %d (to %p):\n",
4671 mcnt, p + mcnt);
4672
4673 PUSH_FAILURE_POINT (p -3, d);
4674 break;
4675
4676 /* This operation is used for greedy *.
4677 Compare the beginning of the repeat with what in the
4678 pattern follows its end. If we can establish that there
4679 is nothing that they would both match, i.e., that we
4680 would have to backtrack because of (as in, e.g., 'a*a')
4681 then we can use a non-backtracking loop based on
4682 on_failure_keep_string_jump instead of on_failure_jump. */
4683 case on_failure_jump_smart:
4684 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4685 DEBUG_PRINT ("EXECUTING on_failure_jump_smart %d (to %p).\n",
4686 mcnt, p + mcnt);
4687 {
4688 re_char *p1 = p; /* Next operation. */
4689 /* Discard 'const', making re_search non-reentrant. */
4690 unsigned char *p2 = (unsigned char *) p + mcnt; /* Jump dest. */
4691 unsigned char *p3 = (unsigned char *) p - 3; /* opcode location. */
4692
4693 p -= 3; /* Reset so that we will re-execute the
4694 instruction once it's been changed. */
4695
4696 EXTRACT_NUMBER (mcnt, p2 - 2);
4697
4698 /* Ensure this is indeed the trivial kind of loop
4699 we are expecting. */
4700 eassert (skip_one_char (p1) == p2 - 3);
4701 eassert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
4702 DEBUG_STATEMENT (regex_emacs_debug += 2);
4703 if (mutually_exclusive_p (bufp, p1, p2))
4704 {
4705 /* Use a fast 'on_failure_keep_string_jump' loop. */
4706 DEBUG_PRINT (" smart exclusive => fast loop.\n");
4707 *p3 = (unsigned char) on_failure_keep_string_jump;
4708 STORE_NUMBER (p2 - 2, mcnt + 3);
4709 }
4710 else
4711 {
4712 /* Default to a safe 'on_failure_jump' loop. */
4713 DEBUG_PRINT (" smart default => slow loop.\n");
4714 *p3 = (unsigned char) on_failure_jump;
4715 }
4716 DEBUG_STATEMENT (regex_emacs_debug -= 2);
4717 }
4718 break;
4719
4720 /* Unconditionally jump (without popping any failure points). */
4721 case jump:
4722 unconditional_jump:
4723 maybe_quit ();
4724 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4725 DEBUG_PRINT ("EXECUTING jump %d ", mcnt);
4726 p += mcnt; /* Do the jump. */
4727 DEBUG_PRINT ("(to %p).\n", p);
4728 break;
4729
4730
4731 /* Have to succeed matching what follows at least n times.
4732 After that, handle like 'on_failure_jump'. */
4733 case succeed_n:
4734 /* Signedness doesn't matter since we only compare MCNT to 0. */
4735 EXTRACT_NUMBER (mcnt, p + 2);
4736 DEBUG_PRINT ("EXECUTING succeed_n %d.\n", mcnt);
4737
4738 /* Originally, mcnt is how many times we HAVE to succeed. */
4739 if (mcnt != 0)
4740 {
4741 /* Discard 'const', making re_search non-reentrant. */
4742 unsigned char *p2 = (unsigned char *) p + 2; /* counter loc. */
4743 mcnt--;
4744 p += 4;
4745 PUSH_NUMBER (p2, mcnt);
4746 }
4747 else
4748 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
4749 goto on_failure;
4750 break;
4751
4752 case jump_n:
4753 /* Signedness doesn't matter since we only compare MCNT to 0. */
4754 EXTRACT_NUMBER (mcnt, p + 2);
4755 DEBUG_PRINT ("EXECUTING jump_n %d.\n", mcnt);
4756
4757 /* Originally, this is how many times we CAN jump. */
4758 if (mcnt != 0)
4759 {
4760 /* Discard 'const', making re_search non-reentrant. */
4761 unsigned char *p2 = (unsigned char *) p + 2; /* counter loc. */
4762 mcnt--;
4763 PUSH_NUMBER (p2, mcnt);
4764 goto unconditional_jump;
4765 }
4766 /* If don't have to jump any more, skip over the rest of command. */
4767 else
4768 p += 4;
4769 break;
4770
4771 case set_number_at:
4772 {
4773 unsigned char *p2; /* Location of the counter. */
4774 DEBUG_PRINT ("EXECUTING set_number_at.\n");
4775
4776 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4777 /* Discard 'const', making re_search non-reentrant. */
4778 p2 = (unsigned char *) p + mcnt;
4779 /* Signedness doesn't matter since we only copy MCNT's bits. */
4780 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4781 DEBUG_PRINT (" Setting %p to %d.\n", p2, mcnt);
4782 PUSH_NUMBER (p2, mcnt);
4783 break;
4784 }
4785
4786 case wordbound:
4787 case notwordbound:
4788 {
4789 bool not = (re_opcode_t) *(p - 1) == notwordbound;
4790 DEBUG_PRINT ("EXECUTING %swordbound.\n", not ? "not" : "");
4791
4792 /* We SUCCEED (or FAIL) in one of the following cases: */
4793
4794 /* Case 1: D is at the beginning or the end of string. */
4795 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
4796 not = !not;
4797 else
4798 {
4799 /* C1 is the character before D, S1 is the syntax of C1, C2
4800 is the character at D, and S2 is the syntax of C2. */
4801 int c1, c2;
4802 int s1, s2;
4803 int dummy;
4804 ptrdiff_t offset = POINTER_TO_OFFSET (d);
4805 ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
4806 UPDATE_SYNTAX_TABLE (charpos);
4807 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4808 nchars++;
4809 s1 = SYNTAX (c1);
4810 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
4811 PREFETCH_NOLIMIT ();
4812 GET_CHAR_AFTER (c2, d, dummy);
4813 nchars++;
4814 s2 = SYNTAX (c2);
4815
4816 if (/* Case 2: Only one of S1 and S2 is Sword. */
4817 ((s1 == Sword) != (s2 == Sword))
4818 /* Case 3: Both of S1 and S2 are Sword, and macro
4819 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
4820 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
4821 not = !not;
4822 }
4823 if (not)
4824 break;
4825 else
4826 goto fail;
4827 }
4828
4829 case wordbeg:
4830 DEBUG_PRINT ("EXECUTING wordbeg.\n");
4831
4832 /* We FAIL in one of the following cases: */
4833
4834 /* Case 1: D is at the end of string. */
4835 if (AT_STRINGS_END (d))
4836 goto fail;
4837 else
4838 {
4839 /* C1 is the character before D, S1 is the syntax of C1, C2
4840 is the character at D, and S2 is the syntax of C2. */
4841 int c1, c2;
4842 int s1, s2;
4843 int dummy;
4844 ptrdiff_t offset = POINTER_TO_OFFSET (d);
4845 ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset);
4846 UPDATE_SYNTAX_TABLE (charpos);
4847 PREFETCH ();
4848 GET_CHAR_AFTER (c2, d, dummy);
4849 nchars++;
4850 s2 = SYNTAX (c2);
4851
4852 /* Case 2: S2 is not Sword. */
4853 if (s2 != Sword)
4854 goto fail;
4855
4856 /* Case 3: D is not at the beginning of string ... */
4857 if (!AT_STRINGS_BEG (d))
4858 {
4859 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4860 nchars++;
4861 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
4862 s1 = SYNTAX (c1);
4863
4864 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
4865 returns 0. */
4866 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
4867 goto fail;
4868 }
4869 }
4870 break;
4871
4872 case wordend:
4873 DEBUG_PRINT ("EXECUTING wordend.\n");
4874
4875 /* We FAIL in one of the following cases: */
4876
4877 /* Case 1: D is at the beginning of string. */
4878 if (AT_STRINGS_BEG (d))
4879 goto fail;
4880 else
4881 {
4882 /* C1 is the character before D, S1 is the syntax of C1, C2
4883 is the character at D, and S2 is the syntax of C2. */
4884 int c1, c2;
4885 int s1, s2;
4886 int dummy;
4887 ptrdiff_t offset = POINTER_TO_OFFSET (d);
4888 ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
4889 UPDATE_SYNTAX_TABLE (charpos);
4890 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4891 nchars++;
4892 s1 = SYNTAX (c1);
4893
4894 /* Case 2: S1 is not Sword. */
4895 if (s1 != Sword)
4896 goto fail;
4897
4898 /* Case 3: D is not at the end of string ... */
4899 if (!AT_STRINGS_END (d))
4900 {
4901 PREFETCH_NOLIMIT ();
4902 GET_CHAR_AFTER (c2, d, dummy);
4903 nchars++;
4904 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
4905 s2 = SYNTAX (c2);
4906
4907 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
4908 returns 0. */
4909 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
4910 goto fail;
4911 }
4912 }
4913 break;
4914
4915 case symbeg:
4916 DEBUG_PRINT ("EXECUTING symbeg.\n");
4917
4918 /* We FAIL in one of the following cases: */
4919
4920 /* Case 1: D is at the end of string. */
4921 if (AT_STRINGS_END (d))
4922 goto fail;
4923 else
4924 {
4925 /* C1 is the character before D, S1 is the syntax of C1, C2
4926 is the character at D, and S2 is the syntax of C2. */
4927 int c1, c2;
4928 int s1, s2;
4929 ptrdiff_t offset = POINTER_TO_OFFSET (d);
4930 ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset);
4931 UPDATE_SYNTAX_TABLE (charpos);
4932 PREFETCH ();
4933 c2 = RE_STRING_CHAR (d, target_multibyte);
4934 nchars++;
4935 s2 = SYNTAX (c2);
4936
4937 /* Case 2: S2 is neither Sword nor Ssymbol. */
4938 if (s2 != Sword && s2 != Ssymbol)
4939 goto fail;
4940
4941 /* Case 3: D is not at the beginning of string ... */
4942 if (!AT_STRINGS_BEG (d))
4943 {
4944 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4945 nchars++;
4946 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
4947 s1 = SYNTAX (c1);
4948
4949 /* ... and S1 is Sword or Ssymbol. */
4950 if (s1 == Sword || s1 == Ssymbol)
4951 goto fail;
4952 }
4953 }
4954 break;
4955
4956 case symend:
4957 DEBUG_PRINT ("EXECUTING symend.\n");
4958
4959 /* We FAIL in one of the following cases: */
4960
4961 /* Case 1: D is at the beginning of string. */
4962 if (AT_STRINGS_BEG (d))
4963 goto fail;
4964 else
4965 {
4966 /* C1 is the character before D, S1 is the syntax of C1, C2
4967 is the character at D, and S2 is the syntax of C2. */
4968 int c1, c2;
4969 int s1, s2;
4970 ptrdiff_t offset = POINTER_TO_OFFSET (d);
4971 ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
4972 UPDATE_SYNTAX_TABLE (charpos);
4973 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4974 nchars++;
4975 s1 = SYNTAX (c1);
4976
4977 /* Case 2: S1 is neither Ssymbol nor Sword. */
4978 if (s1 != Sword && s1 != Ssymbol)
4979 goto fail;
4980
4981 /* Case 3: D is not at the end of string ... */
4982 if (!AT_STRINGS_END (d))
4983 {
4984 PREFETCH_NOLIMIT ();
4985 c2 = RE_STRING_CHAR (d, target_multibyte);
4986 nchars++;
4987 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
4988 s2 = SYNTAX (c2);
4989
4990 /* ... and S2 is Sword or Ssymbol. */
4991 if (s2 == Sword || s2 == Ssymbol)
4992 goto fail;
4993 }
4994 }
4995 break;
4996
4997 case syntaxspec:
4998 case notsyntaxspec:
4999 {
5000 bool not = (re_opcode_t) *(p - 1) == notsyntaxspec;
5001 mcnt = *p++;
5002 DEBUG_PRINT ("EXECUTING %ssyntaxspec %d.\n", not ? "not" : "",
5003 mcnt);
5004 PREFETCH ();
5005 {
5006 ptrdiff_t offset = POINTER_TO_OFFSET (d);
5007 ptrdiff_t pos1 = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5008 UPDATE_SYNTAX_TABLE (pos1);
5009 }
5010 {
5011 int len;
5012 int c;
5013
5014 GET_CHAR_AFTER (c, d, len);
5015 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
5016 goto fail;
5017 d += len;
5018 nchars++;
5019 }
5020 }
5021 break;
5022
5023 case at_dot:
5024 DEBUG_PRINT ("EXECUTING at_dot.\n");
5025 if (PTR_BYTE_POS (d) != PT_BYTE)
5026 goto fail;
5027 break;
5028
5029 case categoryspec:
5030 case notcategoryspec:
5031 {
5032 bool not = (re_opcode_t) *(p - 1) == notcategoryspec;
5033 mcnt = *p++;
5034 DEBUG_PRINT ("EXECUTING %scategoryspec %d.\n",
5035 not ? "not" : "", mcnt);
5036 PREFETCH ();
5037
5038 {
5039 int len;
5040 int c;
5041 GET_CHAR_AFTER (c, d, len);
5042 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
5043 goto fail;
5044 d += len;
5045 nchars++;
5046 }
5047 }
5048 break;
5049
5050 default:
5051 abort ();
5052 }
5053 continue; /* Successfully executed one pattern command; keep going. */
5054
5055
5056 /* We goto here if a matching operation fails. */
5057 fail:
5058 maybe_quit ();
5059 if (!FAIL_STACK_EMPTY ())
5060 {
5061 re_char *str, *pat;
5062 /* A restart point is known. Restore to that state. */
5063 DEBUG_PRINT ("\nFAIL:\n");
5064 POP_FAILURE_POINT (str, pat);
5065 switch (*pat++)
5066 {
5067 case on_failure_keep_string_jump:
5068 eassert (str == NULL);
5069 goto continue_failure_jump;
5070
5071 case on_failure_jump_nastyloop:
5072 eassert ((re_opcode_t)pat[-2] == no_op);
5073 PUSH_FAILURE_POINT (pat - 2, str);
5074 FALLTHROUGH;
5075 case on_failure_jump_loop:
5076 case on_failure_jump:
5077 case succeed_n:
5078 d = str;
5079 continue_failure_jump:
5080 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
5081 p = pat + mcnt;
5082 break;
5083
5084 case no_op:
5085 /* A special frame used for nastyloops. */
5086 goto fail;
5087
5088 default:
5089 abort ();
5090 }
5091
5092 eassert (p >= bufp->buffer && p <= pend);
5093
5094 if (d >= string1 && d <= end1)
5095 dend = end_match_1;
5096 }
5097 else
5098 break; /* Matching at this starting point really fails. */
5099 } /* for (;;) */
5100
5101 if (best_regs_set)
5102 goto restore_best_regs;
5103
5104 unbind_to (count, Qnil);
5105 SAFE_FREE ();
5106
5107 if (max_redisplay_ticks > 0 && nchars > 0)
5108 update_redisplay_ticks (nchars / 50 + 1, NULL);
5109
5110 return -1; /* Failure to match. */
5111 }
5112
5113 /* Subroutine definitions for re_match_2. */
5114
5115 /* Return true if TRANSLATE[S1] and TRANSLATE[S2] are not identical
5116 for LEN bytes. */
5117
5118 static bool
5119 bcmp_translate (re_char *s1, re_char *s2, ptrdiff_t len,
5120 Lisp_Object translate, bool target_multibyte)
5121 {
5122 re_char *p1 = s1, *p2 = s2;
5123 re_char *p1_end = s1 + len;
5124 re_char *p2_end = s2 + len;
5125
5126 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
5127 different lengths, but relying on a single LEN would break this. -sm */
5128 while (p1 < p1_end && p2 < p2_end)
5129 {
5130 int p1_charlen, p2_charlen;
5131 int p1_ch, p2_ch;
5132
5133 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
5134 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
5135
5136 if (RE_TRANSLATE (translate, p1_ch)
5137 != RE_TRANSLATE (translate, p2_ch))
5138 return true;
5139
5140 p1 += p1_charlen, p2 += p2_charlen;
5141 }
5142
5143 return p1 != p1_end || p2 != p2_end;
5144 }
5145
5146 /* Entry points for GNU code. */
5147
5148 /* re_compile_pattern is the GNU regular expression compiler: it
5149 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5150 Returns 0 if the pattern was valid, otherwise an error string.
5151
5152 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
5153 are set in BUFP on entry.
5154
5155 We call regex_compile to do the actual compilation. */
5156
5157 const char *
5158 re_compile_pattern (const char *pattern, ptrdiff_t length,
5159 bool posix_backtracking, const char *whitespace_regexp,
5160 struct re_pattern_buffer *bufp)
5161 {
5162 bufp->regs_allocated = REGS_UNALLOCATED;
5163
5164 reg_errcode_t ret
5165 = regex_compile ((re_char *) pattern, length,
5166 posix_backtracking,
5167 whitespace_regexp,
5168 bufp);
5169
5170 if (!ret)
5171 return NULL;
5172 return re_error_msgid[ret];
5173 }