1 /* Emacs regular expression matching and search
2
3 Copyright (C) 1993-2023 Free Software Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 /* TODO:
19 - structure the opcode space into opcode+flag.
20 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
21 need to modify the compiled regexp so that re_search can be reentrant.
22 - get rid of on_failure_jump_smart by doing the optimization in re_comp
23 rather than at run-time, so that re_search can be reentrant.
24 */
25
26 #include <config.h>
27
28 #include "regex-emacs.h"
29
30 #include <stdlib.h>
31
32 #include "character.h"
33 #include "buffer.h"
34 #include "syntax.h"
35 #include "category.h"
36 #include "dispextern.h"
37
38 /* Maximum number of duplicates an interval can allow. Some systems
39 define this in other header files, but we want our value, so remove
40 any previous define. Repeat counts are stored in opcodes as 2-byte
41 unsigned integers. */
42 #ifdef RE_DUP_MAX
43 # undef RE_DUP_MAX
44 #endif
45 #define RE_DUP_MAX (0xffff)
46
47 /* Make syntax table lookup grant data in gl_state. */
48 #define SYNTAX(c) syntax_property (c, 1)
49
50 /* Convert the pointer to the char to BEG-based offset from the start. */
51 #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
52 /* Strings are 0-indexed, buffers are 1-indexed; pun on the boolean
53 result to get the right base index. */
54 #define POS_AS_IN_BUFFER(p) \
55 ((p) + (NILP (gl_state.object) || BUFFERP (gl_state.object)))
56
57 #define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
58 #define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
59 #define RE_STRING_CHAR(p, multibyte) \
60 (multibyte ? STRING_CHAR (p) : *(p))
61 #define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
62 (multibyte ? string_char_and_length (p, &(len)) : ((len) = 1, *(p)))
63
64 #define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
65
66 #define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
67
68 /* Set C a (possibly converted to multibyte) character before P. P
69 points into a string which is the virtual concatenation of STR1
70 (which ends at END1) or STR2 (which ends at END2). */
71 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
72 do { \
73 if (target_multibyte) \
74 { \
75 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
76 re_char *dlimit = (p) > (str2) && (p) <= (end2) ? (str2) : (str1); \
77 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)) \
78 continue; \
79 c = STRING_CHAR (dtemp); \
80 } \
81 else \
82 { \
83 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
84 (c) = RE_CHAR_TO_MULTIBYTE (c); \
85 } \
86 } while (false)
87
88 /* Set C a (possibly converted to multibyte) character at P, and set
89 LEN to the byte length of that character. */
90 #define GET_CHAR_AFTER(c, p, len) \
91 do { \
92 if (target_multibyte) \
93 (c) = string_char_and_length (p, &(len)); \
94 else \
95 { \
96 (c) = *p; \
97 len = 1; \
98 (c) = RE_CHAR_TO_MULTIBYTE (c); \
99 } \
100 } while (false)
101
102 /* 1 if C is an ASCII character. */
103 #define IS_REAL_ASCII(c) ((c) < 0200)
104
105 /* 1 if C is a unibyte character. */
106 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
107
108 /* The Emacs definitions should not be directly affected by locales. */
109
110 /* In Emacs, these are only used for single-byte characters. */
111 #define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
112 #define ISCNTRL(c) ((c) < ' ')
113 #define ISXDIGIT(c) (0 <= char_hexdigit (c))
114
115 /* The rest must handle multibyte characters. */
116
117 #define ISBLANK(c) (IS_REAL_ASCII (c) \
118 ? ((c) == ' ' || (c) == '\t') \
119 : blankp (c))
120
121 #define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
122 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240) \
123 : graphicp (c))
124
125 #define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
126 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
127 : printablep (c))
128
129 #define ISALNUM(c) (IS_REAL_ASCII (c) \
130 ? (((c) >= 'a' && (c) <= 'z') \
131 || ((c) >= 'A' && (c) <= 'Z') \
132 || ((c) >= '0' && (c) <= '9')) \
133 : alphanumericp (c))
134
135 #define ISALPHA(c) (IS_REAL_ASCII (c) \
136 ? (((c) >= 'a' && (c) <= 'z') \
137 || ((c) >= 'A' && (c) <= 'Z')) \
138 : alphabeticp (c))
139
140 #define ISLOWER(c) lowercasep (c)
141
142 #define ISPUNCT(c) (IS_REAL_ASCII (c) \
143 ? ((c) > ' ' && (c) < 0177 \
144 && !(((c) >= 'a' && (c) <= 'z') \
145 || ((c) >= 'A' && (c) <= 'Z') \
146 || ((c) >= '0' && (c) <= '9'))) \
147 : SYNTAX (c) != Sword)
148
149 #define ISSPACE(c) (SYNTAX (c) == Swhitespace)
150
151 #define ISUPPER(c) uppercasep (c)
152
153 #define ISWORD(c) (SYNTAX (c) == Sword)
154
155 /* Use alloca instead of malloc. This is because using malloc in
156 re_search* or re_match* could cause memory leaks when C-g is used
157 in Emacs (note that SAFE_ALLOCA could also call malloc, but does so
158 via 'record_xmalloc' which uses 'unwind_protect' to ensure the
159 memory is freed even in case of non-local exits); also, malloc is
160 slower and causes storage fragmentation. On the other hand, malloc
161 is more portable, and easier to debug.
162
163 Because we sometimes use alloca, some routines have to be macros,
164 not functions -- 'alloca'-allocated space disappears at the end of the
165 function it is called in. */
166
167 /* This may be adjusted in main(), if the stack is successfully grown. */
168 ptrdiff_t emacs_re_safe_alloca = MAX_ALLOCA;
169 /* Like USE_SAFE_ALLOCA, but use emacs_re_safe_alloca. */
170 #define REGEX_USE_SAFE_ALLOCA \
171 USE_SAFE_ALLOCA; sa_avail = emacs_re_safe_alloca
172
173 /* Assumes a 'char *destination' variable. */
174 #define REGEX_REALLOCATE(source, osize, nsize) \
175 (destination = SAFE_ALLOCA (nsize), \
176 memcpy (destination, source, osize))
177
178 /* True if 'size1' is non-NULL and PTR is pointing anywhere inside
179 'string1' or just past its end. This works if PTR is NULL, which is
180 a good thing. */
181 #define FIRST_STRING_P(ptr) \
182 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
183
184 #define BYTEWIDTH 8 /* In bits. */
185
186 /* Type of source-pattern and string chars. */
187 typedef const unsigned char re_char;
188
189 static void re_compile_fastmap (struct re_pattern_buffer *);
190 static ptrdiff_t re_match_2_internal (struct re_pattern_buffer *bufp,
191 re_char *string1, ptrdiff_t size1,
192 re_char *string2, ptrdiff_t size2,
193 ptrdiff_t pos,
194 struct re_registers *regs,
195 ptrdiff_t stop);
196
197 /* These are the command codes that appear in compiled regular
198 expressions. Some opcodes are followed by argument bytes. A
199 command code can specify any interpretation whatsoever for its
200 arguments. Zero bytes may appear in the compiled regular expression. */
201
202 typedef enum
203 {
204 no_op = 0,
205
206 /* Succeed right away--no more backtracking. */
207 succeed,
208
209 /* Followed by one byte giving n, then by n literal bytes. */
210 exactn,
211
212 /* Matches any (more or less) character. */
213 anychar,
214
215 /* Matches any one char belonging to specified set. First
216 following byte is number of bitmap bytes. Then come bytes
217 for a bitmap saying which chars are in. Bits in each byte
218 are ordered low-bit-first. A character is in the set if its
219 bit is 1. A character too large to have a bit in the map is
220 automatically not in the set.
221
222 If the length byte has the 0x80 bit set, then that stuff
223 is followed by a range table:
224 2 bytes of flags for character sets (low 8 bits, high 8 bits)
225 See RANGE_TABLE_WORK_BITS below.
226 2 bytes, the number of pairs that follow (up to 32767)
227 pairs, each 2 multibyte characters,
228 each multibyte character represented as 3 bytes. */
229 charset,
230
231 /* Same parameters as charset, but match any character that is
232 not one of those specified. */
233 charset_not,
234
235 /* Start remembering the text that is matched, for storing in a
236 register. Followed by one byte with the register number, in
237 the range 0 to one less than the pattern buffer's re_nsub
238 field. */
239 start_memory,
240
241 /* Stop remembering the text that is matched and store it in a
242 memory register. Followed by one byte with the register
243 number, in the range 0 to one less than 're_nsub' in the
244 pattern buffer. */
245 stop_memory,
246
247 /* Match a duplicate of something remembered. Followed by one
248 byte containing the register number. */
249 duplicate,
250
251 /* Fail unless at beginning of line. */
252 begline,
253
254 /* Fail unless at end of line. */
255 endline,
256
257 /* Succeeds if at beginning of buffer. */
258 begbuf,
259
260 /* Analogously, for end of buffer/string. */
261 endbuf,
262
263 /* Followed by two byte relative address to which to jump. */
264 jump,
265
266 /* Followed by two-byte relative address of place to resume at
267 in case of failure. */
268 on_failure_jump,
269
270 /* Like on_failure_jump, but pushes a placeholder instead of the
271 current string position when executed. */
272 on_failure_keep_string_jump,
273
274 /* Just like 'on_failure_jump', except that it checks that we
275 don't get stuck in an infinite loop (matching an empty string
276 indefinitely). */
277 on_failure_jump_loop,
278
279 /* Just like 'on_failure_jump_loop', except that it checks for
280 a different kind of loop (the kind that shows up with non-greedy
281 operators). This operation has to be immediately preceded
282 by a 'no_op'. */
283 on_failure_jump_nastyloop,
284
285 /* A smart 'on_failure_jump' used for greedy * and + operators.
286 It analyzes the loop before which it is put and if the
287 loop does not require backtracking, it changes itself to
288 'on_failure_keep_string_jump' and short-circuits the loop,
289 else it just defaults to changing itself into 'on_failure_jump'.
290 It assumes that it is pointing to just past a 'jump'. */
291 on_failure_jump_smart,
292
293 /* Followed by two-byte relative address and two-byte number n.
294 After matching N times, jump to the address upon failure.
295 Does not work if N starts at 0: use on_failure_jump_loop
296 instead. */
297 succeed_n,
298
299 /* Followed by two-byte relative address, and two-byte number n.
300 Jump to the address N times, then fail. */
301 jump_n,
302
303 /* Set the following two-byte relative address to the
304 subsequent two-byte number. The address *includes* the two
305 bytes of number. */
306 set_number_at,
307
308 wordbeg, /* Succeeds if at word beginning. */
309 wordend, /* Succeeds if at word end. */
310
311 wordbound, /* Succeeds if at a word boundary. */
312 notwordbound, /* Succeeds if not at a word boundary. */
313
314 symbeg, /* Succeeds if at symbol beginning. */
315 symend, /* Succeeds if at symbol end. */
316
317 /* Matches any character whose syntax is specified. Followed by
318 a byte which contains a syntax code, e.g., Sword. */
319 syntaxspec,
320
321 /* Matches any character whose syntax is not that specified. */
322 notsyntaxspec,
323
324 at_dot, /* Succeeds if at point. */
325
326 /* Matches any character whose category-set contains the specified
327 category. The operator is followed by a byte which contains a
328 category code (mnemonic ASCII character). */
329 categoryspec,
330
331 /* Matches any character whose category-set does not contain the
332 specified category. The operator is followed by a byte which
333 contains the category code (mnemonic ASCII character). */
334 notcategoryspec
335 } re_opcode_t;
336
337 /* Common operations on the compiled pattern. */
338
339 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
340
341 #define STORE_NUMBER(destination, number) \
342 do { \
343 (destination)[0] = (number) & 0377; \
344 (destination)[1] = (number) >> 8; \
345 } while (false)
346
347 /* Same as STORE_NUMBER, except increment DESTINATION to
348 the byte after where the number is stored. Therefore, DESTINATION
349 must be an lvalue. */
350
351 #define STORE_NUMBER_AND_INCR(destination, number) \
352 do { \
353 STORE_NUMBER (destination, number); \
354 (destination) += 2; \
355 } while (false)
356
357 /* Put into DESTINATION a number stored in two contiguous bytes starting
358 at SOURCE. */
359
360 #define EXTRACT_NUMBER(destination, source) \
361 ((destination) = extract_number (source))
362
363 static int
364 extract_number (re_char *source)
365 {
366 signed char leading_byte = source[1];
367 return leading_byte * 256 + source[0];
368 }
369
370 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
371 SOURCE must be an lvalue. */
372
373 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
374 ((destination) = extract_number_and_incr (&source))
375
376 static int
377 extract_number_and_incr (re_char **source)
378 {
379 int num = extract_number (*source);
380 *source += 2;
381 return num;
382 }
383
384 /* Store a multibyte character in three contiguous bytes starting
385 DESTINATION, and increment DESTINATION to the byte after where the
386 character is stored. Therefore, DESTINATION must be an lvalue. */
387
388 #define STORE_CHARACTER_AND_INCR(destination, character) \
389 do { \
390 (destination)[0] = (character) & 0377; \
391 (destination)[1] = ((character) >> 8) & 0377; \
392 (destination)[2] = (character) >> 16; \
393 (destination) += 3; \
394 } while (false)
395
396 /* Put into DESTINATION a character stored in three contiguous bytes
397 starting at SOURCE. */
398
399 #define EXTRACT_CHARACTER(destination, source) \
400 do { \
401 (destination) = ((source)[0] \
402 | ((source)[1] << 8) \
403 | ((source)[2] << 16)); \
404 } while (false)
405
406
407 /* Macros for charset. */
408
409 /* Size of bitmap of charset P in bytes. P is a start of charset,
410 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
411 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
412
413 /* Nonzero if charset P has range table. */
414 #define CHARSET_RANGE_TABLE_EXISTS_P(p) (((p)[1] & 0x80) != 0)
415
416 /* Return the address of range table of charset P. But not the start
417 of table itself, but the before where the number of ranges is
418 stored. '2 +' means to skip re_opcode_t and size of bitmap,
419 and the 2 bytes of flags at the start of the range table. */
420 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
421
422 /* Extract the bit flags that start a range table. */
423 #define CHARSET_RANGE_TABLE_BITS(p) \
424 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
425 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
426
427 /* Return the address of end of RANGE_TABLE. COUNT is number of
428 ranges (which is a pair of (start, end)) in the RANGE_TABLE. '* 2'
429 is start of range and end of range. '* 3' is size of each start
430 and end. */
431 #define CHARSET_RANGE_TABLE_END(range_table, count) \
432 ((range_table) + (count) * 2 * 3)
433
434 /* If REGEX_EMACS_DEBUG is defined, print many voluminous messages
435 (if the variable regex_emacs_debug is positive). */
436
437 #ifdef REGEX_EMACS_DEBUG
438
439 /* Use standard I/O for debugging. */
440 # include "sysstdio.h"
441
442 static int regex_emacs_debug = -100000;
443
444 # define DEBUG_STATEMENT(e) e
445 # define DEBUG_PRINT(...) \
446 if (regex_emacs_debug > 0) fprintf (stderr, __VA_ARGS__)
447 # define DEBUG_COMPILES_ARGUMENTS
448 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
449 if (regex_emacs_debug > 0) print_partial_compiled_pattern (s, e)
450 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
451 if (regex_emacs_debug > 0) print_double_string (w, s1, sz1, s2, sz2)
452
453 static void
454 debug_putchar (int c)
455 {
456 if (c >= 32 && c <= 126)
457 putc (c, stderr);
458 else
459 {
460 unsigned int uc = c;
461 fprintf (stderr, "{%02x}", uc);
462 }
463 }
464
465 /* Print the fastmap in human-readable form. */
466
467 static void
468 print_fastmap (char *fastmap)
469 {
470 bool was_a_range = false;
471 int i = 0;
472
473 while (i < (1 << BYTEWIDTH))
474 {
475 if (fastmap[i++])
476 {
477 was_a_range = false;
478 debug_putchar (i - 1);
479 while (i < (1 << BYTEWIDTH) && fastmap[i])
480 {
481 was_a_range = true;
482 i++;
483 }
484 if (was_a_range)
485 {
486 debug_putchar ('-');
487 debug_putchar (i - 1);
488 }
489 }
490 }
491 putc ('\n', stderr);
492 }
493
494
495 /* Print a compiled pattern string in human-readable form, starting at
496 the START pointer into it and ending just before the pointer END. */
497
498 static void
499 print_partial_compiled_pattern (re_char *start, re_char *end)
500 {
501 int mcnt, mcnt2;
502 re_char *p = start;
503 re_char *pend = end;
504
505 if (start == NULL)
506 {
507 fputs ("(null)\n", stderr);
508 return;
509 }
510
511 /* Loop over pattern commands. */
512 while (p < pend)
513 {
514 fprintf (stderr, "%td:\t", p - start);
515
516 switch ((re_opcode_t) *p++)
517 {
518 case no_op:
519 fputs ("/no_op", stderr);
520 break;
521
522 case succeed:
523 fputs ("/succeed", stderr);
524 break;
525
526 case exactn:
527 mcnt = *p++;
528 fprintf (stderr, "/exactn/%d", mcnt);
529 do
530 {
531 debug_putchar ('/');
532 debug_putchar (*p++);
533 }
534 while (--mcnt);
535 break;
536
537 case start_memory:
538 fprintf (stderr, "/start_memory/%d", *p++);
539 break;
540
541 case stop_memory:
542 fprintf (stderr, "/stop_memory/%d", *p++);
543 break;
544
545 case duplicate:
546 fprintf (stderr, "/duplicate/%d", *p++);
547 break;
548
549 case anychar:
550 fputs ("/anychar", stderr);
551 break;
552
553 case charset:
554 case charset_not:
555 {
556 int c, last = -100;
557 bool in_range = false;
558 int length = CHARSET_BITMAP_SIZE (p - 1);
559 bool has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
560
561 fprintf (stderr, "/charset [%s",
562 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
563
564 if (p + *p >= pend)
565 fputs (" !extends past end of pattern! ", stderr);
566
567 for (c = 0; c < 256; c++)
568 if (c / 8 < length
569 && (p[1 + (c/8)] & (1 << (c % 8))))
570 {
571 /* Are we starting a range? */
572 if (last + 1 == c && ! in_range)
573 {
574 debug_putchar ('-');
575 in_range = true;
576 }
577 /* Have we broken a range? */
578 else if (last + 1 != c && in_range)
579 {
580 debug_putchar (last);
581 in_range = false;
582 }
583
584 if (! in_range)
585 debug_putchar (c);
586
587 last = c;
588 }
589
590 if (in_range)
591 debug_putchar (last);
592
593 debug_putchar (']');
594
595 p += 1 + length;
596
597 if (has_range_table)
598 {
599 int count;
600 fputs ("has-range-table", stderr);
601
602 /* ??? Should print the range table; for now, just skip it. */
603 p += 2; /* skip range table bits */
604 EXTRACT_NUMBER_AND_INCR (count, p);
605 p = CHARSET_RANGE_TABLE_END (p, count);
606 }
607 }
608 break;
609
610 case begline:
611 fputs ("/begline", stderr);
612 break;
613
614 case endline:
615 fputs ("/endline", stderr);
616 break;
617
618 case on_failure_jump:
619 EXTRACT_NUMBER_AND_INCR (mcnt, p);
620 fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start);
621 break;
622
623 case on_failure_keep_string_jump:
624 EXTRACT_NUMBER_AND_INCR (mcnt, p);
625 fprintf (stderr, "/on_failure_keep_string_jump to %td",
626 p + mcnt - start);
627 break;
628
629 case on_failure_jump_nastyloop:
630 EXTRACT_NUMBER_AND_INCR (mcnt, p);
631 fprintf (stderr, "/on_failure_jump_nastyloop to %td",
632 p + mcnt - start);
633 break;
634
635 case on_failure_jump_loop:
636 EXTRACT_NUMBER_AND_INCR (mcnt, p);
637 fprintf (stderr, "/on_failure_jump_loop to %td",
638 p + mcnt - start);
639 break;
640
641 case on_failure_jump_smart:
642 EXTRACT_NUMBER_AND_INCR (mcnt, p);
643 fprintf (stderr, "/on_failure_jump_smart to %td",
644 p + mcnt - start);
645 break;
646
647 case jump:
648 EXTRACT_NUMBER_AND_INCR (mcnt, p);
649 fprintf (stderr, "/jump to %td", p + mcnt - start);
650 break;
651
652 case succeed_n:
653 EXTRACT_NUMBER_AND_INCR (mcnt, p);
654 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
655 fprintf (stderr, "/succeed_n to %td, %d times",
656 p - 2 + mcnt - start, mcnt2);
657 break;
658
659 case jump_n:
660 EXTRACT_NUMBER_AND_INCR (mcnt, p);
661 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
662 fprintf (stderr, "/jump_n to %td, %d times",
663 p - 2 + mcnt - start, mcnt2);
664 break;
665
666 case set_number_at:
667 EXTRACT_NUMBER_AND_INCR (mcnt, p);
668 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
669 fprintf (stderr, "/set_number_at location %td to %d",
670 p - 2 + mcnt - start, mcnt2);
671 break;
672
673 case wordbound:
674 fputs ("/wordbound", stderr);
675 break;
676
677 case notwordbound:
678 fputs ("/notwordbound", stderr);
679 break;
680
681 case wordbeg:
682 fputs ("/wordbeg", stderr);
683 break;
684
685 case wordend:
686 fputs ("/wordend", stderr);
687 break;
688
689 case symbeg:
690 fputs ("/symbeg", stderr);
691 break;
692
693 case symend:
694 fputs ("/symend", stderr);
695 break;
696
697 case syntaxspec:
698 fputs ("/syntaxspec", stderr);
699 mcnt = *p++;
700 fprintf (stderr, "/%d", mcnt);
701 break;
702
703 case notsyntaxspec:
704 fputs ("/notsyntaxspec", stderr);
705 mcnt = *p++;
706 fprintf (stderr, "/%d", mcnt);
707 break;
708
709 case at_dot:
710 fputs ("/at_dot", stderr);
711 break;
712
713 case categoryspec:
714 fputs ("/categoryspec", stderr);
715 mcnt = *p++;
716 fprintf (stderr, "/%d", mcnt);
717 break;
718
719 case notcategoryspec:
720 fputs ("/notcategoryspec", stderr);
721 mcnt = *p++;
722 fprintf (stderr, "/%d", mcnt);
723 break;
724
725 case begbuf:
726 fputs ("/begbuf", stderr);
727 break;
728
729 case endbuf:
730 fputs ("/endbuf", stderr);
731 break;
732
733 default:
734 fprintf (stderr, "?%d", *(p-1));
735 }
736
737 putc ('\n', stderr);
738 }
739
740 fprintf (stderr, "%td:\tend of pattern.\n", p - start);
741 }
742
743
744 static void
745 print_compiled_pattern (struct re_pattern_buffer *bufp)
746 {
747 re_char *buffer = bufp->buffer;
748
749 print_partial_compiled_pattern (buffer, buffer + bufp->used);
750 fprintf (stderr, "%td bytes used/%td bytes allocated.\n",
751 bufp->used, bufp->allocated);
752
753 if (bufp->fastmap_accurate && bufp->fastmap)
754 {
755 fputs ("fastmap: ", stderr);
756 print_fastmap (bufp->fastmap);
757 }
758
759 fprintf (stderr, "re_nsub: %td\t", bufp->re_nsub);
760 fprintf (stderr, "regs_alloc: %d\t", bufp->regs_allocated);
761 fprintf (stderr, "can_be_null: %d\n", bufp->can_be_null);
762 /* Perhaps we should print the translate table? */
763 }
764
765
766 static void
767 print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
768 re_char *string2, ptrdiff_t size2)
769 {
770 if (where == NULL)
771 fputs ("(null)", stderr);
772 else
773 {
774 int i;
775 if (FIRST_STRING_P (where))
776 {
777 for (i = 0; i < string1 + size1 - where; i++)
778 debug_putchar (where[i]);
779 where = string2;
780 }
781
782 for (i = 0; i < string2 + size2 - where; i++)
783 debug_putchar (where[i]);
784 }
785 }
786
787 #else /* not REGEX_EMACS_DEBUG */
788
789 # define DEBUG_STATEMENT(e)
790 # define DEBUG_PRINT(...)
791 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
792 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
793
794 #endif /* not REGEX_EMACS_DEBUG */
795
796 typedef enum
797 {
798 REG_NOERROR = 0, /* Success. */
799 REG_NOMATCH, /* Didn't find a match (for regexec). */
800
801 /* POSIX regcomp return error codes. (In the order listed in the
802 standard.) An older version of this code supported the POSIX
803 API; this version continues to use these names internally. */
804 REG_BADPAT, /* Invalid pattern. */
805 REG_ECOLLATE, /* Not implemented. */
806 REG_ECTYPE, /* Invalid character class name. */
807 REG_EESCAPE, /* Trailing backslash. */
808 REG_ESUBREG, /* Invalid back reference. */
809 REG_EBRACK, /* Unmatched left bracket. */
810 REG_EPAREN, /* Parenthesis imbalance. */
811 REG_EBRACE, /* Unmatched \{. */
812 REG_BADBR, /* Invalid contents of \{\}. */
813 REG_ERANGE, /* Invalid range end. */
814 REG_ESPACE, /* Ran out of memory. */
815 REG_BADRPT, /* No preceding re for repetition op. */
816
817 /* Error codes we've added. */
818 REG_EEND, /* Premature end. */
819 REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */
820 REG_ERPAREN, /* Unmatched ) or \); not returned from regcomp. */
821 REG_ERANGEX, /* Range striding over charsets. */
822 REG_ESIZEBR /* n or m too big in \{n,m\} */
823 } reg_errcode_t;
824
825 static const char *re_error_msgid[] =
826 {
827 [REG_NOERROR] = "Success",
828 [REG_NOMATCH] = "No match",
829 [REG_BADPAT] = "Invalid regular expression",
830 [REG_ECOLLATE] = "Invalid collation character",
831 [REG_ECTYPE] = "Invalid character class name",
832 [REG_EESCAPE] = "Trailing backslash",
833 [REG_ESUBREG] = "Invalid back reference",
834 [REG_EBRACK] = "Unmatched [ or [^",
835 [REG_EPAREN] = "Unmatched ( or \\(",
836 [REG_EBRACE] = "Unmatched \\{",
837 [REG_BADBR] = "Invalid content of \\{\\}",
838 [REG_ERANGE] = "Invalid range end",
839 [REG_ESPACE] = "Memory exhausted",
840 [REG_BADRPT] = "Invalid preceding regular expression",
841 [REG_EEND] = "Premature end of regular expression",
842 [REG_ESIZE] = "Regular expression too big",
843 [REG_ERPAREN] = "Unmatched ) or \\)",
844 [REG_ERANGEX ] = "Range striding over charsets",
845 [REG_ESIZEBR ] = "Invalid content of \\{\\}",
846 };
847
848 /* For 'regs_allocated'. */
849 enum { REGS_UNALLOCATED, REGS_REALLOCATE, REGS_FIXED };
850
851 /* If 'regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
852 're_match_2' returns information about at least this many registers
853 the first time a 'regs' structure is passed. */
854 enum { RE_NREGS = 30 };
855
856 /* The searching and matching functions allocate memory for the
857 failure stack and registers. Otherwise searching and matching
858 routines would have much smaller memory resources at their
859 disposal, and therefore might fail to handle complex regexps. */
860
861 /* Failure stack declarations and macros; both re_compile_fastmap and
862 re_match_2 use a failure stack. These have to be macros because of
863 SAFE_ALLOCA. */
864
865
866 /* Approximate number of failure points for which to initially allocate space
867 when matching. If this number is exceeded, we allocate more
868 space, so it is not a hard limit. */
869 #define INIT_FAILURE_ALLOC 20
870
871 /* Roughly the maximum number of failure points on the stack. Would be
872 exactly that if failure always used TYPICAL_FAILURE_SIZE items.
873 This is a variable only so users of regex can assign to it; we never
874 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
875 before using it, so it should probably be a byte-count instead. */
876 /* Note that 4400 was enough to cause a crash on Alpha OSF/1,
877 whose default stack limit is 2mb. In order for a larger
878 value to work reliably, you have to try to make it accord
879 with the process stack limit. */
880 ptrdiff_t emacs_re_max_failures = 40000;
881
882 union fail_stack_elt
883 {
884 re_char *pointer;
885 intptr_t integer;
886 };
887
888 typedef union fail_stack_elt fail_stack_elt_t;
889
890 typedef struct
891 {
892 fail_stack_elt_t *stack;
893 ptrdiff_t size;
894 ptrdiff_t avail; /* Offset of next open position. */
895 ptrdiff_t frame; /* Offset of the cur constructed frame. */
896 } fail_stack_type;
897
898 #define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
899
900
901 /* Define macros to initialize and free the failure stack. */
902
903 #define INIT_FAIL_STACK() \
904 do { \
905 fail_stack.stack = \
906 SAFE_ALLOCA (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
907 * sizeof (fail_stack_elt_t)); \
908 fail_stack.size = INIT_FAILURE_ALLOC; \
909 fail_stack.avail = 0; \
910 fail_stack.frame = 0; \
911 } while (false)
912
913
914 /* Double the size of FAIL_STACK, up to a limit
915 which allows approximately 'emacs_re_max_failures' items.
916
917 Return 1 if succeeds, and 0 if either ran out of memory
918 allocating space for it or it was already too large.
919
920 REGEX_REALLOCATE requires 'destination' be declared. */
921
922 /* Factor to increase the failure stack size by.
923 This used to be 2, but 2 was too wasteful
924 because the old discarded stacks added up to as much space
925 were as ultimate, maximum-size stack. */
926 #define FAIL_STACK_GROWTH_FACTOR 4
927
928 #define GROW_FAIL_STACK(fail_stack) \
929 (((fail_stack).size >= emacs_re_max_failures * TYPICAL_FAILURE_SIZE) \
930 ? 0 \
931 : ((fail_stack).stack \
932 = REGEX_REALLOCATE ((fail_stack).stack, \
933 (fail_stack).avail * sizeof (fail_stack_elt_t), \
934 min (emacs_re_max_failures * TYPICAL_FAILURE_SIZE, \
935 ((fail_stack).size * FAIL_STACK_GROWTH_FACTOR)) \
936 * sizeof (fail_stack_elt_t)), \
937 ((fail_stack).size \
938 = (min (emacs_re_max_failures * TYPICAL_FAILURE_SIZE, \
939 ((fail_stack).size * FAIL_STACK_GROWTH_FACTOR)))), \
940 1))
941
942
943 /* Push a pointer value onto the failure stack.
944 Assumes the variable 'fail_stack'. Probably should only
945 be called from within 'PUSH_FAILURE_POINT'. */
946 #define PUSH_FAILURE_POINTER(item) \
947 fail_stack.stack[fail_stack.avail++].pointer = (item)
948
949 /* This pushes an integer-valued item onto the failure stack.
950 Assumes the variable 'fail_stack'. Probably should only
951 be called from within 'PUSH_FAILURE_POINT'. */
952 #define PUSH_FAILURE_INT(item) \
953 fail_stack.stack[fail_stack.avail++].integer = (item)
954
955 /* These POP... operations complement the PUSH... operations.
956 All assume that 'fail_stack' is nonempty. */
957 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
958 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
959
960 /* Individual items aside from the registers. */
961 #define NUM_NONREG_ITEMS 3
962
963 /* Used to examine the stack (to detect infinite loops). */
964 #define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
965 #define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
966 #define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
967 #define TOP_FAILURE_HANDLE() fail_stack.frame
968
969
970 #define ENSURE_FAIL_STACK(space) \
971 while (REMAINING_AVAIL_SLOTS <= space) { \
972 if (!GROW_FAIL_STACK (fail_stack)) \
973 { \
974 unbind_to (count, Qnil); \
975 SAFE_FREE (); \
976 return -2; \
977 } \
978 DEBUG_PRINT ("\n Doubled stack; size now: %td\n", fail_stack.size); \
979 DEBUG_PRINT (" slots available: %td\n", REMAINING_AVAIL_SLOTS);\
980 }
981
982 /* Push register NUM onto the stack. */
983 #define PUSH_FAILURE_REG(num) \
984 do { \
985 char *destination; \
986 intptr_t n = num; \
987 eassert (0 < n && n < num_regs); \
988 eassert (REG_UNSET (regstart[n]) <= REG_UNSET (regend[n])); \
989 ENSURE_FAIL_STACK(3); \
990 DEBUG_PRINT (" Push reg %"PRIdPTR" (spanning %p -> %p)\n", \
991 n, regstart[n], regend[n]); \
992 PUSH_FAILURE_POINTER (regstart[n]); \
993 PUSH_FAILURE_POINTER (regend[n]); \
994 PUSH_FAILURE_INT (n); \
995 } while (false)
996
997 /* Change the counter's value to VAL, but make sure that it will
998 be reset when backtracking. */
999 #define PUSH_NUMBER(ptr,val) \
1000 do { \
1001 char *destination; \
1002 int c; \
1003 ENSURE_FAIL_STACK(3); \
1004 EXTRACT_NUMBER (c, ptr); \
1005 DEBUG_PRINT (" Push number %p = %d -> %d\n", ptr, c, val); \
1006 PUSH_FAILURE_INT (c); \
1007 PUSH_FAILURE_POINTER (ptr); \
1008 PUSH_FAILURE_INT (-1); \
1009 STORE_NUMBER (ptr, val); \
1010 } while (false)
1011
1012 /* Pop a saved register off the stack. */
1013 #define POP_FAILURE_REG_OR_COUNT() \
1014 do { \
1015 intptr_t pfreg = POP_FAILURE_INT (); \
1016 if (pfreg == -1) \
1017 { \
1018 /* It's a counter. */ \
1019 /* Discard 'const', making re_search non-reentrant. */ \
1020 unsigned char *ptr = (unsigned char *) POP_FAILURE_POINTER (); \
1021 pfreg = POP_FAILURE_INT (); \
1022 STORE_NUMBER (ptr, pfreg); \
1023 DEBUG_PRINT (" Pop counter %p = %"PRIdPTR"\n", ptr, pfreg); \
1024 } \
1025 else \
1026 { \
1027 eassert (0 < pfreg && pfreg < num_regs); \
1028 regend[pfreg] = POP_FAILURE_POINTER (); \
1029 regstart[pfreg] = POP_FAILURE_POINTER (); \
1030 eassert (REG_UNSET (regstart[pfreg]) <= REG_UNSET (regend[pfreg])); \
1031 DEBUG_PRINT (" Pop reg %ld (spanning %p -> %p)\n", \
1032 pfreg, regstart[pfreg], regend[pfreg]); \
1033 } \
1034 } while (false)
1035
1036 /* Check that we are not stuck in an infinite loop. */
1037 #define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1038 do { \
1039 ptrdiff_t failure = TOP_FAILURE_HANDLE (); \
1040 /* Check for infinite matching loops */ \
1041 while (failure > 0 \
1042 && (FAILURE_STR (failure) == string_place \
1043 || FAILURE_STR (failure) == NULL)) \
1044 { \
1045 eassert (FAILURE_PAT (failure) >= bufp->buffer \
1046 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
1047 if (FAILURE_PAT (failure) == pat_cur) \
1048 { \
1049 cycle = true; \
1050 break; \
1051 } \
1052 DEBUG_PRINT (" Other pattern: %p\n", FAILURE_PAT (failure)); \
1053 failure = NEXT_FAILURE_HANDLE(failure); \
1054 } \
1055 DEBUG_PRINT (" Other string: %p\n", FAILURE_STR (failure)); \
1056 } while (false)
1057
1058 /* Push the information about the state we will need
1059 if we ever fail back to it.
1060
1061 Requires variables fail_stack, regstart, regend and
1062 num_regs be declared. GROW_FAIL_STACK requires 'destination' be
1063 declared.
1064
1065 Does 'return FAILURE_CODE' if runs out of memory. */
1066
1067 #define PUSH_FAILURE_POINT(pattern, string_place) \
1068 do { \
1069 char *destination; \
1070 DEBUG_STATEMENT (nfailure_points_pushed++); \
1071 DEBUG_PRINT ("\nPUSH_FAILURE_POINT:\n"); \
1072 DEBUG_PRINT (" Before push, next avail: %td\n", fail_stack.avail); \
1073 DEBUG_PRINT (" size: %td\n", fail_stack.size); \
1074 \
1075 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1076 \
1077 DEBUG_PRINT ("\n"); \
1078 \
1079 DEBUG_PRINT (" Push frame index: %td\n", fail_stack.frame); \
1080 PUSH_FAILURE_INT (fail_stack.frame); \
1081 \
1082 DEBUG_PRINT (" Push string %p: \"", string_place); \
1083 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1084 DEBUG_PRINT ("\"\n"); \
1085 PUSH_FAILURE_POINTER (string_place); \
1086 \
1087 DEBUG_PRINT (" Push pattern %p: ", pattern); \
1088 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1089 PUSH_FAILURE_POINTER (pattern); \
1090 \
1091 /* Close the frame by moving the frame pointer past it. */ \
1092 fail_stack.frame = fail_stack.avail; \
1093 } while (false)
1094
1095 /* Estimate the size of data pushed by a typical failure stack entry.
1096 An estimate is all we need, because all we use this for
1097 is to choose a limit for how big to make the failure stack. */
1098 /* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
1099 #define TYPICAL_FAILURE_SIZE 20
1100
1101 /* How many items can still be added to the stack without overflowing it. */
1102 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1103
1104
1105 /* Pop what PUSH_FAIL_STACK pushes.
1106
1107 Restore into the parameters, all of which should be lvalues:
1108 STR -- the saved data position.
1109 PAT -- the saved pattern position.
1110 REGSTART, REGEND -- arrays of string positions.
1111
1112 Also assume the variables FAIL_STACK and (if debugging) BUFP, PEND,
1113 STRING1, SIZE1, STRING2, and SIZE2. */
1114
1115 #define POP_FAILURE_POINT(str, pat) \
1116 do { \
1117 eassert (!FAIL_STACK_EMPTY ()); \
1118 \
1119 /* Remove failure points and point to how many regs pushed. */ \
1120 DEBUG_PRINT ("POP_FAILURE_POINT:\n"); \
1121 DEBUG_PRINT (" Before pop, next avail: %td\n", fail_stack.avail); \
1122 DEBUG_PRINT (" size: %td\n", fail_stack.size); \
1123 \
1124 /* Pop the saved registers. */ \
1125 while (fail_stack.frame < fail_stack.avail) \
1126 POP_FAILURE_REG_OR_COUNT (); \
1127 \
1128 pat = POP_FAILURE_POINTER (); \
1129 DEBUG_PRINT (" Popping pattern %p: ", pat); \
1130 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1131 \
1132 /* If the saved string location is NULL, it came from an \
1133 on_failure_keep_string_jump opcode, and we want to throw away the \
1134 saved NULL, thus retaining our current position in the string. */ \
1135 str = POP_FAILURE_POINTER (); \
1136 DEBUG_PRINT (" Popping string %p: \"", str); \
1137 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1138 DEBUG_PRINT ("\"\n"); \
1139 \
1140 fail_stack.frame = POP_FAILURE_INT (); \
1141 DEBUG_PRINT (" Popping frame index: %td\n", fail_stack.frame); \
1142 \
1143 eassert (fail_stack.avail >= 0); \
1144 eassert (fail_stack.frame <= fail_stack.avail); \
1145 \
1146 DEBUG_STATEMENT (nfailure_points_popped++); \
1147 } while (false) /* POP_FAILURE_POINT */
1148
1149
1150
1151 /* Registers are set to a sentinel when they haven't yet matched. */
1152 #define REG_UNSET(e) ((e) == NULL)
1153
1154 /* Subroutine declarations and macros for regex_compile. */
1155
1156 static reg_errcode_t regex_compile (re_char *pattern, ptrdiff_t size,
1157 bool posix_backtracking,
1158 const char *whitespace_regexp,
1159 struct re_pattern_buffer *bufp);
1160 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1161 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1162 static void insert_op1 (re_opcode_t op, unsigned char *loc,
1163 int arg, unsigned char *end);
1164 static void insert_op2 (re_opcode_t op, unsigned char *loc,
1165 int arg1, int arg2, unsigned char *end);
1166 static bool at_begline_loc_p (re_char *pattern, re_char *p);
1167 static bool at_endline_loc_p (re_char *p, re_char *pend);
1168 static re_char *skip_one_char (re_char *p);
1169 static int analyze_first (re_char *p, re_char *pend,
1170 char *fastmap, bool multibyte);
1171
1172 /* Fetch the next character in the uncompiled pattern, with no
1173 translation. */
1174 #define PATFETCH(c) \
1175 do { \
1176 int len; \
1177 if (p == pend) return REG_EEND; \
1178 c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte); \
1179 p += len; \
1180 } while (false)
1181
1182
1183 #define RE_TRANSLATE(TBL, C) char_table_translate (TBL, C)
1184 #define TRANSLATE(d) (!NILP (translate) ? RE_TRANSLATE (translate, d) : (d))
1185
1186 /* Macros for outputting the compiled pattern into 'buffer'. */
1187
1188 /* If the buffer isn't allocated when it comes in, use this. */
1189 #define INIT_BUF_SIZE 32
1190
1191 /* Ensure at least N more bytes of space in buffer. */
1192 #define GET_BUFFER_SPACE(n) \
1193 if (bufp->buffer + bufp->allocated - b < (n)) \
1194 EXTEND_BUFFER ((n) - (bufp->buffer + bufp->allocated - b))
1195
1196 /* Ensure one more byte of buffer space and then add C to it. */
1197 #define BUF_PUSH(c) \
1198 do { \
1199 GET_BUFFER_SPACE (1); \
1200 *b++ = (unsigned char) (c); \
1201 } while (false)
1202
1203
1204 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1205 #define BUF_PUSH_2(c1, c2) \
1206 do { \
1207 GET_BUFFER_SPACE (2); \
1208 *b++ = (unsigned char) (c1); \
1209 *b++ = (unsigned char) (c2); \
1210 } while (false)
1211
1212
1213 /* Store a jump with opcode OP at LOC to location TO. Store a
1214 relative address offset by the three bytes the jump itself occupies. */
1215 #define STORE_JUMP(op, loc, to) \
1216 store_op1 (op, loc, (to) - (loc) - 3)
1217
1218 /* Likewise, for a two-argument jump. */
1219 #define STORE_JUMP2(op, loc, to, arg) \
1220 store_op2 (op, loc, (to) - (loc) - 3, arg)
1221
1222 /* Like 'STORE_JUMP', but for inserting. Assume B is the buffer end. */
1223 #define INSERT_JUMP(op, loc, to) \
1224 insert_op1 (op, loc, (to) - (loc) - 3, b)
1225
1226 /* Like 'STORE_JUMP2', but for inserting. Assume B is the buffer end. */
1227 #define INSERT_JUMP2(op, loc, to, arg) \
1228 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1229
1230
1231 /* This is not an arbitrary limit: the arguments which represent offsets
1232 into the pattern are two bytes long. So if 2^15 bytes turns out to
1233 be too small, many things would have to change. */
1234 # define MAX_BUF_SIZE (1 << 15)
1235
1236 /* Extend the buffer by at least N bytes via realloc and
1237 reset the pointers that pointed into the old block to point to the
1238 correct places in the new one. If extending the buffer results in it
1239 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1240 #define EXTEND_BUFFER(n) \
1241 do { \
1242 ptrdiff_t requested_extension = n; \
1243 unsigned char *old_buffer = bufp->buffer; \
1244 if (MAX_BUF_SIZE - bufp->allocated < requested_extension) \
1245 return REG_ESIZE; \
1246 ptrdiff_t b_off = b - old_buffer; \
1247 ptrdiff_t begalt_off = begalt - old_buffer; \
1248 ptrdiff_t fixup_alt_jump_off = \
1249 fixup_alt_jump ? fixup_alt_jump - old_buffer : -1; \
1250 ptrdiff_t laststart_off = laststart ? laststart - old_buffer : -1; \
1251 ptrdiff_t pending_exact_off = \
1252 pending_exact ? pending_exact - old_buffer : -1; \
1253 bufp->buffer = xpalloc (bufp->buffer, &bufp->allocated, \
1254 requested_extension, MAX_BUF_SIZE, 1); \
1255 unsigned char *new_buffer = bufp->buffer; \
1256 b = new_buffer + b_off; \
1257 begalt = new_buffer + begalt_off; \
1258 if (0 <= fixup_alt_jump_off) \
1259 fixup_alt_jump = new_buffer + fixup_alt_jump_off; \
1260 if (0 <= laststart_off) \
1261 laststart = new_buffer + laststart_off; \
1262 if (0 <= pending_exact_off) \
1263 pending_exact = new_buffer + pending_exact_off; \
1264 } while (false)
1265
1266
1267 /* Since we have one byte reserved for the register number argument to
1268 {start,stop}_memory, the maximum number of groups we can report
1269 things about is what fits in that byte. */
1270 #define MAX_REGNUM 255
1271
1272 /* But patterns can have more than 'MAX_REGNUM' registers. Just
1273 ignore the excess. */
1274 typedef int regnum_t;
1275
1276
1277 /* Macros for the compile stack. */
1278
1279 typedef long pattern_offset_t;
1280 verify (LONG_MIN <= -(MAX_BUF_SIZE - 1) && MAX_BUF_SIZE - 1 <= LONG_MAX);
1281
1282 typedef struct
1283 {
1284 pattern_offset_t begalt_offset;
1285 pattern_offset_t fixup_alt_jump;
1286 pattern_offset_t laststart_offset;
1287 regnum_t regnum;
1288 } compile_stack_elt_t;
1289
1290
1291 typedef struct
1292 {
1293 compile_stack_elt_t *stack;
1294 ptrdiff_t size;
1295 ptrdiff_t avail; /* Offset of next open position. */
1296 } compile_stack_type;
1297
1298
1299 #define INIT_COMPILE_STACK_SIZE 32
1300
1301 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1302 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1303
1304 /* The next available element. */
1305 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1306
1307 /* Structure to manage work area for range table. */
1308 struct range_table_work_area
1309 {
1310 int *table; /* actual work area. */
1311 int allocated; /* allocated size for work area in bytes. */
1312 int used; /* actually used size in words. */
1313 int bits; /* flag to record character classes */
1314 };
1315
1316 /* Make sure that WORK_AREA can hold N more multibyte characters.
1317 If it can't get the space, it returns from the surrounding function. */
1318
1319 #define EXTEND_RANGE_TABLE(work_area, n) \
1320 do { \
1321 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1322 { \
1323 extend_range_table_work_area (&work_area); \
1324 if ((work_area).table == 0) \
1325 return (REG_ESPACE); \
1326 } \
1327 } while (false)
1328
1329 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1330 (work_area).bits |= (bit)
1331
1332 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1333 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1334 do { \
1335 EXTEND_RANGE_TABLE ((work_area), 2); \
1336 (work_area).table[(work_area).used++] = (range_start); \
1337 (work_area).table[(work_area).used++] = (range_end); \
1338 } while (false)
1339
1340 /* Free allocated memory for WORK_AREA. */
1341 #define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1342 do { \
1343 if ((work_area).table) \
1344 xfree ((work_area).table); \
1345 } while (false)
1346
1347 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) \
1348 ((work_area).used = 0, (work_area).bits = 0)
1349 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1350 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1351 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1352
1353 /* Bits used to implement the multibyte-part of the various character classes
1354 such as [:alnum:] in a charset's range table. The code currently assumes
1355 that only the low 16 bits are used. */
1356 #define BIT_WORD 0x1
1357 #define BIT_LOWER 0x2
1358 #define BIT_PUNCT 0x4
1359 #define BIT_SPACE 0x8
1360 #define BIT_UPPER 0x10
1361 #define BIT_MULTIBYTE 0x20
1362 #define BIT_ALPHA 0x40
1363 #define BIT_ALNUM 0x80
1364 #define BIT_GRAPH 0x100
1365 #define BIT_PRINT 0x200
1366 #define BIT_BLANK 0x400
1367
1368
1369 /* Set the bit for character C in a list. */
1370 #define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
1371
1372
1373 /* Store characters in the range FROM to TO in the bitmap at B (for
1374 ASCII and unibyte characters) and WORK_AREA (for multibyte
1375 characters) while translating them and paying attention to the
1376 continuity of translated characters.
1377
1378 Implementation note: It is better to implement these fairly big
1379 macros by a function, but it's not that easy because macros called
1380 in this macro assume various local variables already declared. */
1381
1382 /* Both FROM and TO are ASCII characters. */
1383
1384 #define SETUP_ASCII_RANGE(work_area, FROM, TO) \
1385 do { \
1386 int C0, C1; \
1387 \
1388 for (C0 = (FROM); C0 <= (TO); C0++) \
1389 { \
1390 C1 = TRANSLATE (C0); \
1391 if (! ASCII_CHAR_P (C1)) \
1392 { \
1393 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
1394 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
1395 C1 = C0; \
1396 } \
1397 SET_LIST_BIT (C1); \
1398 } \
1399 } while (false)
1400
1401
1402 /* Both FROM and TO are unibyte characters (0x80..0xFF). */
1403
1404 #define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
1405 do { \
1406 int C0, C1, C2, I; \
1407 int USED = RANGE_TABLE_WORK_USED (work_area); \
1408 \
1409 for (C0 = (FROM); C0 <= (TO); C0++) \
1410 { \
1411 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
1412 if (CHAR_BYTE8_P (C1)) \
1413 SET_LIST_BIT (C0); \
1414 else \
1415 { \
1416 C2 = TRANSLATE (C1); \
1417 if (C2 == C1 \
1418 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
1419 C1 = C0; \
1420 SET_LIST_BIT (C1); \
1421 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1422 { \
1423 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1424 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1425 \
1426 if (C2 >= from - 1 && C2 <= to + 1) \
1427 { \
1428 if (C2 == from - 1) \
1429 RANGE_TABLE_WORK_ELT (work_area, I)--; \
1430 else if (C2 == to + 1) \
1431 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
1432 break; \
1433 } \
1434 } \
1435 if (I < USED) \
1436 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
1437 } \
1438 } \
1439 } while (false)
1440
1441
1442 /* Both FROM and TO are multibyte characters. */
1443
1444 #define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
1445 do { \
1446 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
1447 \
1448 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
1449 for (C0 = (FROM); C0 <= (TO); C0++) \
1450 { \
1451 C1 = TRANSLATE (C0); \
1452 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
1453 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
1454 SET_LIST_BIT (C2); \
1455 if (C1 >= (FROM) && C1 <= (TO)) \
1456 continue; \
1457 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1458 { \
1459 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1460 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1461 \
1462 if (C1 >= from - 1 && C1 <= to + 1) \
1463 { \
1464 if (C1 == from - 1) \
1465 RANGE_TABLE_WORK_ELT (work_area, I)--; \
1466 else if (C1 == to + 1) \
1467 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
1468 break; \
1469 } \
1470 } \
1471 if (I < USED) \
1472 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
1473 } \
1474 } while (false)
1475
1476 /* Get the next unsigned number in the uncompiled pattern. */
1477 #define GET_INTERVAL_COUNT(num) \
1478 do { \
1479 if (p == pend) \
1480 FREE_STACK_RETURN (REG_EBRACE); \
1481 else \
1482 { \
1483 PATFETCH (c); \
1484 while ('0' <= c && c <= '9') \
1485 { \
1486 if (num < 0) \
1487 num = 0; \
1488 if (RE_DUP_MAX / 10 - (RE_DUP_MAX % 10 < c - '0') < num) \
1489 FREE_STACK_RETURN (REG_ESIZEBR); \
1490 num = num * 10 + c - '0'; \
1491 if (p == pend) \
1492 FREE_STACK_RETURN (REG_EBRACE); \
1493 PATFETCH (c); \
1494 } \
1495 } \
1496 } while (false)
1497
1498 /* Parse a character class, i.e. string such as "[:name:]". *strp
1499 points to the string to be parsed and limit is length, in bytes, of
1500 that string.
1501
1502 If *strp point to a string that begins with "[:name:]", where name is
1503 a non-empty sequence of lower case letters, *strp will be advanced past the
1504 closing square bracket and RECC_* constant which maps to the name will be
1505 returned. If name is not a valid character class name zero, or RECC_ERROR,
1506 is returned.
1507
1508 Otherwise, if *strp doesn't begin with "[:name:]", -1 is returned.
1509
1510 The function can be used on ASCII and multibyte (UTF-8-encoded) strings.
1511 */
1512 re_wctype_t
1513 re_wctype_parse (const unsigned char **strp, ptrdiff_t limit)
1514 {
1515 const char *beg = (const char *)*strp, *it;
1516
1517 if (limit < 4 || beg[0] != '[' || beg[1] != ':')
1518 return -1;
1519
1520 beg += 2; /* skip opening "[:" */
1521 limit -= 3; /* opening "[:" and half of closing ":]"; --limit handles rest */
1522 for (it = beg; it[0] != ':' || it[1] != ']'; ++it)
1523 if (!--limit)
1524 return -1;
1525
1526 *strp = (const unsigned char *)(it + 2);
1527
1528 /* Sort tests in the length=five case by frequency the classes to minimize
1529 number of times we fail the comparison. The frequencies of character class
1530 names used in Emacs sources as of 2016-07-27:
1531
1532 $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + |
1533 sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr
1534 213 [:alnum:]
1535 104 [:alpha:]
1536 62 [:space:]
1537 39 [:digit:]
1538 36 [:blank:]
1539 26 [:word:]
1540 26 [:upper:]
1541 21 [:lower:]
1542 10 [:xdigit:]
1543 10 [:punct:]
1544 10 [:ascii:]
1545 4 [:nonascii:]
1546 4 [:graph:]
1547 2 [:print:]
1548 2 [:cntrl:]
1549 1 [:ff:]
1550
1551 If you update this list, consider also updating chain of or'ed conditions
1552 in execute_charset function.
1553 */
1554
1555 switch (it - beg) {
1556 case 4:
1557 if (!memcmp (beg, "word", 4)) return RECC_WORD;
1558 break;
1559 case 5:
1560 if (!memcmp (beg, "alnum", 5)) return RECC_ALNUM;
1561 if (!memcmp (beg, "alpha", 5)) return RECC_ALPHA;
1562 if (!memcmp (beg, "space", 5)) return RECC_SPACE;
1563 if (!memcmp (beg, "digit", 5)) return RECC_DIGIT;
1564 if (!memcmp (beg, "blank", 5)) return RECC_BLANK;
1565 if (!memcmp (beg, "upper", 5)) return RECC_UPPER;
1566 if (!memcmp (beg, "lower", 5)) return RECC_LOWER;
1567 if (!memcmp (beg, "punct", 5)) return RECC_PUNCT;
1568 if (!memcmp (beg, "ascii", 5)) return RECC_ASCII;
1569 if (!memcmp (beg, "graph", 5)) return RECC_GRAPH;
1570 if (!memcmp (beg, "print", 5)) return RECC_PRINT;
1571 if (!memcmp (beg, "cntrl", 5)) return RECC_CNTRL;
1572 break;
1573 case 6:
1574 if (!memcmp (beg, "xdigit", 6)) return RECC_XDIGIT;
1575 break;
1576 case 7:
1577 if (!memcmp (beg, "unibyte", 7)) return RECC_UNIBYTE;
1578 break;
1579 case 8:
1580 if (!memcmp (beg, "nonascii", 8)) return RECC_NONASCII;
1581 break;
1582 case 9:
1583 if (!memcmp (beg, "multibyte", 9)) return RECC_MULTIBYTE;
1584 break;
1585 }
1586
1587 return RECC_ERROR;
1588 }
1589
1590 /* True if CH is in the char class CC. */
1591 bool
1592 re_iswctype (int ch, re_wctype_t cc)
1593 {
1594 switch (cc)
1595 {
1596 case RECC_ALNUM: return ISALNUM (ch) != 0;
1597 case RECC_ALPHA: return ISALPHA (ch) != 0;
1598 case RECC_BLANK: return ISBLANK (ch) != 0;
1599 case RECC_CNTRL: return ISCNTRL (ch) != 0;
1600 case RECC_DIGIT: return ISDIGIT (ch) != 0;
1601 case RECC_GRAPH: return ISGRAPH (ch) != 0;
1602 case RECC_LOWER: return ISLOWER (ch) != 0;
1603 case RECC_PRINT: return ISPRINT (ch) != 0;
1604 case RECC_PUNCT: return ISPUNCT (ch) != 0;
1605 case RECC_SPACE: return ISSPACE (ch) != 0;
1606 case RECC_UPPER: return ISUPPER (ch) != 0;
1607 case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
1608 case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
1609 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
1610 case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
1611 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
1612 case RECC_WORD: return ISWORD (ch) != 0;
1613 case RECC_ERROR: return false;
1614 default:
1615 abort ();
1616 }
1617 }
1618
1619 /* Return a bit-pattern to use in the range-table bits to match multibyte
1620 chars of class CC. */
1621 static int
1622 re_wctype_to_bit (re_wctype_t cc)
1623 {
1624 switch (cc)
1625 {
1626 case RECC_NONASCII:
1627 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
1628 case RECC_ALPHA: return BIT_ALPHA;
1629 case RECC_ALNUM: return BIT_ALNUM;
1630 case RECC_WORD: return BIT_WORD;
1631 case RECC_LOWER: return BIT_LOWER;
1632 case RECC_UPPER: return BIT_UPPER;
1633 case RECC_PUNCT: return BIT_PUNCT;
1634 case RECC_SPACE: return BIT_SPACE;
1635 case RECC_GRAPH: return BIT_GRAPH;
1636 case RECC_PRINT: return BIT_PRINT;
1637 case RECC_BLANK: return BIT_BLANK;
1638 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
1639 case RECC_UNIBYTE: case RECC_ERROR: return 0;
1640 default:
1641 abort ();
1642 }
1643 }
1644
1645 /* Filling in the work area of a range. */
1646
1647 /* Actually extend the space in WORK_AREA. */
1648
1649 static void
1650 extend_range_table_work_area (struct range_table_work_area *work_area)
1651 {
1652 work_area->allocated += 16 * sizeof (int);
1653 work_area->table = xrealloc (work_area->table, work_area->allocated);
1654 }
1655
1656 /* regex_compile and helpers. */
1657
1658 static bool group_in_compile_stack (compile_stack_type, regnum_t);
1659
1660 /* Insert the 'jump' from the end of last alternative to "here".
1661 The space for the jump has already been allocated. */
1662 #define FIXUP_ALT_JUMP() \
1663 do { \
1664 if (fixup_alt_jump) \
1665 STORE_JUMP (jump, fixup_alt_jump, b); \
1666 } while (false)
1667
1668
1669 /* Return, freeing storage we allocated. */
1670 #define FREE_STACK_RETURN(value) \
1671 do { \
1672 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
1673 xfree (compile_stack.stack); \
1674 return value; \
1675 } while (false)
1676
1677 /* Compile PATTERN (of length SIZE) according to SYNTAX.
1678 Return a nonzero error code on failure, or zero for success.
1679
1680 If WHITESPACE_REGEXP is given, use it instead of a space
1681 character in PATTERN.
1682
1683 Assume the 'allocated' (and perhaps 'buffer') and 'translate'
1684 fields are set in BUFP on entry.
1685
1686 If successful, put results in *BUFP (otherwise the
1687 contents of *BUFP are undefined):
1688 'buffer' is the compiled pattern;
1689 'syntax' is set to SYNTAX;
1690 'used' is set to the length of the compiled pattern;
1691 'fastmap_accurate' is false;
1692 're_nsub' is the number of subexpressions in PATTERN;
1693
1694 The 'fastmap' field is neither examined nor set. */
1695
1696 static reg_errcode_t
1697 regex_compile (re_char *pattern, ptrdiff_t size,
1698 bool posix_backtracking,
1699 const char *whitespace_regexp,
1700 struct re_pattern_buffer *bufp)
1701 {
1702 /* Fetch characters from PATTERN here. */
1703 int c, c1;
1704
1705 /* Points to the end of the buffer, where we should append. */
1706 unsigned char *b;
1707
1708 /* Keeps track of unclosed groups. */
1709 compile_stack_type compile_stack;
1710
1711 /* Points to the current (ending) position in the pattern. */
1712 re_char *p = pattern;
1713 re_char *pend = pattern + size;
1714
1715 /* How to translate the characters in the pattern. */
1716 Lisp_Object translate = bufp->translate;
1717
1718 /* Address of the count-byte of the most recently inserted 'exactn'
1719 command. This makes it possible to tell if a new exact-match
1720 character can be added to that command or if the character requires
1721 a new 'exactn' command. */
1722 unsigned char *pending_exact = 0;
1723
1724 /* Address of start of the most recently finished expression.
1725 This tells, e.g., postfix * where to find the start of its
1726 operand. Reset at the beginning of groups and alternatives. */
1727 unsigned char *laststart = 0;
1728
1729 /* Address of beginning of regexp, or inside of last group. */
1730 unsigned char *begalt;
1731
1732 /* Place in the uncompiled pattern (i.e., the {) to
1733 which to go back if the interval is invalid. */
1734 re_char *beg_interval;
1735
1736 /* Address of the place where a forward jump should go to the end of
1737 the containing expression. Each alternative of an 'or' -- except the
1738 last -- ends with a forward jump of this sort. */
1739 unsigned char *fixup_alt_jump = 0;
1740
1741 /* Work area for range table of charset. */
1742 struct range_table_work_area range_table_work;
1743
1744 /* If the regular expression is multibyte. */
1745 bool multibyte = RE_MULTIBYTE_P (bufp);
1746
1747 /* Nonzero if we have pushed down into a subpattern. */
1748 int in_subpattern = 0;
1749
1750 /* These hold the values of p, pattern, and pend from the main
1751 pattern when we have pushed into a subpattern. */
1752 re_char *main_p;
1753 re_char *main_pattern;
1754 re_char *main_pend;
1755
1756 #ifdef REGEX_EMACS_DEBUG
1757 regex_emacs_debug++;
1758 DEBUG_PRINT ("\nCompiling pattern: ");
1759 if (regex_emacs_debug > 0)
1760 {
1761 for (ptrdiff_t debug_count = 0; debug_count < size; debug_count++)
1762 debug_putchar (pattern[debug_count]);
1763 putc ('\n', stderr);
1764 }
1765 #endif
1766
1767 /* Initialize the compile stack. */
1768 compile_stack.stack = xmalloc (INIT_COMPILE_STACK_SIZE
1769 * sizeof *compile_stack.stack);
1770 __lsan_ignore_object (compile_stack.stack);
1771 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1772 compile_stack.avail = 0;
1773
1774 range_table_work.table = 0;
1775 range_table_work.allocated = 0;
1776
1777 /* Initialize the pattern buffer. */
1778 bufp->fastmap_accurate = false;
1779 bufp->used_syntax = false;
1780
1781 /* Set 'used' to zero, so that if we return an error, the pattern
1782 printer (for debugging) will think there's no pattern. We reset it
1783 at the end. */
1784 bufp->used = 0;
1785
1786 bufp->re_nsub = 0;
1787
1788 if (bufp->allocated == 0)
1789 {
1790 /* This loses if BUFP->buffer is bogus, but that is the user's
1791 responsibility. */
1792 bufp->buffer = xrealloc (bufp->buffer, INIT_BUF_SIZE);
1793 bufp->allocated = INIT_BUF_SIZE;
1794 }
1795
1796 begalt = b = bufp->buffer;
1797
1798 /* Loop through the uncompiled pattern until we're at the end. */
1799 while (1)
1800 {
1801 if (p == pend)
1802 {
1803 /* If this is the end of an included regexp,
1804 pop back to the main regexp and try again. */
1805 if (in_subpattern)
1806 {
1807 in_subpattern = 0;
1808 pattern = main_pattern;
1809 p = main_p;
1810 pend = main_pend;
1811 continue;
1812 }
1813 /* If this is the end of the main regexp, we are done. */
1814 break;
1815 }
1816
1817 PATFETCH (c);
1818
1819 switch (c)
1820 {
1821 case ' ':
1822 {
1823 re_char *p1 = p;
1824
1825 /* If there's no special whitespace regexp, treat
1826 spaces normally. And don't try to do this recursively. */
1827 if (!whitespace_regexp || in_subpattern)
1828 goto normal_char;
1829
1830 /* Peek past following spaces. */
1831 while (p1 != pend)
1832 {
1833 if (*p1 != ' ')
1834 break;
1835 p1++;
1836 }
1837 /* If the spaces are followed by a repetition op,
1838 treat them normally. */
1839 if (p1 != pend
1840 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
1841 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
1842 goto normal_char;
1843
1844 /* Replace the spaces with the whitespace regexp. */
1845 in_subpattern = 1;
1846 main_p = p1;
1847 main_pend = pend;
1848 main_pattern = pattern;
1849 p = pattern = (re_char *) whitespace_regexp;
1850 pend = p + strlen (whitespace_regexp);
1851 break;
1852 }
1853
1854 case '^':
1855 if (! (p == pattern + 1 || at_begline_loc_p (pattern, p)))
1856 goto normal_char;
1857 BUF_PUSH (begline);
1858 break;
1859
1860 case '$':
1861 if (! (p == pend || at_endline_loc_p (p, pend)))
1862 goto normal_char;
1863 BUF_PUSH (endline);
1864 break;
1865
1866
1867 case '+':
1868 case '?':
1869 case '*':
1870 /* If there is no previous pattern... */
1871 if (!laststart)
1872 goto normal_char;
1873
1874 {
1875 /* 1 means zero (many) matches is allowed. */
1876 bool zero_times_ok = false, many_times_ok = false;
1877 bool greedy = true;
1878
1879 /* If there is a sequence of repetition chars, collapse it
1880 down to just one (the right one). We can't combine
1881 interval operators with these because of, e.g., 'a{2}*',
1882 which should only match an even number of 'a's. */
1883
1884 for (;;)
1885 {
1886 if (c == '?' && (zero_times_ok || many_times_ok))
1887 greedy = false;
1888 else
1889 {
1890 zero_times_ok |= c != '+';
1891 many_times_ok |= c != '?';
1892 }
1893
1894 if (! (p < pend && (*p == '*' || *p == '+' || *p == '?')))
1895 break;
1896 /* If we get here, we found another repeat character. */
1897 c = *p++;
1898 }
1899
1900 /* Star, etc. applied to an empty pattern is equivalent
1901 to an empty pattern. */
1902 if (!laststart || laststart == b)
1903 break;
1904
1905 /* Now we know whether or not zero matches is allowed
1906 and also whether or not two or more matches is allowed. */
1907 if (greedy)
1908 {
1909 if (many_times_ok)
1910 {
1911 bool simple = skip_one_char (laststart) == b;
1912 ptrdiff_t startoffset = 0;
1913 re_opcode_t ofj =
1914 /* Check if the loop can match the empty string. */
1915 (simple || !analyze_first (laststart, b, NULL, false))
1916 ? on_failure_jump : on_failure_jump_loop;
1917 eassert (skip_one_char (laststart) <= b);
1918
1919 if (!zero_times_ok && simple)
1920 { /* Since simple * loops can be made faster by using
1921 on_failure_keep_string_jump, we turn simple P+
1922 into PP* if P is simple. */
1923 unsigned char *p1, *p2;
1924 startoffset = b - laststart;
1925 GET_BUFFER_SPACE (startoffset);
1926 p1 = b; p2 = laststart;
1927 while (p2 < p1)
1928 *b++ = *p2++;
1929 zero_times_ok = 1;
1930 }
1931
1932 GET_BUFFER_SPACE (6);
1933 if (!zero_times_ok)
1934 /* A + loop. */
1935 STORE_JUMP (ofj, b, b + 6);
1936 else
1937 /* Simple * loops can use on_failure_keep_string_jump
1938 depending on what follows. But since we don't know
1939 that yet, we leave the decision up to
1940 on_failure_jump_smart. */
1941 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
1942 laststart + startoffset, b + 6);
1943 b += 3;
1944 STORE_JUMP (jump, b, laststart + startoffset);
1945 b += 3;
1946 }
1947 else
1948 {
1949 /* A simple ? pattern. */
1950 eassert (zero_times_ok);
1951 GET_BUFFER_SPACE (3);
1952 INSERT_JUMP (on_failure_jump, laststart, b + 3);
1953 b += 3;
1954 }
1955 }
1956 else /* not greedy */
1957 { /* I wish the greedy and non-greedy cases could be merged. */
1958
1959 GET_BUFFER_SPACE (7); /* We might use less. */
1960 if (many_times_ok)
1961 {
1962 bool emptyp = !!analyze_first (laststart, b, NULL, false);
1963
1964 /* The non-greedy multiple match looks like
1965 a repeat..until: we only need a conditional jump
1966 at the end of the loop. */
1967 if (emptyp) BUF_PUSH (no_op);
1968 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
1969 : on_failure_jump, b, laststart);
1970 b += 3;
1971 if (zero_times_ok)
1972 {
1973 /* The repeat...until naturally matches one or more.
1974 To also match zero times, we need to first jump to
1975 the end of the loop (its conditional jump). */
1976 INSERT_JUMP (jump, laststart, b);
1977 b += 3;
1978 }
1979 }
1980 else
1981 {
1982 /* non-greedy a?? */
1983 INSERT_JUMP (jump, laststart, b + 3);
1984 b += 3;
1985 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
1986 b += 3;
1987 }
1988 }
1989 }
1990 pending_exact = 0;
1991 break;
1992
1993
1994 case '.':
1995 laststart = b;
1996 BUF_PUSH (anychar);
1997 break;
1998
1999
2000 case '[':
2001 {
2002 re_char *p1;
2003
2004 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2005
2006 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2007
2008 /* Ensure that we have enough space to push a charset: the
2009 opcode, the length count, and the bitset; 34 bytes in all. */
2010 GET_BUFFER_SPACE (34);
2011
2012 laststart = b;
2013
2014 /* Test '*p == '^' twice, instead of using an if
2015 statement, so we need only one BUF_PUSH. */
2016 BUF_PUSH (*p == '^' ? charset_not : charset);
2017 if (*p == '^')
2018 p++;
2019
2020 /* Remember the first position in the bracket expression. */
2021 p1 = p;
2022
2023 /* Push the number of bytes in the bitmap. */
2024 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2025
2026 /* Clear the whole map. */
2027 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2028
2029 /* Read in characters and ranges, setting map bits. */
2030 for (;;)
2031 {
2032 const unsigned char *p2 = p;
2033 int ch;
2034
2035 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2036
2037 /* See if we're at the beginning of a possible character
2038 class. */
2039 re_wctype_t cc = re_wctype_parse (&p, pend - p);
2040 if (cc != -1)
2041 {
2042 if (cc == 0)
2043 FREE_STACK_RETURN (REG_ECTYPE);
2044
2045 if (p == pend)
2046 FREE_STACK_RETURN (REG_EBRACK);
2047
2048 /* Most character classes in a multibyte match just set
2049 a flag. Exceptions are is_blank, is_digit, is_cntrl, and
2050 is_xdigit, since they can only match ASCII characters.
2051 We don't need to handle them for multibyte. */
2052
2053 /* Setup the gl_state object to its buffer-defined value.
2054 This hardcodes the buffer-global syntax-table for ASCII
2055 chars, while the other chars will obey syntax-table
2056 properties. It's not ideal, but it's the way it's been
2057 done until now. */
2058 SETUP_BUFFER_SYNTAX_TABLE ();
2059
2060 for (c = 0; c < 0x80; ++c)
2061 if (re_iswctype (c, cc))
2062 {
2063 SET_LIST_BIT (c);
2064 c1 = TRANSLATE (c);
2065 if (c1 == c)
2066 continue;
2067 if (ASCII_CHAR_P (c1))
2068 SET_LIST_BIT (c1);
2069 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2070 SET_LIST_BIT (c1);
2071 }
2072 SET_RANGE_TABLE_WORK_AREA_BIT
2073 (range_table_work, re_wctype_to_bit (cc));
2074
2075 /* In most cases the matching rule for char classes only
2076 uses the syntax table for multibyte chars, so that the
2077 content of the syntax-table is not hardcoded in the
2078 range_table. SPACE and WORD are the two exceptions. */
2079 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2080 bufp->used_syntax = true;
2081
2082 /* Repeat the loop. */
2083 continue;
2084 }
2085
2086 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2087 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2088 So the translation is done later in a loop. Example:
2089 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
2090 PATFETCH (c);
2091
2092 /* Could be the end of the bracket expression. If it's
2093 not (i.e., when the bracket expression is '[]' so
2094 far), the ']' character bit gets set way below. */
2095 if (c == ']' && p2 != p1)
2096 break;
2097
2098 if (p < pend && p[0] == '-' && p[1] != ']')
2099 {
2100
2101 /* Discard the '-'. */
2102 PATFETCH (c1);
2103
2104 /* Fetch the character which ends the range. */
2105 PATFETCH (c1);
2106
2107 if (CHAR_BYTE8_P (c1)
2108 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
2109 /* Treat the range from a multibyte character to
2110 raw-byte character as empty. */
2111 c = c1 + 1;
2112 }
2113 else
2114 /* Range from C to C. */
2115 c1 = c;
2116
2117 if (c <= c1)
2118 {
2119 if (c < 128)
2120 {
2121 ch = min (127, c1);
2122 SETUP_ASCII_RANGE (range_table_work, c, ch);
2123 c = ch + 1;
2124 if (CHAR_BYTE8_P (c1))
2125 c = BYTE8_TO_CHAR (128);
2126 }
2127 if (c <= c1)
2128 {
2129 if (CHAR_BYTE8_P (c))
2130 {
2131 c = CHAR_TO_BYTE8 (c);
2132 c1 = CHAR_TO_BYTE8 (c1);
2133 for (; c <= c1; c++)
2134 SET_LIST_BIT (c);
2135 }
2136 else if (multibyte)
2137 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
2138 else
2139 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
2140 }
2141 }
2142 }
2143
2144 /* Discard any (non)matching list bytes that are all 0 at the
2145 end of the map. Decrease the map-length byte too. */
2146 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2147 b[-1]--;
2148 b += b[-1];
2149
2150 /* Build real range table from work area. */
2151 if (RANGE_TABLE_WORK_USED (range_table_work)
2152 || RANGE_TABLE_WORK_BITS (range_table_work))
2153 {
2154 int i;
2155 int used = RANGE_TABLE_WORK_USED (range_table_work);
2156
2157 /* Allocate space for COUNT + RANGE_TABLE. Needs two
2158 bytes for flags, two for COUNT, and three bytes for
2159 each character. */
2160 GET_BUFFER_SPACE (4 + used * 3);
2161
2162 /* Indicate the existence of range table. */
2163 laststart[1] |= 0x80;
2164
2165 /* Store the character class flag bits into the range table. */
2166 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2167 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2168
2169 STORE_NUMBER_AND_INCR (b, used / 2);
2170 for (i = 0; i < used; i++)
2171 STORE_CHARACTER_AND_INCR
2172 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2173 }
2174 }
2175 break;
2176
2177
2178 case '\\':
2179 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2180
2181 /* Do not translate the character after the \, so that we can
2182 distinguish, e.g., \B from \b, even if we normally would
2183 translate, e.g., B to b. */
2184 PATFETCH (c);
2185
2186 switch (c)
2187 {
2188 case '(':
2189 {
2190 bool shy = false;
2191 regnum_t regnum = 0;
2192 if (p+1 < pend)
2193 {
2194 /* Look for a special (?...) construct */
2195 if (*p == '?')
2196 {
2197 PATFETCH (c); /* Gobble up the '?'. */
2198 while (!shy)
2199 {
2200 PATFETCH (c);
2201 switch (c)
2202 {
2203 case ':': shy = true; break;
2204 case '0':
2205 /* An explicitly specified regnum must start
2206 with non-0. */
2207 if (regnum == 0)
2208 FREE_STACK_RETURN (REG_BADPAT);
2209 FALLTHROUGH;
2210 case '1': case '2': case '3': case '4':
2211 case '5': case '6': case '7': case '8': case '9':
2212 if (INT_MULTIPLY_WRAPV (regnum, 10, ®num)
2213 || INT_ADD_WRAPV (regnum, c - '0',
2214 ®num))
2215 FREE_STACK_RETURN (REG_ESIZE);
2216 break;
2217 default:
2218 /* Only (?:...) is supported right now. */
2219 FREE_STACK_RETURN (REG_BADPAT);
2220 }
2221 }
2222 }
2223 }
2224
2225 if (!shy)
2226 regnum = ++bufp->re_nsub;
2227 else if (regnum)
2228 { /* It's actually not shy, but explicitly numbered. */
2229 shy = false;
2230 if (regnum > bufp->re_nsub)
2231 bufp->re_nsub = regnum;
2232 else if (regnum > bufp->re_nsub
2233 /* Ideally, we'd want to check that the specified
2234 group can't have matched (i.e. all subgroups
2235 using the same regnum are in other branches of
2236 OR patterns), but we don't currently keep track
2237 of enough info to do that easily. */
2238 || group_in_compile_stack (compile_stack, regnum))
2239 FREE_STACK_RETURN (REG_BADPAT);
2240 }
2241 else
2242 /* It's really shy. */
2243 regnum = - bufp->re_nsub;
2244
2245 if (COMPILE_STACK_FULL)
2246 compile_stack.stack
2247 = xpalloc (compile_stack.stack, &compile_stack.size,
2248 1, -1, sizeof *compile_stack.stack);
2249
2250 /* These are the values to restore when we hit end of this
2251 group. They are all relative offsets, so that if the
2252 whole pattern moves because of realloc, they will still
2253 be valid. */
2254 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2255 COMPILE_STACK_TOP.fixup_alt_jump
2256 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2257 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2258 COMPILE_STACK_TOP.regnum = regnum;
2259
2260 /* Do not push a start_memory for groups beyond the last one
2261 we can represent in the compiled pattern. */
2262 if (regnum <= MAX_REGNUM && regnum > 0)
2263 BUF_PUSH_2 (start_memory, regnum);
2264
2265 compile_stack.avail++;
2266
2267 fixup_alt_jump = 0;
2268 laststart = 0;
2269 begalt = b;
2270 /* If we've reached MAX_REGNUM groups, then this open
2271 won't actually generate any code, so we'll have to
2272 clear pending_exact explicitly. */
2273 pending_exact = 0;
2274 break;
2275 }
2276
2277 case ')':
2278 if (COMPILE_STACK_EMPTY)
2279 FREE_STACK_RETURN (REG_ERPAREN);
2280
2281 FIXUP_ALT_JUMP ();
2282
2283 /* See similar code for backslashed left paren above. */
2284 if (COMPILE_STACK_EMPTY)
2285 FREE_STACK_RETURN (REG_ERPAREN);
2286
2287 /* Since we just checked for an empty stack above, this
2288 "can't happen". */
2289 eassert (compile_stack.avail != 0);
2290 {
2291 /* We don't just want to restore into 'regnum', because
2292 later groups should continue to be numbered higher,
2293 as in '(ab)c(de)' -- the second group is #2. */
2294 regnum_t regnum;
2295
2296 compile_stack.avail--;
2297 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2298 fixup_alt_jump
2299 = COMPILE_STACK_TOP.fixup_alt_jump
2300 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2301 : 0;
2302 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2303 regnum = COMPILE_STACK_TOP.regnum;
2304 /* If we've reached MAX_REGNUM groups, then this open
2305 won't actually generate any code, so we'll have to
2306 clear pending_exact explicitly. */
2307 pending_exact = 0;
2308
2309 /* We're at the end of the group, so now we know how many
2310 groups were inside this one. */
2311 if (regnum <= MAX_REGNUM && regnum > 0)
2312 BUF_PUSH_2 (stop_memory, regnum);
2313 }
2314 break;
2315
2316
2317 case '|': /* '\|'. */
2318 /* Insert before the previous alternative a jump which
2319 jumps to this alternative if the former fails. */
2320 GET_BUFFER_SPACE (3);
2321 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2322 pending_exact = 0;
2323 b += 3;
2324
2325 /* The alternative before this one has a jump after it
2326 which gets executed if it gets matched. Adjust that
2327 jump so it will jump to this alternative's analogous
2328 jump (put in below, which in turn will jump to the next
2329 (if any) alternative's such jump, etc.). The last such
2330 jump jumps to the correct final destination. A picture:
2331 _____ _____
2332 | | | |
2333 | v | v
2334 A | B | C
2335
2336 If we are at B, then fixup_alt_jump right now points to a
2337 three-byte space after A. We'll put in the jump, set
2338 fixup_alt_jump to right after B, and leave behind three
2339 bytes which we'll fill in when we get to after C. */
2340
2341 FIXUP_ALT_JUMP ();
2342
2343 /* Mark and leave space for a jump after this alternative,
2344 to be filled in later either by next alternative or
2345 when know we're at the end of a series of alternatives. */
2346 fixup_alt_jump = b;
2347 GET_BUFFER_SPACE (3);
2348 b += 3;
2349
2350 laststart = 0;
2351 begalt = b;
2352 break;
2353
2354
2355 case '{':
2356 {
2357 /* At least (most) this many matches must be made. */
2358 int lower_bound = 0, upper_bound = -1;
2359
2360 beg_interval = p;
2361
2362 GET_INTERVAL_COUNT (lower_bound);
2363
2364 if (c == ',')
2365 GET_INTERVAL_COUNT (upper_bound);
2366 else
2367 /* Interval such as '{1}' => match exactly once. */
2368 upper_bound = lower_bound;
2369
2370 if (lower_bound < 0
2371 || (0 <= upper_bound && upper_bound < lower_bound)
2372 || c != '\\')
2373 FREE_STACK_RETURN (REG_BADBR);
2374 if (p == pend)
2375 FREE_STACK_RETURN (REG_EESCAPE);
2376 if (*p++ != '}')
2377 FREE_STACK_RETURN (REG_BADBR);
2378
2379 /* We just parsed a valid interval. */
2380
2381 /* If it's invalid to have no preceding re. */
2382 if (!laststart)
2383 goto unfetch_interval;
2384
2385 if (upper_bound == 0)
2386 /* If the upper bound is zero, just drop the sub pattern
2387 altogether. */
2388 b = laststart;
2389 else if (lower_bound == 1 && upper_bound == 1)
2390 /* Just match it once: nothing to do here. */
2391 ;
2392
2393 /* Otherwise, we have a nontrivial interval. When
2394 we're all done, the pattern will look like:
2395 set_number_at <jump count> <upper bound>
2396 set_number_at <succeed_n count> <lower bound>
2397 succeed_n <after jump addr> <succeed_n count>
2398 <body of loop>
2399 jump_n <succeed_n addr> <jump count>
2400 (The upper bound and 'jump_n' are omitted if
2401 'upper_bound' is 1, though.) */
2402 else
2403 { /* If the upper bound is > 1, we need to insert
2404 more at the end of the loop. */
2405 int nbytes = upper_bound < 0 ? 3 : upper_bound > 1 ? 5 : 0;
2406 int startoffset = 0;
2407
2408 GET_BUFFER_SPACE (20); /* We might use less. */
2409
2410 if (lower_bound == 0)
2411 {
2412 /* A succeed_n that starts with 0 is really
2413 a simple on_failure_jump_loop. */
2414 INSERT_JUMP (on_failure_jump_loop, laststart,
2415 b + 3 + nbytes);
2416 b += 3;
2417 }
2418 else
2419 {
2420 /* Initialize lower bound of the 'succeed_n', even
2421 though it will be set during matching by its
2422 attendant 'set_number_at' (inserted next),
2423 because 're_compile_fastmap' needs to know.
2424 Jump to the 'jump_n' we might insert below. */
2425 INSERT_JUMP2 (succeed_n, laststart,
2426 b + 5 + nbytes,
2427 lower_bound);
2428 b += 5;
2429
2430 /* Code to initialize the lower bound. Insert
2431 before the 'succeed_n'. The '5' is the last two
2432 bytes of this 'set_number_at', plus 3 bytes of
2433 the following 'succeed_n'. */
2434 insert_op2 (set_number_at, laststart, 5,
2435 lower_bound, b);
2436 b += 5;
2437 startoffset += 5;
2438 }
2439
2440 if (upper_bound < 0)
2441 {
2442 /* A negative upper bound stands for infinity,
2443 in which case it degenerates to a plain jump. */
2444 STORE_JUMP (jump, b, laststart + startoffset);
2445 b += 3;
2446 }
2447 else if (upper_bound > 1)
2448 { /* More than one repetition is allowed, so
2449 append a backward jump to the 'succeed_n'
2450 that starts this interval.
2451
2452 When we've reached this during matching,
2453 we'll have matched the interval once, so
2454 jump back only 'upper_bound - 1' times. */
2455 STORE_JUMP2 (jump_n, b, laststart + startoffset,
2456 upper_bound - 1);
2457 b += 5;
2458
2459 /* The location we want to set is the second
2460 parameter of the 'jump_n'; that is 'b-2' as
2461 an absolute address. 'laststart' will be
2462 the 'set_number_at' we're about to insert;
2463 'laststart+3' the number to set, the source
2464 for the relative address. But we are
2465 inserting into the middle of the pattern --
2466 so everything is getting moved up by 5.
2467 Conclusion: (b - 2) - (laststart + 3) + 5,
2468 i.e., b - laststart.
2469
2470 Insert this at the beginning of the loop
2471 so that if we fail during matching, we'll
2472 reinitialize the bounds. */
2473 insert_op2 (set_number_at, laststart, b - laststart,
2474 upper_bound - 1, b);
2475 b += 5;
2476 }
2477 }
2478 pending_exact = 0;
2479 beg_interval = NULL;
2480 }
2481 break;
2482
2483 unfetch_interval:
2484 /* If an invalid interval, match the characters as literals. */
2485 eassert (beg_interval);
2486 p = beg_interval;
2487 beg_interval = NULL;
2488 eassert (p > pattern && p[-1] == '\\');
2489 c = '{';
2490 goto normal_char;
2491
2492 case '=':
2493 laststart = b;
2494 BUF_PUSH (at_dot);
2495 break;
2496
2497 case 's':
2498 laststart = b;
2499 PATFETCH (c);
2500 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2501 break;
2502
2503 case 'S':
2504 laststart = b;
2505 PATFETCH (c);
2506 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2507 break;
2508
2509 case 'c':
2510 laststart = b;
2511 PATFETCH (c);
2512 BUF_PUSH_2 (categoryspec, c);
2513 break;
2514
2515 case 'C':
2516 laststart = b;
2517 PATFETCH (c);
2518 BUF_PUSH_2 (notcategoryspec, c);
2519 break;
2520
2521 case 'w':
2522 laststart = b;
2523 BUF_PUSH_2 (syntaxspec, Sword);
2524 break;
2525
2526
2527 case 'W':
2528 laststart = b;
2529 BUF_PUSH_2 (notsyntaxspec, Sword);
2530 break;
2531
2532
2533 case '<':
2534 laststart = b;
2535 BUF_PUSH (wordbeg);
2536 break;
2537
2538 case '>':
2539 laststart = b;
2540 BUF_PUSH (wordend);
2541 break;
2542
2543 case '_':
2544 laststart = b;
2545 PATFETCH (c);
2546 if (c == '<')
2547 BUF_PUSH (symbeg);
2548 else if (c == '>')
2549 BUF_PUSH (symend);
2550 else
2551 FREE_STACK_RETURN (REG_BADPAT);
2552 break;
2553
2554 case 'b':
2555 BUF_PUSH (wordbound);
2556 break;
2557
2558 case 'B':
2559 BUF_PUSH (notwordbound);
2560 break;
2561
2562 case '`':
2563 BUF_PUSH (begbuf);
2564 break;
2565
2566 case '\'':
2567 BUF_PUSH (endbuf);
2568 break;
2569
2570 case '1': case '2': case '3': case '4': case '5':
2571 case '6': case '7': case '8': case '9':
2572 {
2573 regnum_t reg = c - '0';
2574
2575 if (reg > bufp->re_nsub || reg < 1
2576 /* Can't back reference to a subexp before its end. */
2577 || group_in_compile_stack (compile_stack, reg))
2578 FREE_STACK_RETURN (REG_ESUBREG);
2579
2580 laststart = b;
2581 BUF_PUSH_2 (duplicate, reg);
2582 }
2583 break;
2584
2585 default:
2586 /* You might think it would be useful for \ to mean
2587 not to translate; but if we don't translate it
2588 it will never match anything. */
2589 goto normal_char;
2590 }
2591 break;
2592
2593
2594 default:
2595 /* Expects the character in C. */
2596 normal_char:
2597 /* If no exactn currently being built. */
2598 if (!pending_exact
2599
2600 /* If last exactn not at current position. */
2601 || pending_exact + *pending_exact + 1 != b
2602
2603 /* Only one byte follows the exactn for the count. */
2604 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
2605
2606 /* If followed by a repetition operator. */
2607 || (p != pend
2608 && (*p == '*' || *p == '+' || *p == '?' || *p == '^'))
2609 || (p + 1 < pend && p[0] == '\\' && p[1] == '{'))
2610 {
2611 /* Start building a new exactn. */
2612
2613 laststart = b;
2614
2615 BUF_PUSH_2 (exactn, 0);
2616 pending_exact = b - 1;
2617 }
2618
2619 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
2620 {
2621 int len;
2622
2623 if (multibyte)
2624 {
2625 c = TRANSLATE (c);
2626 len = CHAR_STRING (c, b);
2627 b += len;
2628 }
2629 else
2630 {
2631 c1 = RE_CHAR_TO_MULTIBYTE (c);
2632 if (! CHAR_BYTE8_P (c1))
2633 {
2634 int c2 = TRANSLATE (c1);
2635
2636 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
2637 c = c1;
2638 }
2639 *b++ = c;
2640 len = 1;
2641 }
2642 (*pending_exact) += len;
2643 }
2644
2645 break;
2646 } /* switch (c) */
2647 } /* while p != pend */
2648
2649
2650 /* Through the pattern now. */
2651
2652 FIXUP_ALT_JUMP ();
2653
2654 if (!COMPILE_STACK_EMPTY)
2655 FREE_STACK_RETURN (REG_EPAREN);
2656
2657 /* If we don't want backtracking, force success
2658 the first time we reach the end of the compiled pattern. */
2659 if (!posix_backtracking)
2660 BUF_PUSH (succeed);
2661
2662 /* Success; set the length of the buffer. */
2663 bufp->used = b - bufp->buffer;
2664
2665 #ifdef REGEX_EMACS_DEBUG
2666 if (regex_emacs_debug > 0)
2667 {
2668 re_compile_fastmap (bufp);
2669 DEBUG_PRINT ("\nCompiled pattern:\n");
2670 print_compiled_pattern (bufp);
2671 }
2672 regex_emacs_debug--;
2673 #endif
2674
2675 FREE_STACK_RETURN (REG_NOERROR);
2676
2677 } /* regex_compile */
2678
2679 /* Subroutines for 'regex_compile'. */
2680
2681 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2682
2683 static void
2684 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
2685 {
2686 *loc = (unsigned char) op;
2687 STORE_NUMBER (loc + 1, arg);
2688 }
2689
2690
2691 /* Like 'store_op1', but for two two-byte parameters ARG1 and ARG2. */
2692
2693 static void
2694 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
2695 {
2696 *loc = (unsigned char) op;
2697 STORE_NUMBER (loc + 1, arg1);
2698 STORE_NUMBER (loc + 3, arg2);
2699 }
2700
2701
2702 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2703 for OP followed by two-byte integer parameter ARG. */
2704
2705 static void
2706 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
2707 {
2708 register unsigned char *pfrom = end;
2709 register unsigned char *pto = end + 3;
2710
2711 while (pfrom != loc)
2712 *--pto = *--pfrom;
2713
2714 store_op1 (op, loc, arg);
2715 }
2716
2717
2718 /* Like 'insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2719
2720 static void
2721 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
2722 unsigned char *end)
2723 {
2724 register unsigned char *pfrom = end;
2725 register unsigned char *pto = end + 5;
2726
2727 while (pfrom != loc)
2728 *--pto = *--pfrom;
2729
2730 store_op2 (op, loc, arg1, arg2);
2731 }
2732
2733
2734 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2735 after an alternative or a begin-subexpression. Assume there is at
2736 least one character before the ^. */
2737
2738 static bool
2739 at_begline_loc_p (re_char *pattern, re_char *p)
2740 {
2741 re_char *prev = p - 2;
2742
2743 switch (*prev)
2744 {
2745 case '(': /* After a subexpression. */
2746 case '|': /* After an alternative. */
2747 break;
2748
2749 case ':': /* After a shy subexpression. */
2750 /* Skip over optional regnum. */
2751 while (prev > pattern && '0' <= prev[-1] && prev[-1] <= '9')
2752 --prev;
2753
2754 if (! (prev > pattern + 1 && prev[-1] == '?' && prev[-2] == '('))
2755 return false;
2756 prev -= 2;
2757 break;
2758
2759 default:
2760 return false;
2761 }
2762
2763 /* Count the number of preceding backslashes. */
2764 p = prev;
2765 while (prev > pattern && prev[-1] == '\\')
2766 --prev;
2767 return (p - prev) & 1;
2768 }
2769
2770
2771 /* The dual of at_begline_loc_p. This one is for $. Assume there is
2772 at least one character after the $, i.e., 'P < PEND'. */
2773
2774 static bool
2775 at_endline_loc_p (re_char *p, re_char *pend)
2776 {
2777 /* Before a subexpression or an alternative? */
2778 return *p == '\\' && p + 1 < pend && (p[1] == ')' || p[1] == '|');
2779 }
2780
2781
2782 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2783 false if it's not. */
2784
2785 static bool
2786 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
2787 {
2788 ptrdiff_t this_element;
2789
2790 for (this_element = compile_stack.avail - 1;
2791 this_element >= 0;
2792 this_element--)
2793 if (compile_stack.stack[this_element].regnum == regnum)
2794 return true;
2795
2796 return false;
2797 }
2798
2799 /* analyze_first.
2800 If fastmap is non-NULL, go through the pattern and fill fastmap
2801 with all the possible leading chars. If fastmap is NULL, don't
2802 bother filling it up (obviously) and only return whether the
2803 pattern could potentially match the empty string.
2804
2805 Return 1 if p..pend might match the empty string.
2806 Return 0 if p..pend matches at least one char.
2807 Return -1 if fastmap was not updated accurately. */
2808
2809 static int
2810 analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
2811 {
2812 int j, k;
2813 int nbits;
2814 bool not;
2815
2816 /* If all elements for base leading-codes in fastmap is set, this
2817 flag is set true. */
2818 bool match_any_multibyte_characters = false;
2819
2820 eassert (p);
2821
2822 /* The loop below works as follows:
2823 - It has a working-list kept in the PATTERN_STACK and which basically
2824 starts by only containing a pointer to the first operation.
2825 - If the opcode we're looking at is a match against some set of
2826 chars, then we add those chars to the fastmap and go on to the
2827 next work element from the worklist (done via 'break').
2828 - If the opcode is a control operator on the other hand, we either
2829 ignore it (if it's meaningless at this point, such as 'start_memory')
2830 or execute it (if it's a jump). If the jump has several destinations
2831 (i.e. 'on_failure_jump'), then we push the other destination onto the
2832 worklist.
2833 We guarantee termination by ignoring backward jumps (more or less),
2834 so that P is monotonically increasing. More to the point, we
2835 never set P (or push) anything '<= p1'. */
2836
2837 while (p < pend)
2838 {
2839 /* P1 is used as a marker of how far back a 'on_failure_jump'
2840 can go without being ignored. It is normally equal to P
2841 (which prevents any backward 'on_failure_jump') except right
2842 after a plain 'jump', to allow patterns such as:
2843 0: jump 10
2844 3..9: <body>
2845 10: on_failure_jump 3
2846 as used for the *? operator. */
2847 re_char *p1 = p;
2848
2849 switch (*p++)
2850 {
2851 case succeed:
2852 return 1;
2853
2854 case duplicate:
2855 /* If the first character has to match a backreference, that means
2856 that the group was empty (since it already matched). Since this
2857 is the only case that interests us here, we can assume that the
2858 backreference must match the empty string. */
2859 p++;
2860 continue;
2861
2862
2863 /* Following are the cases which match a character. These end
2864 with 'break'. */
2865
2866 case exactn:
2867 if (fastmap)
2868 {
2869 /* If multibyte is nonzero, the first byte of each
2870 character is an ASCII or a leading code. Otherwise,
2871 each byte is a character. Thus, this works in both
2872 cases. */
2873 fastmap[p[1]] = 1;
2874 if (multibyte)
2875 {
2876 /* Cover the case of matching a raw char in a
2877 multibyte regexp against unibyte. */
2878 if (CHAR_BYTE8_HEAD_P (p[1]))
2879 fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
2880 }
2881 else
2882 {
2883 /* For the case of matching this unibyte regex
2884 against multibyte, we must set a leading code of
2885 the corresponding multibyte character. */
2886 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
2887
2888 fastmap[CHAR_LEADING_CODE (c)] = 1;
2889 }
2890 }
2891 break;
2892
2893
2894 case anychar:
2895 /* We could put all the chars except for \n (and maybe \0)
2896 but we don't bother since it is generally not worth it. */
2897 if (!fastmap) break;
2898 return -1;
2899
2900
2901 case charset_not:
2902 if (!fastmap) break;
2903 {
2904 /* Chars beyond end of bitmap are possible matches. */
2905 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
2906 j < (1 << BYTEWIDTH); j++)
2907 fastmap[j] = 1;
2908 }
2909 FALLTHROUGH;
2910 case charset:
2911 if (!fastmap) break;
2912 not = (re_opcode_t) *(p - 1) == charset_not;
2913 nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
2914 p++;
2915 for (j = 0; j < nbits; j++)
2916 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
2917 fastmap[j] = 1;
2918
2919 /* To match raw bytes (in the 80..ff range) against multibyte
2920 strings, add their leading bytes to the fastmap. */
2921 for (j = 0x80; j < nbits; j++)
2922 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
2923 fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
2924
2925 if (/* Any leading code can possibly start a character
2926 which doesn't match the specified set of characters. */
2927 not
2928 ||
2929 /* If we can match a character class, we can match any
2930 multibyte characters. */
2931 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
2932 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
2933
2934 {
2935 if (match_any_multibyte_characters == false)
2936 {
2937 for (j = MIN_MULTIBYTE_LEADING_CODE;
2938 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
2939 fastmap[j] = 1;
2940 match_any_multibyte_characters = true;
2941 }
2942 }
2943
2944 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
2945 && match_any_multibyte_characters == false)
2946 {
2947 /* Set fastmap[I] to 1 where I is a leading code of each
2948 multibyte character in the range table. */
2949 int c, count;
2950 unsigned char lc1, lc2;
2951
2952 /* Make P points the range table. '+ 2' is to skip flag
2953 bits for a character class. */
2954 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
2955
2956 /* Extract the number of ranges in range table into COUNT. */
2957 EXTRACT_NUMBER_AND_INCR (count, p);
2958 for (; count > 0; count--, p += 3)
2959 {
2960 /* Extract the start and end of each range. */
2961 EXTRACT_CHARACTER (c, p);
2962 lc1 = CHAR_LEADING_CODE (c);
2963 p += 3;
2964 EXTRACT_CHARACTER (c, p);
2965 lc2 = CHAR_LEADING_CODE (c);
2966 for (j = lc1; j <= lc2; j++)
2967 fastmap[j] = 1;
2968 }
2969 }
2970 break;
2971
2972 case syntaxspec:
2973 case notsyntaxspec:
2974 if (!fastmap) break;
2975 /* This match depends on text properties. These end with
2976 aborting optimizations. */
2977 return -1;
2978
2979 case categoryspec:
2980 case notcategoryspec:
2981 if (!fastmap) break;
2982 not = (re_opcode_t)p[-1] == notcategoryspec;
2983 k = *p++;
2984 for (j = (1 << BYTEWIDTH); j >= 0; j--)
2985 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
2986 fastmap[j] = 1;
2987
2988 /* Any leading code can possibly start a character which
2989 has or doesn't has the specified category. */
2990 if (match_any_multibyte_characters == false)
2991 {
2992 for (j = MIN_MULTIBYTE_LEADING_CODE;
2993 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
2994 fastmap[j] = 1;
2995 match_any_multibyte_characters = true;
2996 }
2997 break;
2998
2999 /* All cases after this match the empty string. These end with
3000 'continue'. */
3001
3002 case at_dot:
3003 case no_op:
3004 case begline:
3005 case endline:
3006 case begbuf:
3007 case endbuf:
3008 case wordbound:
3009 case notwordbound:
3010 case wordbeg:
3011 case wordend:
3012 case symbeg:
3013 case symend:
3014 continue;
3015
3016
3017 case jump:
3018 EXTRACT_NUMBER_AND_INCR (j, p);
3019 if (j < 0)
3020 /* Backward jumps can only go back to code that we've already
3021 visited. 're_compile' should make sure this is true. */
3022 break;
3023 p += j;
3024 switch (*p)
3025 {
3026 case on_failure_jump:
3027 case on_failure_keep_string_jump:
3028 case on_failure_jump_loop:
3029 case on_failure_jump_nastyloop:
3030 case on_failure_jump_smart:
3031 p++;
3032 break;
3033 default:
3034 continue;
3035 };
3036 /* Keep P1 to allow the 'on_failure_jump' we are jumping to
3037 to jump back to "just after here". */
3038 FALLTHROUGH;
3039 case on_failure_jump:
3040 case on_failure_keep_string_jump:
3041 case on_failure_jump_nastyloop:
3042 case on_failure_jump_loop:
3043 case on_failure_jump_smart:
3044 EXTRACT_NUMBER_AND_INCR (j, p);
3045 if (p + j <= p1)
3046 ; /* Backward jump to be ignored. */
3047 else
3048 { /* We have to look down both arms.
3049 We first go down the "straight" path so as to minimize
3050 stack usage when going through alternatives. */
3051 int r = analyze_first (p, pend, fastmap, multibyte);
3052 if (r) return r;
3053 p += j;
3054 }
3055 continue;
3056
3057
3058 case jump_n:
3059 /* This code simply does not properly handle forward jump_n. */
3060 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); eassert (j < 0));
3061 p += 4;
3062 /* jump_n can either jump or fall through. The (backward) jump
3063 case has already been handled, so we only need to look at the
3064 fallthrough case. */
3065 continue;
3066
3067 case succeed_n:
3068 /* If N == 0, it should be an on_failure_jump_loop instead. */
3069 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j > 0));
3070 p += 4;
3071 /* We only care about one iteration of the loop, so we don't
3072 need to consider the case where this behaves like an
3073 on_failure_jump. */
3074 continue;
3075
3076
3077 case set_number_at:
3078 p += 4;
3079 continue;
3080
3081
3082 case start_memory:
3083 case stop_memory:
3084 p += 1;
3085 continue;
3086
3087
3088 default:
3089 abort (); /* We have listed all the cases. */
3090 } /* switch *p++ */
3091
3092 /* Getting here means we have found the possible starting
3093 characters for one path of the pattern -- and that the empty
3094 string does not match. We need not follow this path further. */
3095 return 0;
3096 } /* while p */
3097
3098 /* We reached the end without matching anything. */
3099 return 1;
3100
3101 } /* analyze_first */
3102
3103 /* Compute a fastmap for the compiled pattern in BUFP.
3104 A fastmap records which of the (1 << BYTEWIDTH) possible
3105 characters can start a string that matches the pattern. This fastmap
3106 is used by re_search to skip quickly over impossible starting points.
3107
3108 Character codes above (1 << BYTEWIDTH) are not represented in the
3109 fastmap, but the leading codes are represented. Thus, the fastmap
3110 indicates which character sets could start a match.
3111
3112 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3113 area as BUFP->fastmap.
3114
3115 Set the 'fastmap', 'fastmap_accurate', and 'can_be_null' fields in
3116 the pattern buffer. */
3117
3118 static void
3119 re_compile_fastmap (struct re_pattern_buffer *bufp)
3120 {
3121 char *fastmap = bufp->fastmap;
3122 int analysis;
3123
3124 eassert (fastmap && bufp->buffer);
3125
3126 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3127
3128 /* FIXME: Is the following assignment correct even when ANALYSIS < 0? */
3129 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3130
3131 analysis = analyze_first (bufp->buffer, bufp->buffer + bufp->used,
3132 fastmap, RE_MULTIBYTE_P (bufp));
3133 bufp->can_be_null = (analysis != 0);
3134 } /* re_compile_fastmap */
3135
3136 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3137 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3138 this memory for recording register information. STARTS and ENDS
3139 must be allocated using the malloc library routine, and must each
3140 be at least NUM_REGS * sizeof (ptrdiff_t) bytes long.
3141
3142 If NUM_REGS == 0, then subsequent matches should allocate their own
3143 register data.
3144
3145 Unless this function is called, the first search or match using
3146 PATTERN_BUFFER will allocate its own register data, without
3147 freeing the old data. */
3148
3149 void
3150 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
3151 ptrdiff_t num_regs, ptrdiff_t *starts, ptrdiff_t *ends)
3152 {
3153 if (num_regs)
3154 {
3155 bufp->regs_allocated = REGS_REALLOCATE;
3156 regs->num_regs = num_regs;
3157 regs->start = starts;
3158 regs->end = ends;
3159 }
3160 else
3161 {
3162 bufp->regs_allocated = REGS_UNALLOCATED;
3163 regs->num_regs = 0;
3164 regs->start = regs->end = 0;
3165 }
3166 }
3167
3168 /* Searching routines. */
3169
3170 /* Like re_search_2, below, but only one string is specified, and
3171 doesn't let you say where to stop matching. */
3172
3173 ptrdiff_t
3174 re_search (struct re_pattern_buffer *bufp, const char *string, ptrdiff_t size,
3175 ptrdiff_t startpos, ptrdiff_t range, struct re_registers *regs)
3176 {
3177 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3178 regs, size);
3179 }
3180
3181 /* Address of POS in the concatenation of virtual string. */
3182 #define POS_ADDR_VSTRING(POS) \
3183 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
3184
3185 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3186 virtual concatenation of STRING1 and STRING2, starting first at index
3187 STARTPOS, then at STARTPOS + 1, and so on.
3188
3189 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3190
3191 RANGE is how far to scan while trying to match. RANGE = 0 means try
3192 only at STARTPOS; in general, the last start tried is STARTPOS +
3193 RANGE.
3194
3195 In REGS, return the indices of the virtual concatenation of STRING1
3196 and STRING2 that matched the entire BUFP->buffer and its contained
3197 subexpressions.
3198
3199 Do not consider matching one past the index STOP in the virtual
3200 concatenation of STRING1 and STRING2.
3201
3202 Return either the position in the strings at which the match was
3203 found, -1 if no match, or -2 if error (such as failure
3204 stack overflow). */
3205
3206 ptrdiff_t
3207 re_search_2 (struct re_pattern_buffer *bufp, const char *str1, ptrdiff_t size1,
3208 const char *str2, ptrdiff_t size2,
3209 ptrdiff_t startpos, ptrdiff_t range,
3210 struct re_registers *regs, ptrdiff_t stop)
3211 {
3212 ptrdiff_t val;
3213 re_char *string1 = (re_char *) str1;
3214 re_char *string2 = (re_char *) str2;
3215 char *fastmap = bufp->fastmap;
3216 Lisp_Object translate = bufp->translate;
3217 ptrdiff_t total_size = size1 + size2;
3218 ptrdiff_t endpos = startpos + range;
3219 bool anchored_start;
3220 /* Nonzero if we are searching multibyte string. */
3221 bool multibyte = RE_TARGET_MULTIBYTE_P (bufp);
3222
3223 /* Check for out-of-range STARTPOS. */
3224 if (startpos < 0 || startpos > total_size)
3225 return -1;
3226
3227 /* Fix up RANGE if it might eventually take us outside
3228 the virtual concatenation of STRING1 and STRING2.
3229 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3230 if (endpos < 0)
3231 range = 0 - startpos;
3232 else if (endpos > total_size)
3233 range = total_size - startpos;
3234
3235 /* If the search isn't to be a backwards one, don't waste time in a
3236 search for a pattern anchored at beginning of buffer. */
3237 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3238 {
3239 if (startpos > 0)
3240 return -1;
3241 else
3242 range = 0;
3243 }
3244
3245 /* In a forward search for something that starts with \=.
3246 don't keep searching past point. */
3247 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3248 {
3249 range = PT_BYTE - BEGV_BYTE - startpos;
3250 if (range < 0)
3251 return -1;
3252 }
3253
3254 /* Update the fastmap now if not correct already. */
3255 if (fastmap && !bufp->fastmap_accurate)
3256 re_compile_fastmap (bufp);
3257
3258 /* See whether the pattern is anchored. */
3259 anchored_start = (bufp->buffer[0] == begline);
3260
3261 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
3262 {
3263 ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
3264
3265 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3266 }
3267
3268 /* Loop through the string, looking for a place to start matching. */
3269 for (;;)
3270 {
3271 /* If the pattern is anchored,
3272 skip quickly past places we cannot match.
3273 Don't bother to treat startpos == 0 specially
3274 because that case doesn't repeat. */
3275 if (anchored_start && startpos > 0)
3276 {
3277 if (! ((startpos <= size1 ? string1[startpos - 1]
3278 : string2[startpos - size1 - 1])
3279 == '\n'))
3280 goto advance;
3281 }
3282
3283 /* If a fastmap is supplied, skip quickly over characters that
3284 cannot be the start of a match. If the pattern can match the
3285 null string, however, we don't need to skip characters; we want
3286 the first null string. */
3287 if (fastmap && startpos < total_size && !bufp->can_be_null)
3288 {
3289 re_char *d;
3290 int buf_ch;
3291
3292 d = POS_ADDR_VSTRING (startpos);
3293
3294 if (range > 0) /* Searching forwards. */
3295 {
3296 ptrdiff_t irange = range, lim = 0;
3297
3298 if (startpos < size1 && startpos + range >= size1)
3299 lim = range - (size1 - startpos);
3300
3301 /* Written out as an if-else to avoid testing 'translate'
3302 inside the loop. */
3303 if (!NILP (translate))
3304 {
3305 if (multibyte)
3306 while (range > lim)
3307 {
3308 int buf_charlen;
3309
3310 buf_ch = string_char_and_length (d, &buf_charlen);
3311 buf_ch = RE_TRANSLATE (translate, buf_ch);
3312 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
3313 break;
3314
3315 range -= buf_charlen;
3316 d += buf_charlen;
3317 }
3318 else
3319 while (range > lim)
3320 {
3321 buf_ch = *d;
3322 int ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
3323 int translated = RE_TRANSLATE (translate, ch);
3324 if (translated != ch
3325 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
3326 buf_ch = ch;
3327 if (fastmap[buf_ch])
3328 break;
3329 d++;
3330 range--;
3331 }
3332 }
3333 else
3334 {
3335 if (multibyte)
3336 while (range > lim)
3337 {
3338 int buf_charlen;
3339
3340 buf_ch = string_char_and_length (d, &buf_charlen);
3341 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
3342 break;
3343 range -= buf_charlen;
3344 d += buf_charlen;
3345 }
3346 else
3347 while (range > lim && !fastmap[*d])
3348 {
3349 d++;
3350 range--;
3351 }
3352 }
3353 startpos += irange - range;
3354 }
3355 else /* Searching backwards. */
3356 {
3357 if (multibyte)
3358 {
3359 buf_ch = STRING_CHAR (d);
3360 buf_ch = TRANSLATE (buf_ch);
3361 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
3362 goto advance;
3363 }
3364 else
3365 {
3366 buf_ch = *d;
3367 int ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
3368 int translated = TRANSLATE (ch);
3369 if (translated != ch
3370 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
3371 buf_ch = ch;
3372 if (! fastmap[TRANSLATE (buf_ch)])
3373 goto advance;
3374 }
3375 }
3376 }
3377
3378 /* If can't match the null string, and that's all we have left, fail. */
3379 if (range >= 0 && startpos == total_size && fastmap
3380 && !bufp->can_be_null)
3381 return -1;
3382
3383 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3384 startpos, regs, stop);
3385
3386 if (val >= 0)
3387 return startpos;
3388
3389 if (val == -2)
3390 return -2;
3391
3392 advance:
3393 if (!range)
3394 break;
3395 else if (range > 0)
3396 {
3397 /* Update STARTPOS to the next character boundary. */
3398 if (multibyte)
3399 {
3400 re_char *p = POS_ADDR_VSTRING (startpos);
3401 int len = BYTES_BY_CHAR_HEAD (*p);
3402
3403 range -= len;
3404 if (range < 0)
3405 break;
3406 startpos += len;
3407 }
3408 else
3409 {
3410 range--;
3411 startpos++;
3412 }
3413 }
3414 else
3415 {
3416 range++;
3417 startpos--;
3418
3419 /* Update STARTPOS to the previous character boundary. */
3420 if (multibyte)
3421 {
3422 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
3423 int len = raw_prev_char_len (p);
3424
3425 range += len - 1;
3426 if (range > 0)
3427 break;
3428 startpos -= len - 1;
3429 }
3430 }
3431 }
3432 return -1;
3433 } /* re_search_2 */
3434
3435 /* Declarations and macros for re_match_2. */
3436
3437 static bool bcmp_translate (re_char *, re_char *, ptrdiff_t,
3438 Lisp_Object, bool);
3439
3440 /* This converts PTR, a pointer into one of the search strings 'string1'
3441 and 'string2' into an offset from the beginning of that string. */
3442 #define POINTER_TO_OFFSET(ptr) \
3443 (FIRST_STRING_P (ptr) \
3444 ? (ptr) - string1 \
3445 : (ptr) - string2 + (ptrdiff_t) size1)
3446
3447 /* Call before fetching a character with *d. This switches over to
3448 string2 if necessary.
3449 `reset' is executed before backtracking if there are no more characters.
3450 Check re_match_2_internal for a discussion of why end_match_2 might
3451 not be within string2 (but be equal to end_match_1 instead). */
3452 #define PREFETCH(reset) \
3453 while (d == dend) \
3454 { \
3455 /* End of string2 => fail. */ \
3456 if (dend == end_match_2) \
3457 { \
3458 reset; \
3459 goto fail; \
3460 } \
3461 /* End of string1 => advance to string2. */ \
3462 d = string2; \
3463 dend = end_match_2; \
3464 }
3465
3466 /* Call before fetching a char with *d if you already checked other limits.
3467 This is meant for use in lookahead operations like wordend, etc..
3468 where we might need to look at parts of the string that might be
3469 outside of the LIMITs (i.e past 'stop'). */
3470 #define PREFETCH_NOLIMIT() \
3471 if (d == end1) \
3472 { \
3473 d = string2; \
3474 dend = end_match_2; \
3475 }
3476
3477 /* Test if at very beginning or at very end of the virtual concatenation
3478 of STRING1 and STRING2. If only one string, it's STRING2. */
3479 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3480 #define AT_STRINGS_END(d) ((d) == end2)
3481
3482 /* Disabled due to a compiler bug -- see comment at case wordbound */
3483
3484 /* The comment at case wordbound is following one, but we don't use
3485 AT_WORD_BOUNDARY anymore to support multibyte form.
3486
3487 The DEC Alpha C compiler 3.x generates incorrect code for the
3488 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
3489 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
3490 macro and introducing temporary variables works around the bug. */
3491
3492 #if 0
3493 /* Test if D points to a character which is word-constituent. We have
3494 two special cases to check for: if past the end of string1, look at
3495 the first character in string2; and if before the beginning of
3496 string2, look at the last character in string1. */
3497 #define WORDCHAR_P(d) \
3498 (SYNTAX ((d) == end1 ? *string2 \
3499 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3500 == Sword)
3501
3502 /* Test if the character before D and the one at D differ with respect
3503 to being word-constituent. */
3504 #define AT_WORD_BOUNDARY(d) \
3505 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3506 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3507 #endif
3508
3509
3510 /* Optimization routines. */
3511
3512 /* If the operation is a match against one or more chars,
3513 return a pointer to the next operation, else return NULL. */
3514 static re_char *
3515 skip_one_char (re_char *p)
3516 {
3517 switch (*p++)
3518 {
3519 case anychar:
3520 break;
3521
3522 case exactn:
3523 p += *p + 1;
3524 break;
3525
3526 case charset_not:
3527 case charset:
3528 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
3529 {
3530 int mcnt;
3531 p = CHARSET_RANGE_TABLE (p - 1);
3532 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3533 p = CHARSET_RANGE_TABLE_END (p, mcnt);
3534 }
3535 else
3536 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
3537 break;
3538
3539 case syntaxspec:
3540 case notsyntaxspec:
3541 case categoryspec:
3542 case notcategoryspec:
3543 p++;
3544 break;
3545
3546 default:
3547 p = NULL;
3548 }
3549 return p;
3550 }
3551
3552
3553 /* Jump over non-matching operations. */
3554 static re_char *
3555 skip_noops (re_char *p, re_char *pend)
3556 {
3557 int mcnt;
3558 while (p < pend)
3559 {
3560 switch (*p)
3561 {
3562 case start_memory:
3563 case stop_memory:
3564 p += 2; break;
3565 case no_op:
3566 p += 1; break;
3567 case jump:
3568 p += 1;
3569 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3570 p += mcnt;
3571 break;
3572 default:
3573 return p;
3574 }
3575 }
3576 eassert (p == pend);
3577 return p;
3578 }
3579
3580 /* Test if C matches charset op. *PP points to the charset or charset_not
3581 opcode. When the function finishes, *PP will be advanced past that opcode.
3582 C is character to test (possibly after translations) and CORIG is original
3583 character (i.e. without any translations). UNIBYTE denotes whether c is
3584 unibyte or multibyte character.
3585 CANON_TABLE is the canonicalisation table for case folding or Qnil. */
3586 static bool
3587 execute_charset (re_char **pp, int c, int corig, bool unibyte,
3588 Lisp_Object canon_table)
3589 {
3590 eassume (0 <= c && 0 <= corig);
3591 re_char *p = *pp, *rtp = NULL;
3592 bool not = (re_opcode_t) *p == charset_not;
3593
3594 if (CHARSET_RANGE_TABLE_EXISTS_P (p))
3595 {
3596 int count;
3597 rtp = CHARSET_RANGE_TABLE (p);
3598 EXTRACT_NUMBER_AND_INCR (count, rtp);
3599 *pp = CHARSET_RANGE_TABLE_END ((rtp), (count));
3600 }
3601 else
3602 *pp += 2 + CHARSET_BITMAP_SIZE (p);
3603
3604 if (unibyte && c < (1 << BYTEWIDTH))
3605 { /* Lookup bitmap. */
3606 /* Cast to 'unsigned' instead of 'unsigned char' in
3607 case the bit list is a full 32 bytes long. */
3608 if (c < (unsigned) (CHARSET_BITMAP_SIZE (p) * BYTEWIDTH)
3609 && p[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3610 return !not;
3611 }
3612 else if (rtp)
3613 {
3614 int class_bits = CHARSET_RANGE_TABLE_BITS (p);
3615 int range_start, range_end;
3616
3617 /* Sort tests by the most commonly used classes with some adjustment to which
3618 tests are easiest to perform. Take a look at comment in re_wctype_parse
3619 for table with frequencies of character class names. */
3620
3621 if ((class_bits & BIT_MULTIBYTE) ||
3622 (class_bits & BIT_ALNUM && ISALNUM (c)) ||
3623 (class_bits & BIT_ALPHA && ISALPHA (c)) ||
3624 (class_bits & BIT_SPACE && ISSPACE (c)) ||
3625 (class_bits & BIT_BLANK && ISBLANK (c)) ||
3626 (class_bits & BIT_WORD && ISWORD (c)) ||
3627 ((class_bits & BIT_UPPER) &&
3628 (ISUPPER (corig) || (!NILP (canon_table) && ISLOWER (corig)))) ||
3629 ((class_bits & BIT_LOWER) &&
3630 (ISLOWER (corig) || (!NILP (canon_table) && ISUPPER (corig)))) ||
3631 (class_bits & BIT_PUNCT && ISPUNCT (c)) ||
3632 (class_bits & BIT_GRAPH && ISGRAPH (c)) ||
3633 (class_bits & BIT_PRINT && ISPRINT (c)))
3634 return !not;
3635
3636 for (p = *pp; rtp < p; rtp += 2 * 3)
3637 {
3638 EXTRACT_CHARACTER (range_start, rtp);
3639 EXTRACT_CHARACTER (range_end, rtp + 3);
3640 if (range_start <= c && c <= range_end)
3641 return !not;
3642 }
3643 }
3644
3645 return not;
3646 }
3647
3648 /* True if "p1 matches something" implies "p2 fails". */
3649 static bool
3650 mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
3651 re_char *p2)
3652 {
3653 re_opcode_t op2;
3654 bool multibyte = RE_MULTIBYTE_P (bufp);
3655 unsigned char *pend = bufp->buffer + bufp->used;
3656 re_char *p2_orig = p2;
3657
3658 eassert (p1 >= bufp->buffer && p1 < pend
3659 && p2 >= bufp->buffer && p2 <= pend);
3660
3661 /* Skip over open/close-group commands.
3662 If what follows this loop is a ...+ construct,
3663 look at what begins its body, since we will have to
3664 match at least one of that. */
3665 p2 = skip_noops (p2, pend);
3666 /* The same skip can be done for p1, except that this function
3667 is only used in the case where p1 is a simple match operator. */
3668 /* p1 = skip_noops (p1, pend); */
3669
3670 eassert (p1 >= bufp->buffer && p1 < pend
3671 && p2 >= bufp->buffer && p2 <= pend);
3672
3673 op2 = p2 == pend ? succeed : *p2;
3674
3675 switch (op2)
3676 {
3677 case succeed:
3678 case endbuf:
3679 /* If we're at the end of the pattern, we can change. */
3680 if (skip_one_char (p1))
3681 {
3682 DEBUG_PRINT (" End of pattern: fast loop.\n");
3683 return true;
3684 }
3685 break;
3686
3687 case endline:
3688 case exactn:
3689 {
3690 int c
3691 = (re_opcode_t) *p2 == endline ? '\n'
3692 : RE_STRING_CHAR (p2 + 2, multibyte);
3693
3694 if ((re_opcode_t) *p1 == exactn)
3695 {
3696 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
3697 {
3698 DEBUG_PRINT (" '%c' != '%c' => fast loop.\n", c, p1[2]);
3699 return true;
3700 }
3701 }
3702
3703 else if ((re_opcode_t) *p1 == charset
3704 || (re_opcode_t) *p1 == charset_not)
3705 {
3706 if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
3707 Qnil))
3708 {
3709 DEBUG_PRINT (" No match => fast loop.\n");
3710 return true;
3711 }
3712 }
3713 else if ((re_opcode_t) *p1 == anychar
3714 && c == '\n')
3715 {
3716 DEBUG_PRINT (" . != \\n => fast loop.\n");
3717 return true;
3718 }
3719 }
3720 break;
3721
3722 case charset:
3723 {
3724 if ((re_opcode_t) *p1 == exactn)
3725 /* Reuse the code above. */
3726 return mutually_exclusive_p (bufp, p2, p1);
3727
3728 /* It is hard to list up all the character in charset
3729 P2 if it includes multibyte character. Give up in
3730 such case. */
3731 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
3732 {
3733 /* Now, we are sure that P2 has no range table.
3734 So, for the size of bitmap in P2, 'p2[1]' is
3735 enough. But P1 may have range table, so the
3736 size of bitmap table of P1 is extracted by
3737 using macro 'CHARSET_BITMAP_SIZE'.
3738
3739 In a multibyte case, we know that all the character
3740 listed in P2 is ASCII. In a unibyte case, P1 has only a
3741 bitmap table. So, in both cases, it is enough to test
3742 only the bitmap table of P1. */
3743
3744 if ((re_opcode_t) *p1 == charset)
3745 {
3746 int idx;
3747 /* We win if the charset inside the loop
3748 has no overlap with the one after the loop. */
3749 for (idx = 0;
3750 (idx < (int) p2[1]
3751 && idx < CHARSET_BITMAP_SIZE (p1));
3752 idx++)
3753 if ((p2[2 + idx] & p1[2 + idx]) != 0)
3754 break;
3755
3756 if (idx == p2[1]
3757 || idx == CHARSET_BITMAP_SIZE (p1))
3758 {
3759 DEBUG_PRINT (" No match => fast loop.\n");
3760 return true;
3761 }
3762 }
3763 else if ((re_opcode_t) *p1 == charset_not)
3764 {
3765 int idx;
3766 /* We win if the charset_not inside the loop lists
3767 every character listed in the charset after. */
3768 for (idx = 0; idx < (int) p2[1]; idx++)
3769 if (! (p2[2 + idx] == 0
3770 || (idx < CHARSET_BITMAP_SIZE (p1)
3771 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
3772 break;
3773
3774 if (idx == p2[1])
3775 {
3776 DEBUG_PRINT (" No match => fast loop.\n");
3777 return true;
3778 }
3779 }
3780 }
3781 }
3782 break;
3783
3784 case charset_not:
3785 switch (*p1)
3786 {
3787 case exactn:
3788 case charset:
3789 /* Reuse the code above. */
3790 return mutually_exclusive_p (bufp, p2, p1);
3791 case charset_not:
3792 /* When we have two charset_not, it's very unlikely that
3793 they don't overlap. The union of the two sets of excluded
3794 chars should cover all possible chars, which, as a matter of
3795 fact, is virtually impossible in multibyte buffers. */
3796 break;
3797 }
3798 break;
3799
3800 case wordend:
3801 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
3802 case symend:
3803 return ((re_opcode_t) *p1 == syntaxspec
3804 && (p1[1] == Ssymbol || p1[1] == Sword));
3805 case notsyntaxspec:
3806 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
3807
3808 case wordbeg:
3809 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
3810 case symbeg:
3811 return ((re_opcode_t) *p1 == notsyntaxspec
3812 && (p1[1] == Ssymbol || p1[1] == Sword));
3813 case syntaxspec:
3814 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
3815
3816 case wordbound:
3817 return (((re_opcode_t) *p1 == notsyntaxspec
3818 || (re_opcode_t) *p1 == syntaxspec)
3819 && p1[1] == Sword);
3820
3821 case categoryspec:
3822 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
3823 case notcategoryspec:
3824 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
3825
3826 case on_failure_jump_nastyloop:
3827 case on_failure_jump_smart:
3828 case on_failure_jump_loop:
3829 case on_failure_keep_string_jump:
3830 case on_failure_jump:
3831 {
3832 int mcnt;
3833 p2++;
3834 EXTRACT_NUMBER_AND_INCR (mcnt, p2);
3835 /* Don't just test `mcnt > 0` because non-greedy loops have
3836 their test at the end with an unconditional jump at the start. */
3837 if (p2 + mcnt > p2_orig) /* Ensure forward progress. */
3838 return (mutually_exclusive_p (bufp, p1, p2)
3839 && mutually_exclusive_p (bufp, p1, p2 + mcnt));
3840 break;
3841 }
3842
3843 default:
3844 ;
3845 }
3846
3847 /* Safe default. */
3848 return false;
3849 }
3850
3851
3852 /* Matching routines. */
3853
3854 /* re_match_2 matches the compiled pattern in BUFP against the
3855 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3856 and SIZE2, respectively). We start matching at POS, and stop
3857 matching at STOP.
3858
3859 If REGS is non-null, store offsets for the substring each group
3860 matched in REGS.
3861
3862 We return -1 if no match, -2 if an internal error (such as the
3863 failure stack overflowing). Otherwise, we return the length of the
3864 matched substring. */
3865
3866 ptrdiff_t
3867 re_match_2 (struct re_pattern_buffer *bufp,
3868 char const *string1, ptrdiff_t size1,
3869 char const *string2, ptrdiff_t size2,
3870 ptrdiff_t pos, struct re_registers *regs, ptrdiff_t stop)
3871 {
3872 ptrdiff_t result;
3873
3874 ptrdiff_t charpos;
3875 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
3876 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
3877 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3878
3879 result = re_match_2_internal (bufp, (re_char *) string1, size1,
3880 (re_char *) string2, size2,
3881 pos, regs, stop);
3882 return result;
3883 }
3884
3885 static void
3886 unwind_re_match (void *ptr)
3887 {
3888 struct buffer *b = (struct buffer *) ptr;
3889 b->text->inhibit_shrinking = 0;
3890 }
3891
3892 /* This is a separate function so that we can force an alloca cleanup
3893 afterwards. */
3894 static ptrdiff_t
3895 re_match_2_internal (struct re_pattern_buffer *bufp,
3896 re_char *string1, ptrdiff_t size1,
3897 re_char *string2, ptrdiff_t size2,
3898 ptrdiff_t pos, struct re_registers *regs, ptrdiff_t stop)
3899 {
3900 eassume (0 <= size1);
3901 eassume (0 <= size2);
3902 eassume (0 <= pos && pos <= stop && stop <= size1 + size2);
3903
3904 /* General temporaries. */
3905 int mcnt;
3906
3907 /* Just past the end of the corresponding string. */
3908 re_char *end1, *end2;
3909
3910 /* Pointers into string1 and string2, just past the last characters in
3911 each to consider matching. */
3912 re_char *end_match_1, *end_match_2;
3913
3914 /* Where we are in the data, and the end of the current string. */
3915 re_char *d, *dend;
3916
3917 /* Used sometimes to remember where we were before starting matching
3918 an operator so that we can go back in case of failure. This "atomic"
3919 behavior of matching opcodes is indispensable to the correctness
3920 of the on_failure_keep_string_jump optimization. */
3921 re_char *dfail;
3922
3923 /* Where we are in the pattern, and the end of the pattern. */
3924 re_char *p = bufp->buffer;
3925 re_char *pend = p + bufp->used;
3926
3927 /* We use this to map every character in the string. */
3928 Lisp_Object translate = bufp->translate;
3929
3930 /* True if BUFP is setup from a multibyte regex. */
3931 bool multibyte = RE_MULTIBYTE_P (bufp);
3932
3933 /* True if STRING1/STRING2 are multibyte. */
3934 bool target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
3935
3936 /* Failure point stack. Each place that can handle a failure further
3937 down the line pushes a failure point on this stack. It consists of
3938 regstart, and regend for all registers corresponding to
3939 the subexpressions we're currently inside, plus the number of such
3940 registers, and, finally, two char *'s. The first char * is where
3941 to resume scanning the pattern; the second one is where to resume
3942 scanning the strings. */
3943 fail_stack_type fail_stack;
3944 #ifdef DEBUG_COMPILES_ARGUMENTS
3945 ptrdiff_t nfailure_points_pushed = 0, nfailure_points_popped = 0;
3946 #endif
3947
3948 /* We fill all the registers internally, independent of what we
3949 return, for use in backreferences. The number here includes
3950 an element for register zero. */
3951 ptrdiff_t num_regs = bufp->re_nsub + 1;
3952 eassume (0 < num_regs);
3953
3954 /* Information on the contents of registers. These are pointers into
3955 the input strings; they record just what was matched (on this
3956 attempt) by a subexpression part of the pattern, that is, the
3957 regnum-th regstart pointer points to where in the pattern we began
3958 matching and the regnum-th regend points to right after where we
3959 stopped matching the regnum-th subexpression. */
3960 re_char **regstart UNINIT, **regend UNINIT;
3961
3962 /* The following record the register info as found in the above
3963 variables when we find a match better than any we've seen before.
3964 This happens as we backtrack through the failure points, which in
3965 turn happens only if we have not yet matched the entire string. */
3966 bool best_regs_set = false;
3967 re_char **best_regstart UNINIT, **best_regend UNINIT;
3968
3969 /* Logically, this is 'best_regend[0]'. But we don't want to have to
3970 allocate space for that if we're not allocating space for anything
3971 else (see below). Also, we never need info about register 0 for
3972 any of the other register vectors, and it seems rather a kludge to
3973 treat 'best_regend' differently from the rest. So we keep track of
3974 the end of the best match so far in a separate variable. We
3975 initialize this to NULL so that when we backtrack the first time
3976 and need to test it, it's not garbage. */
3977 re_char *match_end = NULL;
3978
3979 /* This keeps track of how many buffer/string positions we examined. */
3980 ptrdiff_t nchars = 0;
3981
3982 #ifdef DEBUG_COMPILES_ARGUMENTS
3983 /* Counts the total number of registers pushed. */
3984 ptrdiff_t num_regs_pushed = 0;
3985 #endif
3986
3987 DEBUG_PRINT ("\nEntering re_match_2.\n");
3988
3989 REGEX_USE_SAFE_ALLOCA;
3990
3991 INIT_FAIL_STACK ();
3992
3993 specpdl_ref count = SPECPDL_INDEX ();
3994
3995 /* Prevent shrinking and relocation of buffer text if GC happens
3996 while we are inside this function. The calls to
3997 UPDATE_SYNTAX_TABLE_* macros can call Lisp (via
3998 `internal--syntax-propertize`); these calls are careful to defend against
3999 buffer modifications, but even with no modifications, the buffer text may
4000 be relocated during GC by `compact_buffer` which would invalidate
4001 our C pointers to buffer text. */
4002 if (!current_buffer->text->inhibit_shrinking)
4003 {
4004 record_unwind_protect_ptr (unwind_re_match, current_buffer);
4005 current_buffer->text->inhibit_shrinking = 1;
4006 }
4007
4008 /* Do not bother to initialize all the register variables if there are
4009 no groups in the pattern, as it takes a fair amount of time. If
4010 there are groups, we include space for register 0 (the whole
4011 pattern) in REGSTART[0], even though we never use it, to avoid
4012 the undefined behavior of subtracting 1 from REGSTART. */
4013 ptrdiff_t re_nsub = num_regs - 1;
4014 if (0 < re_nsub)
4015 {
4016 regstart = SAFE_ALLOCA ((re_nsub * 4 + 1) * sizeof *regstart);
4017 regend = regstart + num_regs;
4018 best_regstart = regend + re_nsub;
4019 best_regend = best_regstart + re_nsub;
4020
4021 /* Initialize subexpression text positions to unset, to mark ones
4022 that no start_memory/stop_memory has been seen for. */
4023 for (re_char **apos = regstart + 1; apos < best_regstart + 1; apos++)
4024 *apos = NULL;
4025 }
4026
4027 /* We move 'string1' into 'string2' if the latter's empty -- but not if
4028 'string1' is null. */
4029 if (size2 == 0 && string1 != NULL)
4030 {
4031 string2 = string1;
4032 size2 = size1;
4033 string1 = 0;
4034 size1 = 0;
4035 }
4036 end1 = string1 + size1;
4037 end2 = string2 + size2;
4038
4039 /* P scans through the pattern as D scans through the data.
4040 DEND is the end of the input string that D points within.
4041 Advance D into the following input string whenever necessary, but
4042 this happens before fetching; therefore, at the beginning of the
4043 loop, D can be pointing at the end of a string, but it cannot
4044 equal STRING2. */
4045 if (pos >= size1)
4046 {
4047 /* Only match within string2. */
4048 d = string2 + pos - size1;
4049 dend = end_match_2 = string2 + stop - size1;
4050 end_match_1 = end1; /* Just to give it a value. */
4051 }
4052 else
4053 {
4054 if (stop < size1)
4055 {
4056 /* Only match within string1. */
4057 end_match_1 = string1 + stop;
4058 /* BEWARE!
4059 When we reach end_match_1, PREFETCH normally switches to string2.
4060 But in the present case, this means that just doing a PREFETCH
4061 makes us jump from 'stop' to 'gap' within the string.
4062 What we really want here is for the search to stop as
4063 soon as we hit end_match_1. That's why we set end_match_2
4064 to end_match_1 (since PREFETCH fails as soon as we hit
4065 end_match_2). */
4066 end_match_2 = end_match_1;
4067 }
4068 else
4069 { /* It's important to use this code when STOP == SIZE so that
4070 moving D from end1 to string2 will not prevent the D == DEND
4071 check from catching the end of string. */
4072 end_match_1 = end1;
4073 end_match_2 = string2 + stop - size1;
4074 }
4075 d = string1 + pos;
4076 dend = end_match_1;
4077 }
4078
4079 DEBUG_PRINT ("The compiled pattern is:\n");
4080 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4081 DEBUG_PRINT ("The string to match is: \"");
4082 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4083 DEBUG_PRINT ("\"\n");
4084
4085 /* This loops over pattern commands. It exits by returning from the
4086 function if the match is complete, or it drops through if the match
4087 fails at this starting point in the input data. */
4088 for (;;)
4089 {
4090 DEBUG_PRINT ("\n%p: ", p);
4091
4092 if (p == pend)
4093 {
4094 /* End of pattern means we might have succeeded. */
4095 DEBUG_PRINT ("end of pattern ... ");
4096
4097 /* If we haven't matched the entire string, and we want the
4098 longest match, try backtracking. */
4099 if (d != end_match_2)
4100 {
4101 /* True if this match is the best seen so far. */
4102 bool best_match_p;
4103
4104 {
4105 /* True if this match ends in the same string (string1
4106 or string2) as the best previous match. */
4107 bool same_str_p = (FIRST_STRING_P (match_end)
4108 == FIRST_STRING_P (d));
4109
4110 /* AIX compiler got confused when this was combined
4111 with the previous declaration. */
4112 if (same_str_p)
4113 best_match_p = d > match_end;
4114 else
4115 best_match_p = !FIRST_STRING_P (d);
4116 }
4117
4118 DEBUG_PRINT ("backtracking.\n");
4119
4120 if (!FAIL_STACK_EMPTY ())
4121 { /* More failure points to try. */
4122
4123 /* If exceeds best match so far, save it. */
4124 if (!best_regs_set || best_match_p)
4125 {
4126 best_regs_set = true;
4127 match_end = d;
4128
4129 DEBUG_PRINT ("\nSAVING match as best so far.\n");
4130
4131 for (ptrdiff_t reg = 1; reg < num_regs; reg++)
4132 {
4133 best_regstart[reg] = regstart[reg];
4134 best_regend[reg] = regend[reg];
4135 }
4136 }
4137 goto fail;
4138 }
4139
4140 /* If no failure points, don't restore garbage. And if
4141 last match is real best match, don't restore second
4142 best one. */
4143 else if (best_regs_set && !best_match_p)
4144 {
4145 restore_best_regs:
4146 /* Restore best match. It may happen that 'dend ==
4147 end_match_1' while the restored d is in string2.
4148 For example, the pattern 'x.*y.*z' against the
4149 strings 'x-' and 'y-z-', if the two strings are
4150 not consecutive in memory. */
4151 DEBUG_PRINT ("Restoring best registers.\n");
4152
4153 d = match_end;
4154 dend = ((d >= string1 && d <= end1)
4155 ? end_match_1 : end_match_2);
4156
4157 for (ptrdiff_t reg = 1; reg < num_regs; reg++)
4158 {
4159 regstart[reg] = best_regstart[reg];
4160 regend[reg] = best_regend[reg];
4161 eassert (REG_UNSET (regstart[reg])
4162 <= REG_UNSET (regend[reg]));
4163 }
4164 }
4165 } /* d != end_match_2 */
4166
4167 succeed_label:
4168 DEBUG_PRINT ("Accepting match.\n");
4169
4170 /* If caller wants register contents data back, do it. */
4171 if (regs)
4172 {
4173 /* Have the register data arrays been allocated? */
4174 if (bufp->regs_allocated == REGS_UNALLOCATED)
4175 { /* No. So allocate them with malloc. */
4176 ptrdiff_t n = max (RE_NREGS, num_regs);
4177 regs->start = xnmalloc (n, sizeof *regs->start);
4178 regs->end = xnmalloc (n, sizeof *regs->end);
4179 regs->num_regs = n;
4180 bufp->regs_allocated = REGS_REALLOCATE;
4181 }
4182 else if (bufp->regs_allocated == REGS_REALLOCATE)
4183 { /* Yes. If we need more elements than were already
4184 allocated, reallocate them. If we need fewer, just
4185 leave it alone. */
4186 ptrdiff_t n = regs->num_regs;
4187 if (n < num_regs)
4188 {
4189 n = max (n + (n >> 1), num_regs);
4190 regs->start
4191 = xnrealloc (regs->start, n, sizeof *regs->start);
4192 regs->end = xnrealloc (regs->end, n, sizeof *regs->end);
4193 regs->num_regs = n;
4194 }
4195 }
4196 else
4197 eassert (bufp->regs_allocated == REGS_FIXED);
4198
4199 /* Convert the pointer data in 'regstart' and 'regend' to
4200 indices. Register zero has to be set differently,
4201 since we haven't kept track of any info for it. */
4202 if (regs->num_regs > 0)
4203 {
4204 regs->start[0] = pos;
4205 regs->end[0] = POINTER_TO_OFFSET (d);
4206 }
4207
4208 for (ptrdiff_t reg = 1; reg < num_regs; reg++)
4209 {
4210 eassert (REG_UNSET (regstart[reg])
4211 <= REG_UNSET (regend[reg]));
4212 if (REG_UNSET (regend[reg]))
4213 regs->start[reg] = regs->end[reg] = -1;
4214 else
4215 {
4216 regs->start[reg] = POINTER_TO_OFFSET (regstart[reg]);
4217 regs->end[reg] = POINTER_TO_OFFSET (regend[reg]);
4218 }
4219 }
4220
4221 /* If the regs structure we return has more elements than
4222 were in the pattern, set the extra elements to -1. */
4223 for (ptrdiff_t reg = num_regs; reg < regs->num_regs; reg++)
4224 regs->start[reg] = regs->end[reg] = -1;
4225 }
4226
4227 DEBUG_PRINT ("%td failure points pushed, %td popped (%td remain).\n",
4228 nfailure_points_pushed, nfailure_points_popped,
4229 nfailure_points_pushed - nfailure_points_popped);
4230 DEBUG_PRINT ("%td registers pushed.\n", num_regs_pushed);
4231
4232 ptrdiff_t dcnt = POINTER_TO_OFFSET (d) - pos;
4233
4234 DEBUG_PRINT ("Returning %td from re_match_2.\n", dcnt);
4235
4236 unbind_to (count, Qnil);
4237 SAFE_FREE ();
4238 /* The factor of 50 below is a heuristic that needs to be tuned. It
4239 means we consider 50 buffer positions examined by this function
4240 roughly equivalent to the display engine iterating over a single
4241 buffer position. */
4242 if (max_redisplay_ticks > 0 && nchars > 0)
4243 update_redisplay_ticks (nchars / 50 + 1, NULL);
4244 return dcnt;
4245 }
4246
4247 /* Otherwise match next pattern command. */
4248 switch (*p++)
4249 {
4250 /* Ignore these. Used to ignore the n of succeed_n's which
4251 currently have n == 0. */
4252 case no_op:
4253 DEBUG_PRINT ("EXECUTING no_op.\n");
4254 break;
4255
4256 case succeed:
4257 DEBUG_PRINT ("EXECUTING succeed.\n");
4258 goto succeed_label;
4259
4260 /* Match the next n pattern characters exactly. The following
4261 byte in the pattern defines n, and the n bytes after that
4262 are the characters to match. */
4263 case exactn:
4264 mcnt = *p++;
4265 DEBUG_PRINT ("EXECUTING exactn %d.\n", mcnt);
4266
4267 /* Remember the start point to rollback upon failure. */
4268 dfail = d;
4269
4270 /* The cost of testing 'translate' is comparatively small. */
4271 if (target_multibyte)
4272 do
4273 {
4274 int pat_charlen, buf_charlen;
4275 int pat_ch, buf_ch;
4276
4277 PREFETCH (d = dfail);
4278 if (multibyte)
4279 pat_ch = string_char_and_length (p, &pat_charlen);
4280 else
4281 {
4282 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
4283 pat_charlen = 1;
4284 }
4285 buf_ch = string_char_and_length (d, &buf_charlen);
4286
4287 if (TRANSLATE (buf_ch) != pat_ch)
4288 {
4289 d = dfail;
4290 goto fail;
4291 }
4292
4293 p += pat_charlen;
4294 d += buf_charlen;
4295 mcnt -= pat_charlen;
4296 nchars++;
4297 }
4298 while (mcnt > 0);
4299 else
4300 do
4301 {
4302 int pat_charlen;
4303 int pat_ch, buf_ch;
4304
4305 PREFETCH (d = dfail);
4306 if (multibyte)
4307 {
4308 pat_ch = string_char_and_length (p, &pat_charlen);
4309 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
4310 }
4311 else
4312 {
4313 pat_ch = *p;
4314 pat_charlen = 1;
4315 }
4316 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
4317 if (! CHAR_BYTE8_P (buf_ch))
4318 {
4319 buf_ch = TRANSLATE (buf_ch);
4320 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
4321 if (buf_ch < 0)
4322 buf_ch = *d;
4323 }
4324 else
4325 buf_ch = *d;
4326 if (buf_ch != pat_ch)
4327 {
4328 d = dfail;
4329 goto fail;
4330 }
4331 p += pat_charlen;
4332 d++;
4333 mcnt -= pat_charlen;
4334 nchars++;
4335 }
4336 while (mcnt > 0);
4337
4338 break;
4339
4340
4341 /* Match any character except newline. */
4342 case anychar:
4343 {
4344 int buf_charlen;
4345 int buf_ch;
4346
4347 DEBUG_PRINT ("EXECUTING anychar.\n");
4348
4349 PREFETCH ();
4350 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
4351 target_multibyte);
4352 buf_ch = TRANSLATE (buf_ch);
4353 if (buf_ch == '\n')
4354 goto fail;
4355
4356 DEBUG_PRINT (" Matched \"%d\".\n", *d);
4357 d += buf_charlen;
4358 nchars++;
4359 }
4360 break;
4361
4362
4363 case charset:
4364 case charset_not:
4365 {
4366 /* Whether matching against a unibyte character. */
4367 bool unibyte_char = false;
4368
4369 DEBUG_PRINT ("EXECUTING charset%s.\n",
4370 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
4371
4372 PREFETCH ();
4373 int len;
4374 int corig = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
4375 int c = corig;
4376 if (target_multibyte)
4377 {
4378 int c1;
4379
4380 c = TRANSLATE (c);
4381 c1 = RE_CHAR_TO_UNIBYTE (c);
4382 if (c1 >= 0)
4383 {
4384 unibyte_char = true;
4385 c = c1;
4386 }
4387 }
4388 else
4389 {
4390 int c1 = RE_CHAR_TO_MULTIBYTE (c);
4391
4392 if (! CHAR_BYTE8_P (c1))
4393 {
4394 c1 = TRANSLATE (c1);
4395 c1 = RE_CHAR_TO_UNIBYTE (c1);
4396 if (c1 >= 0)
4397 {
4398 unibyte_char = true;
4399 c = c1;
4400 }
4401 }
4402 else
4403 unibyte_char = true;
4404 }
4405
4406 p -= 1;
4407 if (!execute_charset (&p, c, corig, unibyte_char, translate))
4408 goto fail;
4409
4410 d += len;
4411 nchars++;
4412 }
4413 break;
4414
4415
4416 /* The beginning of a group is represented by start_memory.
4417 The argument is the register number. The text
4418 matched within the group is recorded (in the internal
4419 registers data structure) under the register number. */
4420 case start_memory:
4421 DEBUG_PRINT ("EXECUTING start_memory %d:\n", *p);
4422 eassert (0 < *p && *p < num_regs);
4423
4424 /* In case we need to undo this operation (via backtracking). */
4425 PUSH_FAILURE_REG (*p);
4426
4427 regstart[*p] = d;
4428 DEBUG_PRINT (" regstart: %td\n", POINTER_TO_OFFSET (regstart[*p]));
4429
4430 /* Move past the register number and inner group count. */
4431 p += 1;
4432 break;
4433
4434
4435 /* The stop_memory opcode represents the end of a group. Its
4436 argument is the same as start_memory's: the register number. */
4437 case stop_memory:
4438 DEBUG_PRINT ("EXECUTING stop_memory %d:\n", *p);
4439
4440 eassert (0 < *p && *p < num_regs);
4441 eassert (!REG_UNSET (regstart[*p]));
4442 /* Strictly speaking, there should be code such as:
4443
4444 eassert (REG_UNSET (regend[*p]));
4445 PUSH_FAILURE_REGSTOP (*p);
4446
4447 But the only info to be pushed is regend[*p] and it is known to
4448 be UNSET, so there really isn't anything to push.
4449 Not pushing anything, on the other hand deprives us from the
4450 guarantee that regend[*p] is UNSET since undoing this operation
4451 will not reset its value properly. This is not important since
4452 the value will only be read on the next start_memory or at
4453 the very end and both events can only happen if this stop_memory
4454 is *not* undone. */
4455
4456 regend[*p] = d;
4457 DEBUG_PRINT (" regend: %td\n", POINTER_TO_OFFSET (regend[*p]));
4458
4459 /* Move past the register number and the inner group count. */
4460 p += 1;
4461 break;
4462
4463
4464 /* \<digit> has been turned into a 'duplicate' command which is
4465 followed by the numeric value of <digit> as the register number. */
4466 case duplicate:
4467 {
4468 re_char *d2, *dend2;
4469 int regno = *p++; /* Get which register to match against. */
4470 DEBUG_PRINT ("EXECUTING duplicate %d.\n", regno);
4471
4472 /* Can't back reference a group which we've never matched. */
4473 eassert (0 < regno && regno < num_regs);
4474 eassert (REG_UNSET (regstart[regno]) <= REG_UNSET (regend[regno]));
4475 if (REG_UNSET (regend[regno]))
4476 goto fail;
4477
4478 /* Where in input to try to start matching. */
4479 d2 = regstart[regno];
4480
4481 /* Remember the start point to rollback upon failure. */
4482 dfail = d;
4483
4484 /* Where to stop matching; if both the place to start and
4485 the place to stop matching are in the same string, then
4486 set to the place to stop, otherwise, for now have to use
4487 the end of the first string. */
4488
4489 dend2 = ((FIRST_STRING_P (regstart[regno])
4490 == FIRST_STRING_P (regend[regno]))
4491 ? regend[regno] : end_match_1);
4492 for (;;)
4493 {
4494 ptrdiff_t dcnt;
4495
4496 /* If necessary, advance to next segment in register
4497 contents. */
4498 while (d2 == dend2)
4499 {
4500 if (dend2 == end_match_2) break;
4501 if (dend2 == regend[regno]) break;
4502
4503 /* End of string1 => advance to string2. */
4504 d2 = string2;
4505 dend2 = regend[regno];
4506 }
4507 /* At end of register contents => success */
4508 if (d2 == dend2) break;
4509
4510 /* If necessary, advance to next segment in data. */
4511 PREFETCH (d = dfail);
4512
4513 /* How many characters left in this segment to match. */
4514 dcnt = dend - d;
4515
4516 /* Want how many consecutive characters we can match in
4517 one shot, so, if necessary, adjust the count. */
4518 if (dcnt > dend2 - d2)
4519 dcnt = dend2 - d2;
4520
4521 /* Compare that many; failure if mismatch, else move
4522 past them. */
4523 if (!NILP (translate)
4524 ? bcmp_translate (d, d2, dcnt, translate, target_multibyte)
4525 : memcmp (d, d2, dcnt))
4526 {
4527 d = dfail;
4528 goto fail;
4529 }
4530 d += dcnt, d2 += dcnt;
4531 nchars++;
4532 }
4533 }
4534 break;
4535
4536
4537 /* begline matches the empty string at the beginning of the string,
4538 and after newlines. */
4539 case begline:
4540 DEBUG_PRINT ("EXECUTING begline.\n");
4541
4542 if (AT_STRINGS_BEG (d))
4543 break;
4544 else
4545 {
4546 unsigned c;
4547 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
4548 if (c == '\n')
4549 break;
4550 }
4551 goto fail;
4552
4553
4554 /* endline is the dual of begline. */
4555 case endline:
4556 DEBUG_PRINT ("EXECUTING endline.\n");
4557
4558 if (AT_STRINGS_END (d))
4559 break;
4560 PREFETCH_NOLIMIT ();
4561 if (*d == '\n')
4562 break;
4563 goto fail;
4564
4565
4566 /* Match at the very beginning of the data. */
4567 case begbuf:
4568 DEBUG_PRINT ("EXECUTING begbuf.\n");
4569 if (AT_STRINGS_BEG (d))
4570 break;
4571 goto fail;
4572
4573
4574 /* Match at the very end of the data. */
4575 case endbuf:
4576 DEBUG_PRINT ("EXECUTING endbuf.\n");
4577 if (AT_STRINGS_END (d))
4578 break;
4579 goto fail;
4580
4581
4582 /* on_failure_keep_string_jump is used to optimize '.*\n'. It
4583 pushes NULL as the value for the string on the stack. Then
4584 'POP_FAILURE_POINT' will keep the current value for the
4585 string, instead of restoring it. To see why, consider
4586 matching 'foo\nbar' against '.*\n'. The .* matches the foo;
4587 then the . fails against the \n. But the next thing we want
4588 to do is match the \n against the \n; if we restored the
4589 string value, we would be back at the foo.
4590
4591 Because this is used only in specific cases, we don't need to
4592 check all the things that 'on_failure_jump' does, to make
4593 sure the right things get saved on the stack. Hence we don't
4594 share its code. The only reason to push anything on the
4595 stack at all is that otherwise we would have to change
4596 'anychar's code to do something besides goto fail in this
4597 case; that seems worse than this. */
4598 case on_failure_keep_string_jump:
4599 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4600 DEBUG_PRINT ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
4601 mcnt, p + mcnt);
4602
4603 PUSH_FAILURE_POINT (p - 3, NULL);
4604 break;
4605
4606 /* A nasty loop is introduced by the non-greedy *? and +?.
4607 With such loops, the stack only ever contains one failure point
4608 at a time, so that a plain on_failure_jump_loop kind of
4609 cycle detection cannot work. Worse yet, such a detection
4610 can not only fail to detect a cycle, but it can also wrongly
4611 detect a cycle (between different instantiations of the same
4612 loop).
4613 So the method used for those nasty loops is a little different:
4614 We use a special cycle-detection-stack-frame which is pushed
4615 when the on_failure_jump_nastyloop failure-point is *popped*.
4616 This special frame thus marks the beginning of one iteration
4617 through the loop and we can hence easily check right here
4618 whether something matched between the beginning and the end of
4619 the loop. */
4620 case on_failure_jump_nastyloop:
4621 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4622 DEBUG_PRINT ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
4623 mcnt, p + mcnt);
4624
4625 eassert ((re_opcode_t)p[-4] == no_op);
4626 {
4627 bool cycle = false;
4628 CHECK_INFINITE_LOOP (p - 4, d);
4629 if (!cycle)
4630 /* If there's a cycle, just continue without pushing
4631 this failure point. The failure point is the "try again"
4632 option, which shouldn't be tried.
4633 We want (x?)*?y\1z to match both xxyz and xxyxz. */
4634 PUSH_FAILURE_POINT (p - 3, d);
4635 }
4636 break;
4637
4638 /* Simple loop detecting on_failure_jump: just check on the
4639 failure stack if the same spot was already hit earlier. */
4640 case on_failure_jump_loop:
4641 on_failure:
4642 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4643 DEBUG_PRINT ("EXECUTING on_failure_jump_loop %d (to %p):\n",
4644 mcnt, p + mcnt);
4645 {
4646 bool cycle = false;
4647 CHECK_INFINITE_LOOP (p - 3, d);
4648 if (cycle)
4649 /* If there's a cycle, get out of the loop, as if the matching
4650 had failed. We used to just 'goto fail' here, but that was
4651 aborting the search a bit too early: we want to keep the
4652 empty-loop-match and keep matching after the loop.
4653 We want (x?)*y\1z to match both xxyz and xxyxz. */
4654 p += mcnt;
4655 else
4656 PUSH_FAILURE_POINT (p - 3, d);
4657 }
4658 break;
4659
4660
4661 /* Uses of on_failure_jump:
4662
4663 Each alternative starts with an on_failure_jump that points
4664 to the beginning of the next alternative. Each alternative
4665 except the last ends with a jump that in effect jumps past
4666 the rest of the alternatives. (They really jump to the
4667 ending jump of the following alternative, because tensioning
4668 these jumps is a hassle.)
4669
4670 Repeats start with an on_failure_jump that points past both
4671 the repetition text and either the following jump or
4672 pop_failure_jump back to this on_failure_jump. */
4673 case on_failure_jump:
4674 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4675 DEBUG_PRINT ("EXECUTING on_failure_jump %d (to %p):\n",
4676 mcnt, p + mcnt);
4677
4678 PUSH_FAILURE_POINT (p -3, d);
4679 break;
4680
4681 /* This operation is used for greedy *.
4682 Compare the beginning of the repeat with what in the
4683 pattern follows its end. If we can establish that there
4684 is nothing that they would both match, i.e., that we
4685 would have to backtrack because of (as in, e.g., 'a*a')
4686 then we can use a non-backtracking loop based on
4687 on_failure_keep_string_jump instead of on_failure_jump. */
4688 case on_failure_jump_smart:
4689 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4690 DEBUG_PRINT ("EXECUTING on_failure_jump_smart %d (to %p).\n",
4691 mcnt, p + mcnt);
4692 {
4693 re_char *p1 = p; /* Next operation. */
4694 /* Discard 'const', making re_search non-reentrant. */
4695 unsigned char *p2 = (unsigned char *) p + mcnt; /* Jump dest. */
4696 unsigned char *p3 = (unsigned char *) p - 3; /* opcode location. */
4697
4698 p -= 3; /* Reset so that we will re-execute the
4699 instruction once it's been changed. */
4700
4701 EXTRACT_NUMBER (mcnt, p2 - 2);
4702
4703 /* Ensure this is indeed the trivial kind of loop
4704 we are expecting. */
4705 eassert (skip_one_char (p1) == p2 - 3);
4706 eassert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
4707 DEBUG_STATEMENT (regex_emacs_debug += 2);
4708 if (mutually_exclusive_p (bufp, p1, p2))
4709 {
4710 /* Use a fast 'on_failure_keep_string_jump' loop. */
4711 DEBUG_PRINT (" smart exclusive => fast loop.\n");
4712 *p3 = (unsigned char) on_failure_keep_string_jump;
4713 STORE_NUMBER (p2 - 2, mcnt + 3);
4714 }
4715 else
4716 {
4717 /* Default to a safe 'on_failure_jump' loop. */
4718 DEBUG_PRINT (" smart default => slow loop.\n");
4719 *p3 = (unsigned char) on_failure_jump;
4720 }
4721 DEBUG_STATEMENT (regex_emacs_debug -= 2);
4722 }
4723 break;
4724
4725 /* Unconditionally jump (without popping any failure points). */
4726 case jump:
4727 unconditional_jump:
4728 maybe_quit ();
4729 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4730 DEBUG_PRINT ("EXECUTING jump %d ", mcnt);
4731 p += mcnt; /* Do the jump. */
4732 DEBUG_PRINT ("(to %p).\n", p);
4733 break;
4734
4735
4736 /* Have to succeed matching what follows at least n times.
4737 After that, handle like 'on_failure_jump'. */
4738 case succeed_n:
4739 /* Signedness doesn't matter since we only compare MCNT to 0. */
4740 EXTRACT_NUMBER (mcnt, p + 2);
4741 DEBUG_PRINT ("EXECUTING succeed_n %d.\n", mcnt);
4742
4743 /* Originally, mcnt is how many times we HAVE to succeed. */
4744 if (mcnt != 0)
4745 {
4746 /* Discard 'const', making re_search non-reentrant. */
4747 unsigned char *p2 = (unsigned char *) p + 2; /* counter loc. */
4748 mcnt--;
4749 p += 4;
4750 PUSH_NUMBER (p2, mcnt);
4751 }
4752 else
4753 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
4754 goto on_failure;
4755 break;
4756
4757 case jump_n:
4758 /* Signedness doesn't matter since we only compare MCNT to 0. */
4759 EXTRACT_NUMBER (mcnt, p + 2);
4760 DEBUG_PRINT ("EXECUTING jump_n %d.\n", mcnt);
4761
4762 /* Originally, this is how many times we CAN jump. */
4763 if (mcnt != 0)
4764 {
4765 /* Discard 'const', making re_search non-reentrant. */
4766 unsigned char *p2 = (unsigned char *) p + 2; /* counter loc. */
4767 mcnt--;
4768 PUSH_NUMBER (p2, mcnt);
4769 goto unconditional_jump;
4770 }
4771 /* If don't have to jump any more, skip over the rest of command. */
4772 else
4773 p += 4;
4774 break;
4775
4776 case set_number_at:
4777 {
4778 unsigned char *p2; /* Location of the counter. */
4779 DEBUG_PRINT ("EXECUTING set_number_at.\n");
4780
4781 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4782 /* Discard 'const', making re_search non-reentrant. */
4783 p2 = (unsigned char *) p + mcnt;
4784 /* Signedness doesn't matter since we only copy MCNT's bits. */
4785 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4786 DEBUG_PRINT (" Setting %p to %d.\n", p2, mcnt);
4787 PUSH_NUMBER (p2, mcnt);
4788 break;
4789 }
4790
4791 case wordbound:
4792 case notwordbound:
4793 {
4794 bool not = (re_opcode_t) *(p - 1) == notwordbound;
4795 DEBUG_PRINT ("EXECUTING %swordbound.\n", not ? "not" : "");
4796
4797 /* We SUCCEED (or FAIL) in one of the following cases: */
4798
4799 /* Case 1: D is at the beginning or the end of string. */
4800 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
4801 not = !not;
4802 else
4803 {
4804 /* C1 is the character before D, S1 is the syntax of C1, C2
4805 is the character at D, and S2 is the syntax of C2. */
4806 int c1, c2;
4807 int s1, s2;
4808 int dummy;
4809 ptrdiff_t offset = PTR_TO_OFFSET (d);
4810 ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
4811 UPDATE_SYNTAX_TABLE (charpos);
4812 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4813 nchars++;
4814 s1 = SYNTAX (c1);
4815 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
4816 PREFETCH_NOLIMIT ();
4817 GET_CHAR_AFTER (c2, d, dummy);
4818 nchars++;
4819 s2 = SYNTAX (c2);
4820
4821 if (/* Case 2: Only one of S1 and S2 is Sword. */
4822 ((s1 == Sword) != (s2 == Sword))
4823 /* Case 3: Both of S1 and S2 are Sword, and macro
4824 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
4825 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
4826 not = !not;
4827 }
4828 if (not)
4829 break;
4830 else
4831 goto fail;
4832 }
4833
4834 case wordbeg:
4835 DEBUG_PRINT ("EXECUTING wordbeg.\n");
4836
4837 /* We FAIL in one of the following cases: */
4838
4839 /* Case 1: D is at the end of string. */
4840 if (AT_STRINGS_END (d))
4841 goto fail;
4842 else
4843 {
4844 /* C1 is the character before D, S1 is the syntax of C1, C2
4845 is the character at D, and S2 is the syntax of C2. */
4846 int c1, c2;
4847 int s1, s2;
4848 int dummy;
4849 ptrdiff_t offset = PTR_TO_OFFSET (d);
4850 ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
4851 UPDATE_SYNTAX_TABLE (charpos);
4852 PREFETCH ();
4853 GET_CHAR_AFTER (c2, d, dummy);
4854 nchars++;
4855 s2 = SYNTAX (c2);
4856
4857 /* Case 2: S2 is not Sword. */
4858 if (s2 != Sword)
4859 goto fail;
4860
4861 /* Case 3: D is not at the beginning of string ... */
4862 if (!AT_STRINGS_BEG (d))
4863 {
4864 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4865 nchars++;
4866 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
4867 s1 = SYNTAX (c1);
4868
4869 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
4870 returns 0. */
4871 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
4872 goto fail;
4873 }
4874 }
4875 break;
4876
4877 case wordend:
4878 DEBUG_PRINT ("EXECUTING wordend.\n");
4879
4880 /* We FAIL in one of the following cases: */
4881
4882 /* Case 1: D is at the beginning of string. */
4883 if (AT_STRINGS_BEG (d))
4884 goto fail;
4885 else
4886 {
4887 /* C1 is the character before D, S1 is the syntax of C1, C2
4888 is the character at D, and S2 is the syntax of C2. */
4889 int c1, c2;
4890 int s1, s2;
4891 int dummy;
4892 ptrdiff_t offset = PTR_TO_OFFSET (d);
4893 ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
4894 UPDATE_SYNTAX_TABLE (charpos);
4895 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4896 nchars++;
4897 s1 = SYNTAX (c1);
4898
4899 /* Case 2: S1 is not Sword. */
4900 if (s1 != Sword)
4901 goto fail;
4902
4903 /* Case 3: D is not at the end of string ... */
4904 if (!AT_STRINGS_END (d))
4905 {
4906 PREFETCH_NOLIMIT ();
4907 GET_CHAR_AFTER (c2, d, dummy);
4908 nchars++;
4909 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
4910 s2 = SYNTAX (c2);
4911
4912 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
4913 returns 0. */
4914 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
4915 goto fail;
4916 }
4917 }
4918 break;
4919
4920 case symbeg:
4921 DEBUG_PRINT ("EXECUTING symbeg.\n");
4922
4923 /* We FAIL in one of the following cases: */
4924
4925 /* Case 1: D is at the end of string. */
4926 if (AT_STRINGS_END (d))
4927 goto fail;
4928 else
4929 {
4930 /* C1 is the character before D, S1 is the syntax of C1, C2
4931 is the character at D, and S2 is the syntax of C2. */
4932 int c1, c2;
4933 int s1, s2;
4934 ptrdiff_t offset = PTR_TO_OFFSET (d);
4935 ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
4936 UPDATE_SYNTAX_TABLE (charpos);
4937 PREFETCH ();
4938 c2 = RE_STRING_CHAR (d, target_multibyte);
4939 nchars++;
4940 s2 = SYNTAX (c2);
4941
4942 /* Case 2: S2 is neither Sword nor Ssymbol. */
4943 if (s2 != Sword && s2 != Ssymbol)
4944 goto fail;
4945
4946 /* Case 3: D is not at the beginning of string ... */
4947 if (!AT_STRINGS_BEG (d))
4948 {
4949 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4950 nchars++;
4951 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
4952 s1 = SYNTAX (c1);
4953
4954 /* ... and S1 is Sword or Ssymbol. */
4955 if (s1 == Sword || s1 == Ssymbol)
4956 goto fail;
4957 }
4958 }
4959 break;
4960
4961 case symend:
4962 DEBUG_PRINT ("EXECUTING symend.\n");
4963
4964 /* We FAIL in one of the following cases: */
4965
4966 /* Case 1: D is at the beginning of string. */
4967 if (AT_STRINGS_BEG (d))
4968 goto fail;
4969 else
4970 {
4971 /* C1 is the character before D, S1 is the syntax of C1, C2
4972 is the character at D, and S2 is the syntax of C2. */
4973 int c1, c2;
4974 int s1, s2;
4975 ptrdiff_t offset = PTR_TO_OFFSET (d);
4976 ptrdiff_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
4977 UPDATE_SYNTAX_TABLE (charpos);
4978 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
4979 nchars++;
4980 s1 = SYNTAX (c1);
4981
4982 /* Case 2: S1 is neither Ssymbol nor Sword. */
4983 if (s1 != Sword && s1 != Ssymbol)
4984 goto fail;
4985
4986 /* Case 3: D is not at the end of string ... */
4987 if (!AT_STRINGS_END (d))
4988 {
4989 PREFETCH_NOLIMIT ();
4990 c2 = RE_STRING_CHAR (d, target_multibyte);
4991 nchars++;
4992 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
4993 s2 = SYNTAX (c2);
4994
4995 /* ... and S2 is Sword or Ssymbol. */
4996 if (s2 == Sword || s2 == Ssymbol)
4997 goto fail;
4998 }
4999 }
5000 break;
5001
5002 case syntaxspec:
5003 case notsyntaxspec:
5004 {
5005 bool not = (re_opcode_t) *(p - 1) == notsyntaxspec;
5006 mcnt = *p++;
5007 DEBUG_PRINT ("EXECUTING %ssyntaxspec %d.\n", not ? "not" : "",
5008 mcnt);
5009 PREFETCH ();
5010 {
5011 ptrdiff_t offset = PTR_TO_OFFSET (d);
5012 ptrdiff_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5013 UPDATE_SYNTAX_TABLE (pos1);
5014 }
5015 {
5016 int len;
5017 int c;
5018
5019 GET_CHAR_AFTER (c, d, len);
5020 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
5021 goto fail;
5022 d += len;
5023 nchars++;
5024 }
5025 }
5026 break;
5027
5028 case at_dot:
5029 DEBUG_PRINT ("EXECUTING at_dot.\n");
5030 if (PTR_BYTE_POS (d) != PT_BYTE)
5031 goto fail;
5032 break;
5033
5034 case categoryspec:
5035 case notcategoryspec:
5036 {
5037 bool not = (re_opcode_t) *(p - 1) == notcategoryspec;
5038 mcnt = *p++;
5039 DEBUG_PRINT ("EXECUTING %scategoryspec %d.\n",
5040 not ? "not" : "", mcnt);
5041 PREFETCH ();
5042
5043 {
5044 int len;
5045 int c;
5046 GET_CHAR_AFTER (c, d, len);
5047 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
5048 goto fail;
5049 d += len;
5050 nchars++;
5051 }
5052 }
5053 break;
5054
5055 default:
5056 abort ();
5057 }
5058 continue; /* Successfully executed one pattern command; keep going. */
5059
5060
5061 /* We goto here if a matching operation fails. */
5062 fail:
5063 maybe_quit ();
5064 if (!FAIL_STACK_EMPTY ())
5065 {
5066 re_char *str, *pat;
5067 /* A restart point is known. Restore to that state. */
5068 DEBUG_PRINT ("\nFAIL:\n");
5069 POP_FAILURE_POINT (str, pat);
5070 switch (*pat++)
5071 {
5072 case on_failure_keep_string_jump:
5073 eassert (str == NULL);
5074 goto continue_failure_jump;
5075
5076 case on_failure_jump_nastyloop:
5077 eassert ((re_opcode_t)pat[-2] == no_op);
5078 PUSH_FAILURE_POINT (pat - 2, str);
5079 FALLTHROUGH;
5080 case on_failure_jump_loop:
5081 case on_failure_jump:
5082 case succeed_n:
5083 d = str;
5084 continue_failure_jump:
5085 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
5086 p = pat + mcnt;
5087 break;
5088
5089 case no_op:
5090 /* A special frame used for nastyloops. */
5091 goto fail;
5092
5093 default:
5094 abort ();
5095 }
5096
5097 eassert (p >= bufp->buffer && p <= pend);
5098
5099 if (d >= string1 && d <= end1)
5100 dend = end_match_1;
5101 }
5102 else
5103 break; /* Matching at this starting point really fails. */
5104 } /* for (;;) */
5105
5106 if (best_regs_set)
5107 goto restore_best_regs;
5108
5109 unbind_to (count, Qnil);
5110 SAFE_FREE ();
5111
5112 if (max_redisplay_ticks > 0 && nchars > 0)
5113 update_redisplay_ticks (nchars / 50 + 1, NULL);
5114
5115 return -1; /* Failure to match. */
5116 }
5117
5118 /* Subroutine definitions for re_match_2. */
5119
5120 /* Return true if TRANSLATE[S1] and TRANSLATE[S2] are not identical
5121 for LEN bytes. */
5122
5123 static bool
5124 bcmp_translate (re_char *s1, re_char *s2, ptrdiff_t len,
5125 Lisp_Object translate, bool target_multibyte)
5126 {
5127 re_char *p1 = s1, *p2 = s2;
5128 re_char *p1_end = s1 + len;
5129 re_char *p2_end = s2 + len;
5130
5131 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
5132 different lengths, but relying on a single LEN would break this. -sm */
5133 while (p1 < p1_end && p2 < p2_end)
5134 {
5135 int p1_charlen, p2_charlen;
5136 int p1_ch, p2_ch;
5137
5138 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
5139 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
5140
5141 if (RE_TRANSLATE (translate, p1_ch)
5142 != RE_TRANSLATE (translate, p2_ch))
5143 return true;
5144
5145 p1 += p1_charlen, p2 += p2_charlen;
5146 }
5147
5148 return p1 != p1_end || p2 != p2_end;
5149 }
5150
5151 /* Entry points for GNU code. */
5152
5153 /* re_compile_pattern is the GNU regular expression compiler: it
5154 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5155 Returns 0 if the pattern was valid, otherwise an error string.
5156
5157 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
5158 are set in BUFP on entry.
5159
5160 We call regex_compile to do the actual compilation. */
5161
5162 const char *
5163 re_compile_pattern (const char *pattern, ptrdiff_t length,
5164 bool posix_backtracking, const char *whitespace_regexp,
5165 struct re_pattern_buffer *bufp)
5166 {
5167 bufp->regs_allocated = REGS_UNALLOCATED;
5168
5169 reg_errcode_t ret
5170 = regex_compile ((re_char *) pattern, length,
5171 posix_backtracking,
5172 whitespace_regexp,
5173 bufp);
5174
5175 if (!ret)
5176 return NULL;
5177 return re_error_msgid[ret];
5178 }