1 /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
2
3 Copyright (C) 2000-2001, 2004-2005, 2009-2023 Free Software Foundation,
4 Inc.
5
6 Author: Eli Zaretskii <eliz@gnu.org>
7
8 This file is part of GNU Emacs.
9
10 GNU Emacs is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation, either version 3 of the License, or (at
13 your option) any later version.
14
15 GNU Emacs is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>. */
22
23 /* A sequential implementation of the Unicode Bidirectional algorithm,
24 (UBA) as per UAX#9, a part of the Unicode Standard.
25
26 Unlike the Reference Implementation and most other implementations,
27 this one is designed to be called once for every character in the
28 buffer or string. That way, we can leave intact the design of the
29 Emacs display engine, whereby an iterator object is used to
30 traverse buffer or string text character by character, and generate
31 the necessary data for displaying each character in 'struct glyph'
32 objects. (See xdisp.c for the details of that iteration.) The
33 functions on this file replace the original linear iteration in the
34 logical order of the text with a non-linear iteration in the visual
35 order, i.e. in the order characters should be shown on display.
36
37 The main entry point is bidi_move_to_visually_next. Each time it
38 is called, it finds the next character in the visual order, and
39 returns its information in a special structure. The caller is then
40 expected to process this character for display or any other
41 purposes, and call bidi_move_to_visually_next for the next
42 character. See the comments in bidi_move_to_visually_next for more
43 details about its algorithm that finds the next visual-order
44 character by resolving their levels on the fly.
45
46 Two other entry points are bidi_paragraph_init and
47 bidi_mirror_char. The first determines the base direction of a
48 paragraph, while the second returns the mirrored version of its
49 argument character.
50
51 A few auxiliary entry points are used to initialize the bidi
52 iterator for iterating an object (buffer or string), push and pop
53 the bidi iterator state, and save and restore the state of the bidi
54 cache.
55
56 If you want to understand the code, you will have to read it
57 together with the relevant portions of UAX#9. The comments include
58 references to UAX#9 rules, for that very reason.
59
60 A note about references to UAX#9 rules: if the reference says
61 something like "X9/Retaining", it means that you need to refer to
62 rule X9 and to its modifications described in the "Implementation
63 Notes" section of UAX#9, under "Retaining Format Codes".
64
65 Here's the overview of the design of the reordering engine
66 implemented by this file.
67
68 Basic implementation structure
69 ------------------------------
70
71 The sequential processing steps described by UAX#9 are implemented
72 as recursive levels of processing, all of which examine the next
73 character in the logical order. This hierarchy of processing looks
74 as follows, from the innermost (deepest) to the outermost level,
75 omitting some subroutines used by each level:
76
77 bidi_fetch_char -- fetch next character
78 bidi_resolve_explicit -- resolve explicit levels and directions
79 bidi_resolve_weak -- resolve weak types
80 bidi_resolve_brackets -- resolve "paired brackets" neutral types
81 bidi_resolve_neutral -- resolve neutral types
82 bidi_level_of_next_char -- resolve implicit levels
83
84 Each level calls the level below it, and works on the result
85 returned by the lower level, including all of its sub-levels.
86
87 Unlike all the levels below it, bidi_level_of_next_char can return
88 the information about either the next or previous character in the
89 logical order, depending on the current direction of scanning the
90 buffer or string. For the next character, it calls all the levels
91 below it; for the previous character, it uses the cache, described
92 below.
93
94 Thus, the result of calling bidi_level_of_next_char is the resolved
95 level of the next or the previous character in the logical order.
96 Based on this information, the function bidi_move_to_visually_next
97 finds the next character in the visual order and updates the
98 direction in which the buffer is scanned, either forward or
99 backward, to find the next character to be displayed. (Text is
100 scanned backwards when it needs to be reversed for display, i.e. if
101 the visual order is the inverse of the logical order.) This
102 implements the last, reordering steps of the UBA, by successively
103 calling bidi_level_of_next_char until the character of the required
104 embedding level is found; the scan direction is dynamically updated
105 as a side effect. See the commentary before the 'while' loop in
106 bidi_move_to_visually_next, for the details.
107
108 Fetching characters
109 -------------------
110
111 In a nutshell, fetching the next character boils down to calling
112 string_char_and_length, passing it the address of a buffer or
113 string position. See bidi_fetch_char. However, if the next
114 character is "covered" by a display property of some kind,
115 bidi_fetch_char returns the u+FFFC "object replacement character"
116 that represents the entire run of text covered by the display
117 property. (The ch_len and nchars members of 'struct bidi_it'
118 reflect the length in bytes and characters of that text.) This is
119 so we reorder text on both sides of the display property as
120 appropriate for an image or embedded string. Similarly, text
121 covered by a display spec of the form '(space ...)', is replaced
122 with the u+2029 paragraph separator character, so such display
123 specs produce the same effect as a TAB under UBA. Both these
124 special characters are not actually displayed -- the display
125 property is displayed instead -- but just used to compute the
126 embedding level of the surrounding text so as to produce the
127 required effect.
128
129 Bidi iterator states
130 --------------------
131
132 The UBA is highly context dependent in some of its parts,
133 i.e. results of processing a character can generally depend on
134 characters very far away. The UAX#9 description of the UBA
135 prescribes a stateful processing of each character, whereby the
136 results of this processing depend on various state variables, such
137 as the current embedding level, level stack, and directional
138 override status. In addition, the UAX#9 description includes many
139 passages like this (from rule W2 in this case):
140
141 Search backward from each instance of a European number until the
142 first strong type (R, L, AL, or sos) is found. If an AL is found,
143 change the type of the European number to Arabic number.
144
145 To support this, we use a bidi iterator object, 'struct bidi_it',
146 which is a sub-structure of 'struct it' used by xdisp.c (see
147 dispextern.h for the definition of both of these structures). The
148 bidi iterator holds the entire state of the iteration required by
149 the UBA, and is updated as the text is traversed. In particular,
150 the embedding level of the current character being resolved is
151 recorded in the iterator state. To avoid costly searches backward
152 in support of rules like W2 above, the necessary character types
153 are also recorded in the iterator state as they are found during
154 the forward scan, and then used when such rules need to be applied.
155 (Forward scans cannot be avoided in this way; they need to be
156 performed at least once, and the results recorded in the iterator
157 state, to be reused until the forward scan oversteps the recorded
158 position.)
159
160 In this manner, the iterator state acts as a mini-cache of
161 contextual information required for resolving the level of the
162 current character by various UBA rules.
163
164 Caching of bidi iterator states
165 -------------------------------
166
167 As described above, the reordering engine uses the information
168 recorded in the bidi iterator state in order to resolve the
169 embedding level of the current character. When the reordering
170 engine needs to process the next character in the logical order, it
171 fetches it and applies to it all the UBA levels, updating the
172 iterator state as it goes. But when the buffer or string is
173 scanned backwards, i.e. in the reverse order of buffer/string
174 positions, the scanned characters were already processed during the
175 preceding forward scan (see bidi_find_other_level_edge). To avoid
176 costly re-processing of characters that were already processed
177 during the forward scan, the iterator states computed while
178 scanning forward are cached.
179
180 The cache is just a linear array of 'struct bidi_it' objects, which
181 is dynamically allocated and reallocated as needed, since the size
182 of the cache depends on the text being processed. We only need the
183 cache while processing embedded levels higher than the base
184 paragraph embedding level, because these higher levels require
185 changes in scan direction. Therefore, as soon as we are back to
186 the base embedding level, we can free the cache; see the calls to
187 bidi_cache_reset and bidi_cache_shrink, for the conditions to do
188 this.
189
190 The cache maintains the index of the next unused cache slot -- this
191 is where the next iterator state will be cached. The function
192 bidi_cache_iterator_state saves an instance of the state in the
193 cache and increments the unused slot index. The companion function
194 bidi_cache_find looks up a cached state that corresponds to a given
195 buffer/string position. All of the cached states must correspond
196 1:1 to the buffer or string region whose processing they reflect;
197 bidi.c will abort if it finds cache slots that violate this 1:1
198 correspondence.
199
200 When the parent iterator 'struct it' is pushed (see push_it in
201 xdisp.c) to pause the current iteration and start iterating over a
202 different object (e.g., a 'display' string that covers some buffer
203 text), the bidi iterator cache needs to be "pushed" as well, so
204 that a new empty cache could be used while iterating over the new
205 object. Later, when the new object is exhausted, and xdisp.c calls
206 pop_it, we need to "pop" the bidi cache as well and return to the
207 original cache. See bidi_push_it and bidi_pop_it for how this is
208 done.
209
210 Some functions of the display engine save copies of 'struct it' in
211 local variables, and restore them later. For examples, see
212 pos_visible_p and move_it_in_display_line_to in xdisp.c, and
213 window_scroll_pixel_based in window.c. When this happens, we need
214 to save and restore the bidi cache as well, because conceptually
215 the cache is part of the 'struct it' state, and needs to be in
216 perfect sync with the portion of the buffer/string that is being
217 processed. This saving and restoring of the cache state is handled
218 by bidi_shelve_cache and bidi_unshelve_cache, and the helper macros
219 SAVE_IT and RESTORE_IT defined on xdisp.c.
220
221 Note that, because reordering is implemented below the level in
222 xdisp.c that breaks glyphs into screen lines, we are violating
223 paragraph 3.4 of UAX#9. which mandates that line breaking shall be
224 done before reordering each screen line separately. However,
225 following UAX#9 to the letter in this matter goes against the basic
226 design of the Emacs display engine, and so we choose here this
227 minor deviation from the UBA letter in preference to redesign of
228 the display engine. The effect of this is only seen in continued
229 lines that are broken into screen lines in the middle of a run
230 whose direction is opposite to the paragraph's base direction.
231
232 Important design and implementation note: when the code needs to
233 scan far ahead, be sure to avoid such scans as much as possible
234 when the buffer/string doesn't contain any RTL characters. Users
235 of left-to-right scripts will never forgive you if you introduce
236 some slow-down due to bidi in situations that don't involve any
237 bidirectional text. See the large comment near the beginning of
238 bidi_resolve_neutral, for one situation where such shortcut was
239 necessary. */
240
241 #include <config.h>
242
243 #include "lisp.h"
244 #include "character.h"
245 #include "buffer.h"
246 #include "dispextern.h"
247 #include "region-cache.h"
248 #include "sysstdio.h"
249
250 static bool bidi_initialized = 0;
251
252 static Lisp_Object bidi_type_table, bidi_mirror_table, bidi_brackets_table;
253
254 #define BIDI_EOB (-1)
255
256 /* Data type for describing the bidirectional character categories. */
257 typedef enum {
258 UNKNOWN_BC,
259 NEUTRAL,
260 WEAK,
261 STRONG,
262 EXPLICIT_FORMATTING
263 } bidi_category_t;
264
265 static Lisp_Object paragraph_start_re, paragraph_separate_re;
266
267
268 /***********************************************************************
269 Utilities
270 ***********************************************************************/
271
272 /* Return the bidi type of a character CH, subject to the current
273 directional OVERRIDE. */
274 static bidi_type_t
275 bidi_get_type (int ch, bidi_dir_t override)
276 {
277 bidi_type_t default_type;
278
279 if (ch == BIDI_EOB)
280 return NEUTRAL_B;
281 if (ch < 0 || ch > MAX_CHAR)
282 emacs_abort ();
283
284 default_type = (bidi_type_t) XFIXNUM (CHAR_TABLE_REF (bidi_type_table, ch));
285 /* Every valid character code, even those that are unassigned by the
286 UCD, have some bidi-class property, according to
287 DerivedBidiClass.txt file. Therefore, if we ever get UNKNOWN_BT
288 (= zero) code from CHAR_TABLE_REF, that's a bug. */
289 if (default_type == UNKNOWN_BT)
290 emacs_abort ();
291
292 switch (default_type)
293 {
294 case WEAK_BN:
295 case NEUTRAL_B:
296 case LRE:
297 case LRO:
298 case RLE:
299 case RLO:
300 case PDF:
301 case LRI:
302 case RLI:
303 case FSI:
304 case PDI:
305 return default_type;
306 default:
307 if (override == L2R)
308 return STRONG_L;
309 else if (override == R2L)
310 return STRONG_R;
311 else
312 return default_type;
313 }
314 }
315
316 static void
317 bidi_check_type (bidi_type_t type)
318 {
319 eassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
320 }
321
322 /* Given a bidi TYPE of a character, return its category. */
323 static bidi_category_t
324 bidi_get_category (bidi_type_t type)
325 {
326 switch (type)
327 {
328 case UNKNOWN_BT:
329 return UNKNOWN_BC;
330 case STRONG_L:
331 case STRONG_R:
332 case STRONG_AL:
333 return STRONG;
334 case WEAK_EN:
335 case WEAK_ES:
336 case WEAK_ET:
337 case WEAK_AN:
338 case WEAK_CS:
339 case WEAK_NSM:
340 case WEAK_BN:
341 return WEAK;
342 case NEUTRAL_B:
343 case NEUTRAL_S:
344 case NEUTRAL_WS:
345 case NEUTRAL_ON:
346 return NEUTRAL;
347 case LRE:
348 case LRO:
349 case RLE:
350 case RLO:
351 case PDF:
352 case LRI:
353 case RLI:
354 case FSI:
355 case PDI:
356 return EXPLICIT_FORMATTING;
357 default:
358 emacs_abort ();
359 }
360 }
361
362 static bool
363 bidi_isolate_fmt_char (bidi_type_t ch_type)
364 {
365 return (ch_type == LRI || ch_type == RLI || ch_type == PDI || ch_type == FSI);
366 }
367
368 /* Return the mirrored character of C, if it has one. If C has no
369 mirrored counterpart, return C.
370 Note: The conditions in UAX#9 clause L4 regarding the surrounding
371 context must be tested by the caller. */
372 int
373 bidi_mirror_char (int c)
374 {
375 Lisp_Object val;
376
377 if (c == BIDI_EOB)
378 return c;
379 if (c < 0 || c > MAX_CHAR)
380 emacs_abort ();
381
382 val = CHAR_TABLE_REF (bidi_mirror_table, c);
383 if (FIXNUMP (val))
384 {
385 int v;
386
387 /* When debugging, check before assigning to V, so that the check
388 isn't broken by undefined behavior due to int overflow. */
389 eassert (CHAR_VALID_P (XFIXNUM (val)));
390
391 v = XFIXNUM (val);
392
393 /* Minimal test we must do in optimized builds, to prevent weird
394 crashes further down the road. */
395 if (v < 0 || v > MAX_CHAR)
396 emacs_abort ();
397
398 return v;
399 }
400
401 return c;
402 }
403
404 /* Return the Bidi_Paired_Bracket_Type property of the character C. */
405 static bidi_bracket_type_t
406 bidi_paired_bracket_type (int c)
407 {
408 if (c == BIDI_EOB || bidi_inhibit_bpa)
409 return BIDI_BRACKET_NONE;
410 if (c < 0 || c > MAX_CHAR)
411 emacs_abort ();
412
413 return (bidi_bracket_type_t) XFIXNUM (CHAR_TABLE_REF (bidi_brackets_table, c));
414 }
415
416 /* Determine the start-of-sequence (sos) directional type given the two
417 embedding levels on either side of the run boundary. Also, update
418 the saved info about previously seen characters, since that info is
419 generally valid for a single level run. */
420 static void
421 bidi_set_sos_type (struct bidi_it *bidi_it, int level_before, int level_after)
422 {
423 int higher_level = (level_before > level_after ? level_before : level_after);
424
425 /* FIXME: should the default sos direction be user selectable? */
426 bidi_it->sos = ((higher_level & 1) != 0 ? R2L : L2R); /* X10 */
427
428 bidi_it->prev.type = UNKNOWN_BT;
429 bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
430 bidi_it->prev_for_neutral.type = (bidi_it->sos == R2L ? STRONG_R : STRONG_L);
431 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
432 bidi_it->next_for_neutral.type
433 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
434 }
435
436 #define ISOLATE_STATUS(BIDI_IT, IDX) ((BIDI_IT)->level_stack[IDX].flags & 1)
437 #define OVERRIDE(BIDI_IT, IDX) (((BIDI_IT)->level_stack[IDX].flags >> 1) & 3)
438
439 /* Push the current embedding level and override status; reset the
440 current level to LEVEL and the current override status to OVERRIDE. */
441 static void
442 bidi_push_embedding_level (struct bidi_it *bidi_it,
443 int level, bidi_dir_t override, bool isolate_status)
444 {
445 struct bidi_stack *st;
446 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
447
448 bidi_it->stack_idx++;
449 eassert (bidi_it->stack_idx < BIDI_MAXDEPTH+2+1);
450 st = &bidi_it->level_stack[bidi_it->stack_idx];
451 eassert (level <= (1 << 7));
452 st->level = level;
453 st->flags = (((override & 3) << 1) | (isolate_status != 0));
454 if (isolate_status)
455 {
456 st->last_strong_type = bidi_it->last_strong.type;
457 st->prev_for_neutral_type = bidi_it->prev_for_neutral.type;
458 st->next_for_neutral_type = bidi_it->next_for_neutral.type;
459 st->next_for_neutral_pos = bidi_it->next_for_neutral.charpos;
460 st->flags |= ((bidi_it->sos == L2R ? 0 : 1) << 3);
461 }
462 /* We've got a new isolating sequence, compute the directional type
463 of sos and initialize per-sequence variables (UAX#9, clause X10). */
464 bidi_set_sos_type (bidi_it, prev_level, level);
465 }
466
467 /* Pop from the stack the embedding level, the directional override
468 status, and optionally saved information for the isolating run
469 sequence. Return the new level. */
470 static int
471 bidi_pop_embedding_level (struct bidi_it *bidi_it)
472 {
473 int level;
474
475 /* UAX#9 says to ignore invalid PDFs (X7, last bullet)
476 and PDIs (X6a, 2nd bullet). */
477 if (bidi_it->stack_idx > 0)
478 {
479 bool isolate_status = ISOLATE_STATUS (bidi_it, bidi_it->stack_idx);
480 int old_level = bidi_it->level_stack[bidi_it->stack_idx].level;
481
482 struct bidi_stack st;
483
484 st = bidi_it->level_stack[bidi_it->stack_idx];
485 if (isolate_status)
486 {
487 bidi_dir_t sos = ((st.flags >> 3) & 1);
488 /* PREV is used in W1 for resolving WEAK_NSM. By the time
489 we get to an NSM, we must have gotten past at least one
490 character: the PDI that ends the isolate from which we
491 are popping here. So PREV will have been filled up by
492 the time we first use it. We initialize it here to
493 UNKNOWN_BT to be able to catch any blunders in this
494 logic. */
495 bidi_it->prev.orig_type = bidi_it->prev.type = UNKNOWN_BT;
496 bidi_it->last_strong.type = st.last_strong_type;
497 bidi_it->prev_for_neutral.type = st.prev_for_neutral_type;
498 bidi_it->next_for_neutral.type = st.next_for_neutral_type;
499 bidi_it->next_for_neutral.charpos = st.next_for_neutral_pos;
500 bidi_it->sos = (sos == 0 ? L2R : R2L);
501 }
502 else
503 bidi_set_sos_type (bidi_it, old_level,
504 bidi_it->level_stack[bidi_it->stack_idx - 1].level);
505
506 bidi_it->stack_idx--;
507 }
508 level = bidi_it->level_stack[bidi_it->stack_idx].level;
509 eassert (0 <= level && level <= BIDI_MAXDEPTH + 1);
510 return level;
511 }
512
513 /* Record in SAVED_INFO the information about the current character. */
514 static void
515 bidi_remember_char (struct bidi_saved_info *saved_info,
516 struct bidi_it *bidi_it, bool from_type)
517 {
518 saved_info->charpos = bidi_it->charpos;
519 if (from_type)
520 saved_info->type = bidi_it->type;
521 else
522 saved_info->type = bidi_it->type_after_wn;
523 bidi_check_type (saved_info->type);
524 saved_info->orig_type = bidi_it->orig_type;
525 bidi_check_type (saved_info->orig_type);
526 }
527
528 /* Copy the bidi iterator from FROM to TO. To save cycles, this only
529 copies the part of the level stack that is actually in use. */
530 static void
531 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
532 {
533 /* Copy everything from the start through the active part of
534 the level stack. */
535 memcpy (to, from,
536 (offsetof (struct bidi_it, level_stack) + sizeof from->level_stack[0]
537 + from->stack_idx * sizeof from->level_stack[0]));
538 }
539
540
541 /***********************************************************************
542 Caching the bidi iterator states
543 ***********************************************************************/
544
545 /* We allocate and de-allocate the cache in chunks of this size (in
546 characters). 200 was chosen as an upper limit for reasonably-long
547 lines in a text file/buffer. */
548 #define BIDI_CACHE_CHUNK 200
549 /* Maximum size we allow the cache to become, per iterator stack slot,
550 in units of struct bidi_it size. If we allow unlimited growth, we
551 could run out of memory for pathologically long bracketed text or
552 very long text lines that need to be reordered. This is aggravated
553 when word-wrap is in effect, since then functions display_line and
554 move_it_in_display_line_to need to keep up to 4 copies of the
555 cache.
556
557 This limitation means there can be no more than that amount of
558 contiguous RTL text on any single physical line in a LTR paragraph,
559 and similarly with contiguous LTR + numeric text in a RTL
560 paragraph. (LTR text in a LTR paragraph and RTL text in a RTL
561 paragraph are not reordered, and so don't need the cache, and
562 cannot hit this limit.) More importantly, no single line can have
563 text longer than this inside paired brackets (because bracket pairs
564 resolution uses the cache). If the limit is exceeded, the fallback
565 code will produce visual order that will be incorrect if there are
566 RTL characters in the offending line of text. */
567 /* Do we need to allow customization of this limit? */
568 #define BIDI_CACHE_MAX_ELTS_PER_SLOT 50000
569 verify (BIDI_CACHE_CHUNK < BIDI_CACHE_MAX_ELTS_PER_SLOT);
570 static ptrdiff_t bidi_cache_max_elts = BIDI_CACHE_MAX_ELTS_PER_SLOT;
571 static struct bidi_it *bidi_cache;
572 static ptrdiff_t bidi_cache_size = 0;
573 enum { elsz = sizeof (struct bidi_it) };
574 static ptrdiff_t bidi_cache_idx; /* next unused cache slot */
575 static ptrdiff_t bidi_cache_last_idx; /* slot of last cache hit */
576 static ptrdiff_t bidi_cache_start = 0; /* start of cache for this
577 "stack" level */
578
579 /* 5-slot stack for saving the start of the previous level of the
580 cache. xdisp.c maintains a 5-slot stack for its iterator state,
581 and we need the same size of our stack. */
582 static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
583 static int bidi_cache_sp;
584
585 /* Size of header used by bidi_shelve_cache. */
586 enum
587 {
588 bidi_shelve_header_size
589 = (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
590 + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
591 + sizeof (bidi_cache_last_idx) + sizeof (bidi_cache_max_elts))
592 };
593
594 /* Effectively remove the cached states beyond the Nth state from the
595 part of the cache relevant to iteration of the current object
596 (buffer or string). */
597 static void
598 bidi_cache_reset_to (int n)
599 {
600 bidi_cache_idx = bidi_cache_start + n;
601 bidi_cache_last_idx = -1;
602 }
603
604 /* Reset the cache state to the empty state. We only reset the part
605 of the cache relevant to iteration of the current object. Previous
606 objects, which are pushed on the display iterator's stack, are left
607 intact. This is called when the cached information is no more
608 useful for the current iteration, e.g. when we were reseated to a
609 new position on the same object. */
610 static void
611 bidi_cache_reset (void)
612 {
613 bidi_cache_reset_to (0);
614 }
615
616 /* Shrink the cache to its minimal size. Called when we init the bidi
617 iterator for reordering a buffer or a string that does not come
618 from display properties, because that means all the previously
619 cached info is of no further use. */
620 static void
621 bidi_cache_shrink (void)
622 {
623 if (bidi_cache_size > BIDI_CACHE_CHUNK)
624 {
625 bidi_cache = xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
626 bidi_cache_size = BIDI_CACHE_CHUNK;
627 }
628 bidi_cache_reset ();
629 bidi_cache_max_elts = BIDI_CACHE_MAX_ELTS_PER_SLOT;
630 }
631
632 static void
633 bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
634 {
635 int current_scan_dir = bidi_it->scan_dir;
636
637 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
638 emacs_abort ();
639
640 bidi_copy_it (bidi_it, &bidi_cache[idx]);
641 bidi_it->scan_dir = current_scan_dir;
642 bidi_cache_last_idx = idx;
643 }
644
645 /* Find a cached state with a given CHARPOS and resolved embedding
646 level less or equal to LEVEL. If LEVEL is -1, disregard the
647 resolved levels in cached states. DIR, if non-zero, means search
648 in that direction from the last cache hit.
649
650 Value is the index of the cached state, or -1 if not found. */
651 static ptrdiff_t
652 bidi_cache_search (ptrdiff_t charpos, int level, int dir)
653 {
654 ptrdiff_t i, i_start;
655
656 if (bidi_cache_idx > bidi_cache_start)
657 {
658 if (bidi_cache_last_idx == -1)
659 bidi_cache_last_idx = bidi_cache_idx - 1;
660 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
661 {
662 dir = -1;
663 i_start = bidi_cache_last_idx - 1;
664 }
665 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
666 + bidi_cache[bidi_cache_last_idx].nchars - 1))
667 {
668 dir = 1;
669 i_start = bidi_cache_last_idx + 1;
670 }
671 else if (dir)
672 i_start = bidi_cache_last_idx;
673 else
674 {
675 dir = -1;
676 i_start = bidi_cache_idx - 1;
677 }
678
679 if (dir < 0)
680 {
681 /* Linear search for now; FIXME! */
682 for (i = i_start; i >= bidi_cache_start; i--)
683 if (bidi_cache[i].charpos <= charpos
684 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
685 && (level == -1 || bidi_cache[i].resolved_level <= level))
686 return i;
687 }
688 else
689 {
690 for (i = i_start; i < bidi_cache_idx; i++)
691 if (bidi_cache[i].charpos <= charpos
692 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
693 && (level == -1 || bidi_cache[i].resolved_level <= level))
694 return i;
695 }
696 }
697
698 return -1;
699 }
700
701 /* Find a cached state where the resolved level changes to a value
702 that is lower than LEVEL, and return its cache slot index. DIR is
703 the direction to search, starting with the last used cache slot.
704 If DIR is zero, we search backwards from the last occupied cache
705 slot. BEFORE means return the index of the slot that
706 is ``before'' the level change in the search direction. That is,
707 given the cached levels like this:
708
709 1122333442211
710 AB C
711
712 and assuming we are at the position cached at the slot marked with
713 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
714 index of slot B or A, depending whether BEFORE is, respectively,
715 true or false. */
716 static ptrdiff_t
717 bidi_cache_find_level_change (int level, int dir, bool before)
718 {
719 if (bidi_cache_idx)
720 {
721 ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
722 int incr = before ? 1 : 0;
723
724 if (i < 0) /* cache overflowed? */
725 i = 0;
726
727 if (!dir)
728 dir = -1;
729 else if (!incr)
730 i += dir;
731
732 if (dir < 0)
733 {
734 while (i >= bidi_cache_start + incr)
735 {
736 if (bidi_cache[i - incr].resolved_level >= 0
737 && bidi_cache[i - incr].resolved_level < level)
738 return i;
739 i--;
740 }
741 }
742 else
743 {
744 while (i < bidi_cache_idx - incr)
745 {
746 if (bidi_cache[i + incr].resolved_level >= 0
747 && bidi_cache[i + incr].resolved_level < level)
748 return i;
749 i++;
750 }
751 }
752 }
753
754 return -1;
755 }
756
757 static void
758 bidi_cache_ensure_space (ptrdiff_t idx)
759 {
760 /* Enlarge the cache as needed. */
761 if (idx >= bidi_cache_size)
762 {
763 ptrdiff_t chunk_size = BIDI_CACHE_CHUNK;
764
765 if (bidi_cache_size > bidi_cache_max_elts - chunk_size)
766 chunk_size = bidi_cache_max_elts - bidi_cache_size;
767
768 if (max (idx + 1,
769 bidi_cache_size + chunk_size) <= bidi_cache_max_elts)
770 {
771 /* The bidi cache cannot be larger than the largest Lisp
772 string or buffer. */
773 ptrdiff_t string_or_buffer_bound
774 = max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
775
776 /* Also, it cannot be larger than what C can represent. */
777 ptrdiff_t c_bound
778 = (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
779 ptrdiff_t max_elts = bidi_cache_max_elts;
780
781 max_elts = min (max_elts, min (string_or_buffer_bound, c_bound));
782
783 /* Force xpalloc not to over-allocate by passing it MAX_ELTS
784 as its 4th argument. */
785 bidi_cache = xpalloc (bidi_cache, &bidi_cache_size,
786 max (chunk_size, idx - bidi_cache_size + 1),
787 max_elts, elsz);
788 eassert (bidi_cache_size > idx);
789 }
790 }
791 }
792
793 static int
794 bidi_cache_iterator_state (struct bidi_it *bidi_it, bool resolved,
795 bool update_only)
796 {
797 ptrdiff_t idx;
798
799 /* We should never cache on backward scans. */
800 if (bidi_it->scan_dir == -1)
801 emacs_abort ();
802 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
803
804 if (idx < 0 && update_only)
805 return 0;
806
807 if (idx < 0)
808 {
809 idx = bidi_cache_idx;
810 bidi_cache_ensure_space (idx);
811 /* Character positions should correspond to cache positions 1:1.
812 If we are outside the range of cached positions, the cache is
813 useless and must be reset. */
814 if (bidi_cache_start < idx && idx < bidi_cache_size
815 && (bidi_it->charpos > (bidi_cache[idx - 1].charpos
816 + bidi_cache[idx - 1].nchars)
817 || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
818 {
819 bidi_cache_reset ();
820 idx = bidi_cache_start;
821 }
822 if (bidi_it->nchars <= 0)
823 emacs_abort ();
824 /* Don't cache if no available space in the cache. */
825 if (bidi_cache_size > idx)
826 {
827 bidi_copy_it (&bidi_cache[idx], bidi_it);
828 if (!resolved)
829 bidi_cache[idx].resolved_level = -1;
830 }
831 }
832 else
833 {
834 /* Copy only the members which could have changed, to avoid
835 costly copying of the entire struct. */
836 bidi_cache[idx].type = bidi_it->type;
837 bidi_check_type (bidi_it->type);
838 bidi_cache[idx].type_after_wn = bidi_it->type_after_wn;
839 bidi_check_type (bidi_it->type_after_wn);
840 if (resolved)
841 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
842 else
843 bidi_cache[idx].resolved_level = -1;
844 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
845 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
846 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
847 bidi_cache[idx].disp_pos = bidi_it->disp_pos;
848 bidi_cache[idx].disp_prop = bidi_it->disp_prop;
849 bidi_cache[idx].bracket_pairing_pos = bidi_it->bracket_pairing_pos;
850 bidi_cache[idx].bracket_enclosed_type = bidi_it->bracket_enclosed_type;
851 }
852
853 if (bidi_cache_size > idx)
854 {
855 bidi_cache_last_idx = idx;
856 if (idx >= bidi_cache_idx)
857 bidi_cache_idx = idx + 1;
858 return 1;
859 }
860 else
861 {
862 /* The cache overflowed. */
863 bidi_cache_last_idx = -1;
864 return 0;
865 }
866 }
867
868 /* Look for a cached iterator state that corresponds to CHARPOS. If
869 found, copy the cached state into BIDI_IT and return the type of
870 the cached entry. If not found, return UNKNOWN_BT. RESOLVED_ONLY
871 zero means it is OK to return cached states that were not fully
872 resolved yet. This can happen if the state was cached before it
873 was resolved in bidi_resolve_neutral. */
874 static bidi_type_t
875 bidi_cache_find (ptrdiff_t charpos, bool resolved_only, struct bidi_it *bidi_it)
876 {
877 ptrdiff_t i = bidi_cache_search (charpos, -1, bidi_it->scan_dir);
878
879 if (i >= bidi_cache_start
880 && (!resolved_only
881 /* Callers that want only fully resolved states (and set
882 resolved_only = true) need to be sure that there's enough
883 info in the cached state to return the state as final,
884 and if not, they don't want the cached state. */
885 || bidi_cache[i].resolved_level >= 0))
886 {
887 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
888
889 bidi_copy_it (bidi_it, &bidi_cache[i]);
890 bidi_cache_last_idx = i;
891 /* Don't let scan direction from the cached state override
892 the current scan direction. */
893 bidi_it->scan_dir = current_scan_dir;
894 return bidi_it->type;
895 }
896
897 return UNKNOWN_BT;
898 }
899
900 static int
901 bidi_peek_at_next_level (struct bidi_it *bidi_it)
902 {
903 if (bidi_cache_idx == bidi_cache_start)
904 emacs_abort ();
905 /* If the cache overflowed, return the level of the last cached
906 character. */
907 if (bidi_cache_last_idx == -1
908 || (bidi_cache_last_idx >= bidi_cache_idx - 1 && bidi_it->scan_dir > 0))
909 return bidi_cache[bidi_cache_idx - 1].resolved_level;
910 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
911 }
912
913
914 /***********************************************************************
915 Pushing and popping the bidi iterator state
916 ***********************************************************************/
917
918 /* Push the bidi iterator state in preparation for reordering a
919 different object, e.g. display string found at certain buffer
920 position. Pushing the bidi iterator boils down to saving its
921 entire state on the cache and starting a new cache "stacked" on top
922 of the current cache. */
923 void
924 bidi_push_it (struct bidi_it *bidi_it)
925 {
926 /* Give this stack slot its cache room. */
927 bidi_cache_max_elts += BIDI_CACHE_MAX_ELTS_PER_SLOT;
928 /* Save the current iterator state in its entirety after the last
929 used cache slot. */
930 bidi_cache_ensure_space (bidi_cache_idx);
931 bidi_cache[bidi_cache_idx++] = *bidi_it;
932
933 /* Push the current cache start onto the stack. */
934 eassert (bidi_cache_sp < IT_STACK_SIZE);
935 bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
936
937 /* Start a new level of cache, and make it empty. */
938 bidi_cache_start = bidi_cache_idx;
939 bidi_cache_last_idx = -1;
940 }
941
942 /* Restore the iterator state saved by bidi_push_it and return the
943 cache to the corresponding state. */
944 void
945 bidi_pop_it (struct bidi_it *bidi_it)
946 {
947 if (bidi_cache_start <= 0)
948 emacs_abort ();
949
950 /* Reset the next free cache slot index to what it was before the
951 call to bidi_push_it. */
952 bidi_cache_idx = bidi_cache_start - 1;
953
954 /* Restore the bidi iterator state saved in the cache. */
955 *bidi_it = bidi_cache[bidi_cache_idx];
956
957 /* Pop the previous cache start from the stack. */
958 if (bidi_cache_sp <= 0)
959 emacs_abort ();
960 bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
961
962 /* Invalidate the last-used cache slot data. */
963 bidi_cache_last_idx = -1;
964
965 bidi_cache_max_elts -= BIDI_CACHE_MAX_ELTS_PER_SLOT;
966 eassert (bidi_cache_max_elts > 0);
967 }
968
969 static ptrdiff_t bidi_cache_total_alloc;
970
971 /* Stash away a copy of the cache and its control variables. */
972 void *
973 bidi_shelve_cache (void)
974 {
975 unsigned char *databuf;
976 ptrdiff_t alloc;
977
978 /* Empty cache. */
979 if (bidi_cache_idx == 0)
980 return NULL;
981
982 alloc = (bidi_shelve_header_size
983 + bidi_cache_idx * sizeof (struct bidi_it));
984 databuf = xmalloc (alloc);
985 bidi_cache_total_alloc += alloc;
986
987 memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
988 memcpy (databuf + sizeof (bidi_cache_idx),
989 bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
990 memcpy (databuf + sizeof (bidi_cache_idx)
991 + bidi_cache_idx * sizeof (struct bidi_it),
992 bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
993 memcpy (databuf + sizeof (bidi_cache_idx)
994 + bidi_cache_idx * sizeof (struct bidi_it)
995 + sizeof (bidi_cache_start_stack),
996 &bidi_cache_sp, sizeof (bidi_cache_sp));
997 memcpy (databuf + sizeof (bidi_cache_idx)
998 + bidi_cache_idx * sizeof (struct bidi_it)
999 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
1000 &bidi_cache_start, sizeof (bidi_cache_start));
1001 memcpy (databuf + sizeof (bidi_cache_idx)
1002 + bidi_cache_idx * sizeof (struct bidi_it)
1003 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
1004 + sizeof (bidi_cache_start),
1005 &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
1006 memcpy (databuf + sizeof (bidi_cache_idx)
1007 + bidi_cache_idx * sizeof (struct bidi_it)
1008 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
1009 + sizeof (bidi_cache_start) + sizeof (bidi_cache_last_idx),
1010 &bidi_cache_max_elts, sizeof (bidi_cache_max_elts));
1011
1012 return databuf;
1013 }
1014
1015 /* Restore the cache state from a copy stashed away by
1016 bidi_shelve_cache, and free the buffer used to stash that copy.
1017 JUST_FREE means free the buffer, but don't restore the
1018 cache; used when the corresponding iterator is discarded instead of
1019 being restored. */
1020 void
1021 bidi_unshelve_cache (void *databuf, bool just_free)
1022 {
1023 unsigned char *p = databuf;
1024
1025 if (!p)
1026 {
1027 if (!just_free)
1028 {
1029 /* A NULL pointer means an empty cache. */
1030 bidi_cache_start = 0;
1031 bidi_cache_sp = 0;
1032 bidi_cache_max_elts = BIDI_CACHE_MAX_ELTS_PER_SLOT;
1033 bidi_cache_reset ();
1034 }
1035 }
1036 else
1037 {
1038 if (just_free)
1039 {
1040 ptrdiff_t idx;
1041
1042 memcpy (&idx, p, sizeof (bidi_cache_idx));
1043 bidi_cache_total_alloc
1044 -= bidi_shelve_header_size + idx * sizeof (struct bidi_it);
1045 }
1046 else
1047 {
1048 memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
1049 bidi_cache_ensure_space (bidi_cache_idx);
1050 memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
1051 bidi_cache_idx * sizeof (struct bidi_it));
1052 memcpy (bidi_cache_start_stack,
1053 p + sizeof (bidi_cache_idx)
1054 + bidi_cache_idx * sizeof (struct bidi_it),
1055 sizeof (bidi_cache_start_stack));
1056 memcpy (&bidi_cache_sp,
1057 p + sizeof (bidi_cache_idx)
1058 + bidi_cache_idx * sizeof (struct bidi_it)
1059 + sizeof (bidi_cache_start_stack),
1060 sizeof (bidi_cache_sp));
1061 memcpy (&bidi_cache_start,
1062 p + sizeof (bidi_cache_idx)
1063 + bidi_cache_idx * sizeof (struct bidi_it)
1064 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
1065 sizeof (bidi_cache_start));
1066 memcpy (&bidi_cache_last_idx,
1067 p + sizeof (bidi_cache_idx)
1068 + bidi_cache_idx * sizeof (struct bidi_it)
1069 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
1070 + sizeof (bidi_cache_start),
1071 sizeof (bidi_cache_last_idx));
1072 memcpy (&bidi_cache_max_elts,
1073 p + sizeof (bidi_cache_idx)
1074 + bidi_cache_idx * sizeof (struct bidi_it)
1075 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
1076 + sizeof (bidi_cache_start) + sizeof (bidi_cache_last_idx),
1077 sizeof (bidi_cache_max_elts));
1078 bidi_cache_total_alloc
1079 -= (bidi_shelve_header_size
1080 + bidi_cache_idx * sizeof (struct bidi_it));
1081 }
1082
1083 xfree (p);
1084 }
1085 }
1086
1087
1088 /***********************************************************************
1089 Initialization
1090 ***********************************************************************/
1091 static void
1092 bidi_initialize (void)
1093 {
1094 bidi_type_table = uniprop_table (intern ("bidi-class"));
1095 if (NILP (bidi_type_table))
1096 emacs_abort ();
1097 staticpro (&bidi_type_table);
1098
1099 bidi_mirror_table = uniprop_table (intern ("mirroring"));
1100 if (NILP (bidi_mirror_table))
1101 emacs_abort ();
1102 staticpro (&bidi_mirror_table);
1103
1104 bidi_brackets_table = uniprop_table (intern ("bracket-type"));
1105 if (NILP (bidi_brackets_table))
1106 emacs_abort ();
1107 staticpro (&bidi_brackets_table);
1108
1109 paragraph_start_re = build_string ("^\\(\f\\|[ \t]*\\)$");
1110 staticpro (¶graph_start_re);
1111 paragraph_separate_re = build_string ("^[ \t\f]*$");
1112 staticpro (¶graph_separate_re);
1113
1114 bidi_cache_sp = 0;
1115 bidi_cache_total_alloc = 0;
1116 bidi_cache_max_elts = BIDI_CACHE_MAX_ELTS_PER_SLOT;
1117
1118 bidi_initialized = 1;
1119 }
1120
1121 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
1122 end. */
1123 static void
1124 bidi_set_paragraph_end (struct bidi_it *bidi_it)
1125 {
1126 bidi_it->invalid_levels = 0;
1127 bidi_it->invalid_isolates = 0;
1128 bidi_it->stack_idx = 0;
1129 bidi_it->isolate_level = 0;
1130 bidi_it->resolved_level = bidi_it->level_stack[0].level;
1131 }
1132
1133 /* Initialize the bidi iterator from buffer/string position CHARPOS. */
1134 void
1135 bidi_init_it (ptrdiff_t charpos, ptrdiff_t bytepos, bool frame_window_p,
1136 struct bidi_it *bidi_it)
1137 {
1138 if (! bidi_initialized)
1139 bidi_initialize ();
1140 if (charpos >= 0)
1141 bidi_it->charpos = charpos;
1142 if (bytepos >= 0)
1143 bidi_it->bytepos = bytepos;
1144 bidi_it->frame_window_p = frame_window_p;
1145 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit */
1146 bidi_it->first_elt = 1;
1147 bidi_set_paragraph_end (bidi_it);
1148 bidi_it->new_paragraph = 1;
1149 bidi_it->separator_limit = -1;
1150 bidi_it->type = NEUTRAL_B;
1151 bidi_it->type_after_wn = NEUTRAL_B;
1152 bidi_it->orig_type = NEUTRAL_B;
1153 /* FIXME: Review this!!! */
1154 bidi_it->prev.type = bidi_it->prev.orig_type = UNKNOWN_BT;
1155 bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
1156 bidi_it->next_for_neutral.charpos = -1;
1157 bidi_it->next_for_neutral.type
1158 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
1159 bidi_it->prev_for_neutral.charpos = -1;
1160 bidi_it->prev_for_neutral.type
1161 = bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
1162 bidi_it->bracket_pairing_pos = -1;
1163 bidi_it->sos = L2R; /* FIXME: should it be user-selectable? */
1164 bidi_it->disp_pos = -1; /* invalid/unknown */
1165 bidi_it->disp_prop = 0;
1166 /* We can only shrink the cache if we are at the bottom level of its
1167 "stack". */
1168 if (bidi_cache_start == 0)
1169 bidi_cache_shrink ();
1170 else
1171 bidi_cache_reset ();
1172 }
1173
1174 /* Perform initializations for reordering a new line of bidi text. */
1175 static void
1176 bidi_line_init (struct bidi_it *bidi_it)
1177 {
1178 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
1179 bidi_it->stack_idx = 0;
1180 bidi_it->resolved_level = bidi_it->level_stack[0].level;
1181 bidi_it->level_stack[0].flags = 0; /* NEUTRAL_DIR, false per X1 */
1182 bidi_it->invalid_levels = 0;
1183 bidi_it->isolate_level = 0; /* X1 */
1184 bidi_it->invalid_isolates = 0; /* X1 */
1185 /* Setting this to zero will force its recomputation the first time
1186 we need it for W5. */
1187 bidi_it->next_en_pos = 0;
1188 bidi_it->next_en_type = UNKNOWN_BT;
1189 bidi_it->next_for_ws.charpos = -1;
1190 bidi_it->next_for_ws.type = UNKNOWN_BT;
1191 bidi_it->bracket_pairing_pos = -1;
1192 bidi_set_sos_type (bidi_it,
1193 (bidi_it->paragraph_dir == R2L ? 1 : 0),
1194 bidi_it->level_stack[0].level); /* X10 */
1195
1196 bidi_cache_reset ();
1197 }
1198
1199
1200 /***********************************************************************
1201 Fetching characters
1202 ***********************************************************************/
1203
1204 /* Count bytes in string S between BEG/BEGBYTE and END. BEG and END
1205 are zero-based character positions in S, BEGBYTE is byte position
1206 corresponding to BEG. UNIBYTE means S is a unibyte string. */
1207 static ptrdiff_t
1208 bidi_count_bytes (const unsigned char *s, ptrdiff_t beg,
1209 ptrdiff_t begbyte, ptrdiff_t end, bool unibyte)
1210 {
1211 ptrdiff_t pos = beg;
1212 const unsigned char *p = s + begbyte, *start = p;
1213
1214 if (unibyte)
1215 p = s + end;
1216 else
1217 {
1218 if (!CHAR_HEAD_P (*p))
1219 emacs_abort ();
1220
1221 while (pos < end)
1222 {
1223 p += BYTES_BY_CHAR_HEAD (*p);
1224 pos++;
1225 }
1226 }
1227
1228 return p - start;
1229 }
1230
1231 /* Fetch and return the character at byte position BYTEPOS. If S is
1232 non-NULL, fetch the character from string S; otherwise fetch the
1233 character from the current buffer. UNIBYTE means S is a
1234 unibyte string. */
1235 static int
1236 bidi_char_at_pos (ptrdiff_t bytepos, const unsigned char *s, bool unibyte)
1237 {
1238 if (s)
1239 {
1240 s += bytepos;
1241 if (unibyte)
1242 return *s;
1243 }
1244 else
1245 s = BYTE_POS_ADDR (bytepos);
1246 return STRING_CHAR (s);
1247 }
1248
1249 /* Fetch and return the character at CHARPOS/BYTEPOS. If that
1250 character is covered by a display string, treat the entire run of
1251 covered characters as a single character, either u+2029 or u+FFFC,
1252 and return their combined length in CH_LEN and NCHARS. DISP_POS
1253 specifies the character position of the next display string, or -1
1254 if not yet computed. When the next character is at or beyond that
1255 position, the function updates DISP_POS with the position of the
1256 next display string. *DISP_PROP non-zero means that there's really
1257 a display string at DISP_POS, as opposed to when we searched till
1258 DISP_POS without finding one. If *DISP_PROP is 2, it means the
1259 display spec is of the form `(space ...)', which is replaced with
1260 u+2029 to handle it as a paragraph separator. STRING->s is the C
1261 string to iterate, or NULL if iterating over a buffer or a Lisp
1262 string; in the latter case, STRING->lstring is the Lisp string. */
1263 static int
1264 bidi_fetch_char (ptrdiff_t charpos, ptrdiff_t bytepos, ptrdiff_t *disp_pos,
1265 int *disp_prop, struct bidi_string_data *string,
1266 struct window *w,
1267 bool frame_window_p, ptrdiff_t *ch_len, ptrdiff_t *nchars)
1268 {
1269 int ch;
1270 ptrdiff_t endpos
1271 = (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
1272 struct text_pos pos;
1273
1274 /* If we got past the last known position of display string, compute
1275 the position of the next one. That position could be at CHARPOS. */
1276 if (charpos < endpos && charpos > *disp_pos)
1277 {
1278 SET_TEXT_POS (pos, charpos, bytepos);
1279 *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1280 disp_prop);
1281 /* The factor of 100 below is a heuristic that needs to be
1282 tuned. It means we consider 100 buffer positions examined by
1283 the above call roughly equivalent to the display engine
1284 iterating over a single buffer position. */
1285 if (max_redisplay_ticks > 0 && *disp_pos > charpos)
1286 update_redisplay_ticks ((*disp_pos - charpos) / 100 + 1, w);
1287 }
1288
1289 /* Fetch the character at BYTEPOS. */
1290 if (charpos >= endpos)
1291 {
1292 ch = BIDI_EOB;
1293 *ch_len = 1;
1294 *nchars = 1;
1295 *disp_pos = endpos;
1296 *disp_prop = 0;
1297 }
1298 else if (charpos >= *disp_pos && *disp_prop)
1299 {
1300 ptrdiff_t disp_end_pos;
1301
1302 /* We don't expect to find ourselves in the middle of a display
1303 property. Hopefully, it will never be needed. */
1304 if (charpos > *disp_pos)
1305 emacs_abort ();
1306 /* Text covered by `display' properties and overlays with
1307 display properties or display strings is handled as a single
1308 character that represents the entire run of characters
1309 covered by the display property. */
1310 if (*disp_prop == 2)
1311 {
1312 /* `(space ...)' display specs are handled as paragraph
1313 separators for the purposes of the reordering; see UAX#9
1314 section 3 and clause HL1 in section 4.3 there. */
1315 ch = PARAGRAPH_SEPARATOR;
1316 }
1317 else
1318 {
1319 /* All other display specs are handled as the Unicode Object
1320 Replacement Character. */
1321 ch = OBJECT_REPLACEMENT_CHARACTER;
1322 }
1323 disp_end_pos = compute_display_string_end (*disp_pos, string);
1324 if (disp_end_pos < 0)
1325 {
1326 /* Somebody removed the display string from the buffer
1327 behind our back. Recover by processing this buffer
1328 position as if no display property were present there to
1329 begin with. */
1330 *disp_prop = 0;
1331 goto normal_char;
1332 }
1333 *nchars = disp_end_pos - *disp_pos;
1334 if (*nchars <= 0)
1335 emacs_abort ();
1336 if (string->s)
1337 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
1338 disp_end_pos, string->unibyte);
1339 else if (STRINGP (string->lstring))
1340 *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
1341 bytepos, disp_end_pos, string->unibyte);
1342 else
1343 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
1344 }
1345 else
1346 {
1347 normal_char:
1348 if (string->s)
1349 {
1350 if (!string->unibyte)
1351 {
1352 int len;
1353 ch = string_char_and_length (string->s + bytepos, &len);
1354 *ch_len = len;
1355 }
1356 else
1357 {
1358 ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
1359 *ch_len = 1;
1360 }
1361 }
1362 else if (STRINGP (string->lstring))
1363 {
1364 if (!string->unibyte)
1365 {
1366 int len;
1367 ch = string_char_and_length (SDATA (string->lstring) + bytepos,
1368 &len);
1369 *ch_len = len;
1370 }
1371 else
1372 {
1373 ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
1374 *ch_len = 1;
1375 }
1376 }
1377 else
1378 {
1379 int len;
1380 ch = string_char_and_length (BYTE_POS_ADDR (bytepos), &len);
1381 *ch_len = len;
1382 }
1383
1384 *nchars = 1;
1385 }
1386
1387 /* If we just entered a run of characters covered by a display
1388 string, compute the position of the next display string. */
1389 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
1390 && *disp_prop)
1391 {
1392 SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
1393 *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1394 disp_prop);
1395 if (max_redisplay_ticks > 0 && *disp_pos > charpos + *nchars)
1396 update_redisplay_ticks ((*disp_pos - charpos - *nchars) / 100 + 1, w);
1397 }
1398
1399 return ch;
1400 }
1401
1402 /* Like bidi_fetch_char, but ignore any text between an isolate
1403 initiator and its matching PDI or, if it has no matching PDI, the
1404 end of the paragraph. If isolates were skipped, CH_LEN and NCHARS
1405 are set to the number of bytes and characters between BYTEPOS/CHARPOS
1406 and the character that was fetched after skipping the isolates. */
1407 static int
1408 bidi_fetch_char_skip_isolates (ptrdiff_t charpos, ptrdiff_t bytepos,
1409 ptrdiff_t *disp_pos, int *disp_prop,
1410 struct bidi_string_data *string,
1411 struct window *w, bool frame_window_p,
1412 ptrdiff_t *ch_len, ptrdiff_t *nchars)
1413 {
1414 ptrdiff_t orig_charpos = charpos, orig_bytepos = bytepos;
1415 int ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string, w,
1416 frame_window_p, ch_len, nchars);
1417 bidi_type_t ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1418 ptrdiff_t level = 0;
1419
1420 if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1421 {
1422 level++;
1423 while (level > 0 && ch_type != NEUTRAL_B)
1424 {
1425 charpos += *nchars;
1426 bytepos += *ch_len;
1427 ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string,
1428 w, frame_window_p, ch_len, nchars);
1429 ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1430 /* A Note to P2 says to ignore max_depth limit. */
1431 if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1432 level++;
1433 else if (ch_type == PDI)
1434 level--;
1435 }
1436 }
1437
1438 /* Communicate to the caller how much did we skip, so it could get
1439 past the last character position we examined. */
1440 *nchars += charpos - orig_charpos;
1441 *ch_len += bytepos - orig_bytepos;
1442 return ch;
1443 }
1444
1445
1446
1447 /***********************************************************************
1448 Determining paragraph direction
1449 ***********************************************************************/
1450
1451 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1452 Value is the non-negative length of the paragraph separator
1453 following the buffer position, -1 if position is at the beginning
1454 of a new paragraph, or -2 if position is neither at beginning nor
1455 at end of a paragraph. */
1456 static ptrdiff_t
1457 bidi_at_paragraph_end (ptrdiff_t charpos, ptrdiff_t bytepos)
1458 {
1459 Lisp_Object sep_re;
1460 Lisp_Object start_re;
1461 ptrdiff_t val;
1462
1463 if (STRINGP (BVAR (current_buffer, bidi_paragraph_separate_re)))
1464 sep_re = BVAR (current_buffer, bidi_paragraph_separate_re);
1465 else
1466 sep_re = paragraph_separate_re;
1467 if (STRINGP (BVAR (current_buffer, bidi_paragraph_start_re)))
1468 start_re = BVAR (current_buffer, bidi_paragraph_start_re);
1469 else
1470 start_re = paragraph_start_re;
1471
1472 /* Prevent quitting inside re_match_2, as redisplay_window could
1473 have temporarily moved point. */
1474 specpdl_ref count = SPECPDL_INDEX ();
1475 specbind (Qinhibit_quit, Qt);
1476
1477 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1478 if (val < 0)
1479 {
1480 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1481 val = -1;
1482 else
1483 val = -2;
1484 }
1485
1486 unbind_to (count, Qnil);
1487 return val;
1488 }
1489
1490 /* If the user has requested the long scans caching, make sure that
1491 BIDI cache is enabled. Otherwise, make sure it's disabled. */
1492
1493 static struct region_cache *
1494 bidi_paragraph_cache_on_off (void)
1495 {
1496 struct buffer *cache_buffer = current_buffer;
1497 bool indirect_p = false;
1498
1499 /* For indirect buffers, make sure to use the cache of their base
1500 buffer. */
1501 if (cache_buffer->base_buffer)
1502 {
1503 cache_buffer = cache_buffer->base_buffer;
1504 indirect_p = true;
1505 }
1506
1507 /* Don't turn on or off the cache in the base buffer, if the value
1508 of cache-long-scans of the base buffer is inconsistent with that.
1509 This is because doing so will just make the cache pure overhead,
1510 since if we turn it on via indirect buffer, it will be
1511 immediately turned off by its base buffer. */
1512 if (NILP (BVAR (current_buffer, cache_long_scans)))
1513 {
1514 if (!indirect_p
1515 || NILP (BVAR (cache_buffer, cache_long_scans)))
1516 {
1517 if (cache_buffer->bidi_paragraph_cache)
1518 {
1519 free_region_cache (cache_buffer->bidi_paragraph_cache);
1520 cache_buffer->bidi_paragraph_cache = 0;
1521 }
1522 }
1523 return NULL;
1524 }
1525 else
1526 {
1527 if (!indirect_p
1528 || !NILP (BVAR (cache_buffer, cache_long_scans)))
1529 {
1530 if (!cache_buffer->bidi_paragraph_cache)
1531 cache_buffer->bidi_paragraph_cache = new_region_cache ();
1532 }
1533 return cache_buffer->bidi_paragraph_cache;
1534 }
1535 }
1536
1537 /* On my 2005-vintage machine, searching back for paragraph start
1538 takes ~1 ms per line. And bidi_paragraph_init is called 4 times
1539 when user types C-p. The number below limits each call to
1540 bidi_paragraph_init to about 10 ms. */
1541 #define MAX_PARAGRAPH_SEARCH 7500
1542
1543 /* Find the beginning of this paragraph by looking back in the buffer.
1544 Value is the byte position of the paragraph's beginning, or
1545 BEGV_BYTE if paragraph_start_re is still not found after looking
1546 back MAX_PARAGRAPH_SEARCH lines in the buffer. */
1547 static ptrdiff_t
1548 bidi_find_paragraph_start (ptrdiff_t pos, ptrdiff_t pos_byte)
1549 {
1550 Lisp_Object re =
1551 STRINGP (BVAR (current_buffer, bidi_paragraph_start_re))
1552 ? BVAR (current_buffer, bidi_paragraph_start_re)
1553 : paragraph_start_re;
1554 ptrdiff_t limit = ZV, limit_byte = ZV_BYTE;
1555 struct region_cache *bpc = bidi_paragraph_cache_on_off ();
1556 ptrdiff_t n = 0, oldpos = pos, next;
1557 struct buffer *cache_buffer = current_buffer;
1558
1559 if (cache_buffer->base_buffer)
1560 cache_buffer = cache_buffer->base_buffer;
1561
1562 /* Prevent quitting inside re_match_2, as redisplay_window could
1563 have temporarily moved point. */
1564 specpdl_ref count = SPECPDL_INDEX ();
1565 specbind (Qinhibit_quit, Qt);
1566
1567 while (pos_byte > BEGV_BYTE
1568 && n++ < MAX_PARAGRAPH_SEARCH
1569 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
1570 {
1571 /* FIXME: What if the paragraph beginning is covered by a
1572 display string? And what if a display string covering some
1573 of the text over which we scan back includes
1574 paragraph_start_re? */
1575 dec_both (&pos, &pos_byte);
1576 if (bpc && region_cache_backward (cache_buffer, bpc, pos, &next))
1577 {
1578 pos = next, pos_byte = CHAR_TO_BYTE (pos);
1579 break;
1580 }
1581 else
1582 pos = find_newline_no_quit (pos, pos_byte, -1, &pos_byte);
1583 }
1584 unbind_to (count, Qnil);
1585 if (n >= MAX_PARAGRAPH_SEARCH)
1586 pos = BEGV, pos_byte = BEGV_BYTE;
1587 if (bpc)
1588 know_region_cache (cache_buffer, bpc, pos, oldpos);
1589 /* Positions returned by the region cache are not limited to
1590 BEGV..ZV range, so we limit them here. */
1591 pos_byte = clip_to_bounds (BEGV_BYTE, pos_byte, ZV_BYTE);
1592 return pos_byte;
1593 }
1594
1595 /* This tracks how far we needed to search for first strong character. */
1596 static ptrdiff_t nsearch_for_strong;
1597
1598 /* On a 3.4 GHz machine, searching forward for a strong directional
1599 character in a long paragraph full of weaks or neutrals takes about
1600 1 ms for each 20K characters. The number below limits each call to
1601 bidi_paragraph_init to less than 10 ms even on slow machines. */
1602 #define MAX_STRONG_CHAR_SEARCH 100000
1603
1604 /* Starting from POS, find the first strong (L, R, or AL) character,
1605 while skipping over any characters between an isolate initiator and
1606 its matching PDI. STOP_AT_PDI non-zero means stop at the PDI that
1607 matches the isolate initiator at POS. Return the bidi type of the
1608 character where the search stopped. Give up if after examining
1609 MAX_STRONG_CHAR_SEARCH buffer or string positions no strong
1610 character was found. */
1611 static bidi_type_t
1612 find_first_strong_char (ptrdiff_t pos, ptrdiff_t bytepos, ptrdiff_t end,
1613 ptrdiff_t *disp_pos, int *disp_prop,
1614 struct bidi_string_data *string, struct window *w,
1615 bool string_p, bool frame_window_p,
1616 ptrdiff_t *ch_len, ptrdiff_t *nchars, bool stop_at_pdi)
1617 {
1618 ptrdiff_t pos1;
1619 bidi_type_t type;
1620 int ch;
1621
1622 if (stop_at_pdi)
1623 {
1624 /* If STOP_AT_PDI is non-zero, we must have been called with FSI
1625 at POS. Get past it. */
1626 #ifdef ENABLE_CHECKING
1627 ch = bidi_fetch_char (pos, bytepos, disp_pos, disp_prop, string, w,
1628 frame_window_p, ch_len, nchars);
1629 type = bidi_get_type (ch, NEUTRAL_DIR);
1630 eassert (type == FSI /* || type == LRI || type == RLI */);
1631 #endif
1632 pos += *nchars;
1633 bytepos += *ch_len;
1634 }
1635 ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop, string,
1636 w, frame_window_p, ch_len, nchars);
1637 type = bidi_get_type (ch, NEUTRAL_DIR);
1638
1639 pos1 = pos;
1640 for (pos += *nchars, bytepos += *ch_len;
1641 bidi_get_category (type) != STRONG
1642 /* If requested to stop at first PDI, stop there. */
1643 && !(stop_at_pdi && type == PDI)
1644 /* Stop when searched too far into an abnormally large
1645 paragraph full of weak or neutral characters. */
1646 && pos - pos1 < MAX_STRONG_CHAR_SEARCH;
1647 type = bidi_get_type (ch, NEUTRAL_DIR))
1648 {
1649 if (pos >= end)
1650 {
1651 /* Pretend there's a paragraph separator at end of
1652 buffer/string. */
1653 type = NEUTRAL_B;
1654 break;
1655 }
1656 if (!string_p
1657 && type == NEUTRAL_B
1658 && bidi_at_paragraph_end (pos, bytepos) >= -1)
1659 break;
1660 /* Fetch next character and advance to get past it. */
1661 ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop,
1662 string, w, frame_window_p,
1663 ch_len, nchars);
1664 pos += *nchars;
1665 bytepos += *ch_len;
1666 }
1667
1668 nsearch_for_strong += pos - pos1;
1669 return type;
1670 }
1671
1672 /* Determine the base direction, a.k.a. base embedding level, of the
1673 paragraph we are about to iterate through. If DIR is either L2R or
1674 R2L, just use that. Otherwise, determine the paragraph direction
1675 from the first strong directional character of the paragraph.
1676
1677 NO_DEFAULT_P means don't default to L2R if the paragraph
1678 has no strong directional characters and both DIR and
1679 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
1680 in the buffer until a paragraph is found with a strong character,
1681 or until hitting BEGV. In the latter case, fall back to L2R. This
1682 flag is used in current-bidi-paragraph-direction.
1683
1684 Note that this function gives the paragraph separator the same
1685 direction as the preceding paragraph, even though Emacs generally
1686 views the separator as not belonging to any paragraph. */
1687 void
1688 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, bool no_default_p)
1689 {
1690 ptrdiff_t bytepos = bidi_it->bytepos;
1691 bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1692 ptrdiff_t pstartbyte;
1693 /* Note that begbyte is a byte position, while end is a character
1694 position. Yes, this is ugly, but we are trying to avoid costly
1695 calls to BYTE_TO_CHAR and its ilk. */
1696 ptrdiff_t begbyte = string_p ? 0 : BEGV_BYTE;
1697 ptrdiff_t end = string_p ? bidi_it->string.schars : ZV;
1698 ptrdiff_t pos = bidi_it->charpos;
1699
1700 nsearch_for_strong = 0;
1701
1702 /* Special case for an empty buffer. */
1703 if (bytepos == begbyte && bidi_it->charpos == end)
1704 dir = L2R;
1705 /* We should never be called at EOB or before BEGV. */
1706 else if (bidi_it->charpos >= end || bytepos < begbyte)
1707 emacs_abort ();
1708
1709 if (dir == L2R)
1710 {
1711 bidi_it->paragraph_dir = L2R;
1712 bidi_it->new_paragraph = 0;
1713 }
1714 else if (dir == R2L)
1715 {
1716 bidi_it->paragraph_dir = R2L;
1717 bidi_it->new_paragraph = 0;
1718 }
1719 else if (dir == NEUTRAL_DIR) /* P2 */
1720 {
1721 ptrdiff_t ch_len, nchars;
1722 ptrdiff_t disp_pos = -1;
1723 int disp_prop = 0;
1724 bidi_type_t type;
1725 const unsigned char *s;
1726
1727 if (!bidi_initialized)
1728 bidi_initialize ();
1729
1730 /* If we are inside a paragraph separator, we are just waiting
1731 for the separator to be exhausted; use the previous paragraph
1732 direction. But don't do that if we have been just reseated,
1733 because we need to reinitialize below in that case. */
1734 if (!bidi_it->first_elt
1735 && bidi_it->charpos < bidi_it->separator_limit)
1736 return;
1737
1738 /* If we are on a newline, get past it to where the next
1739 paragraph might start. But don't do that at BEGV since then
1740 we are potentially in a new paragraph that doesn't yet
1741 exist. */
1742 pos = bidi_it->charpos;
1743 s = (STRINGP (bidi_it->string.lstring)
1744 ? SDATA (bidi_it->string.lstring)
1745 : bidi_it->string.s);
1746 if (bytepos > begbyte
1747 && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
1748 {
1749 bytepos++;
1750 pos++;
1751 }
1752
1753 /* We are either at the beginning of a paragraph or in the
1754 middle of it. Find where this paragraph starts. */
1755 if (string_p)
1756 {
1757 /* We don't support changes of paragraph direction inside a
1758 string. It is treated as a single paragraph. */
1759 pstartbyte = 0;
1760 }
1761 else
1762 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
1763 bidi_it->separator_limit = -1;
1764 bidi_it->new_paragraph = 0;
1765
1766 /* The following loop is run more than once only if NO_DEFAULT_P,
1767 and only if we are iterating on a buffer. */
1768 do {
1769 bytepos = pstartbyte;
1770 if (!string_p)
1771 pos = BYTE_TO_CHAR (bytepos);
1772 type = find_first_strong_char (pos, bytepos, end, &disp_pos, &disp_prop,
1773 &bidi_it->string, bidi_it->w,
1774 string_p, bidi_it->frame_window_p,
1775 &ch_len, &nchars, false);
1776 if (type == STRONG_R || type == STRONG_AL) /* P3 */
1777 bidi_it->paragraph_dir = R2L;
1778 else if (type == STRONG_L)
1779 bidi_it->paragraph_dir = L2R;
1780 if (!string_p
1781 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
1782 {
1783 /* If this paragraph is at BEGV, default to L2R. */
1784 if (pstartbyte == BEGV_BYTE)
1785 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1786 else
1787 {
1788 ptrdiff_t prevpbyte = pstartbyte;
1789 ptrdiff_t p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1790
1791 /* Find the beginning of the previous paragraph, if any. */
1792 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1793 {
1794 /* FXIME: What if p is covered by a display
1795 string? See also a FIXME inside
1796 bidi_find_paragraph_start. */
1797 dec_both (&p, &pbyte);
1798 prevpbyte = bidi_find_paragraph_start (p, pbyte);
1799 }
1800 pstartbyte = prevpbyte;
1801 }
1802 }
1803 } while (!string_p
1804 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
1805 }
1806 else
1807 emacs_abort ();
1808
1809 /* Contrary to UAX#9 clause P3, we only default the paragraph
1810 direction to L2R if we have no previous usable paragraph
1811 direction. This is allowed by the HL1 clause. */
1812 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
1813 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
1814 if (bidi_it->paragraph_dir == R2L)
1815 bidi_it->level_stack[0].level = 1;
1816 else
1817 bidi_it->level_stack[0].level = 0;
1818
1819 bidi_line_init (bidi_it);
1820
1821 /* The factor of 50 below is a heuristic that needs to be tuned. It
1822 means we consider 50 buffer positions examined by this function
1823 roughly equivalent to the display engine iterating over a single
1824 buffer position. */
1825 ptrdiff_t nexamined = bidi_it->charpos - pos + nsearch_for_strong;
1826 if (max_redisplay_ticks > 0 && nexamined > 0)
1827 update_redisplay_ticks (nexamined / 50, bidi_it->w);
1828 }
1829
1830
1831 /***********************************************************************
1832 Resolving explicit and implicit levels.
1833 The rest of this file constitutes the core of the UBA implementation.
1834 ***********************************************************************/
1835
1836 static bool
1837 bidi_explicit_dir_char (int ch)
1838 {
1839 bidi_type_t ch_type;
1840
1841 if (!bidi_initialized)
1842 emacs_abort ();
1843 if (ch < 0)
1844 {
1845 eassert (ch == BIDI_EOB);
1846 return false;
1847 }
1848 ch_type = (bidi_type_t) XFIXNUM (CHAR_TABLE_REF (bidi_type_table, ch));
1849 return (ch_type == LRE || ch_type == LRO
1850 || ch_type == RLE || ch_type == RLO
1851 || ch_type == PDF);
1852 }
1853
1854 /* Given an iterator state in BIDI_IT, advance one character position
1855 in the buffer/string to the next character (in the logical order),
1856 resolve any explicit embeddings, directional overrides, and isolate
1857 initiators and terminators, and return the embedding level of the
1858 character after resolving these explicit directives. */
1859 static int
1860 bidi_resolve_explicit (struct bidi_it *bidi_it)
1861 {
1862 int curchar;
1863 bidi_type_t type, typ1, prev_type = UNKNOWN_BT;
1864 int current_level;
1865 int new_level;
1866 bidi_dir_t override;
1867 bool isolate_status;
1868 bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1869 ptrdiff_t ch_len, nchars, disp_pos, end;
1870 int disp_prop;
1871 ptrdiff_t eob
1872 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
1873 ? bidi_it->string.schars : ZV);
1874
1875 /* Record the info about the previous character. */
1876 if (bidi_it->type_after_wn != WEAK_BN /* W1/Retaining */
1877 && bidi_it->type != WEAK_BN)
1878 {
1879 /* This special case is needed in support of Unicode 8.0
1880 correction to N0, as implemented in bidi_resolve_weak/W1
1881 below. */
1882 if (bidi_it->type_after_wn == NEUTRAL_ON
1883 && bidi_get_category (bidi_it->type) == STRONG
1884 && bidi_paired_bracket_type (bidi_it->ch) == BIDI_BRACKET_CLOSE)
1885 bidi_remember_char (&bidi_it->prev, bidi_it, 1);
1886 else
1887 bidi_remember_char (&bidi_it->prev, bidi_it, 0);
1888 }
1889 if (bidi_it->type_after_wn == STRONG_R
1890 || bidi_it->type_after_wn == STRONG_L
1891 || bidi_it->type_after_wn == STRONG_AL)
1892 bidi_remember_char (&bidi_it->last_strong, bidi_it, 0);
1893 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1894 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1895 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it, 1);
1896
1897 /* If we overstepped the characters used for resolving neutrals
1898 and whitespace, invalidate their info in the iterator. */
1899 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1900 {
1901 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1902 /* If needed, reset the "magical" value of pairing bracket
1903 position, so that bidi_resolve_brackets will resume
1904 resolution of brackets according to BPA. */
1905 if (bidi_it->bracket_pairing_pos == eob)
1906 bidi_it->bracket_pairing_pos = -1;
1907 }
1908 if (bidi_it->next_en_pos >= 0
1909 && bidi_it->charpos >= bidi_it->next_en_pos)
1910 {
1911 bidi_it->next_en_pos = 0;
1912 bidi_it->next_en_type = UNKNOWN_BT;
1913 }
1914
1915 /* Reset the bracket resolution info, unless we previously decided
1916 (in bidi_find_bracket_pairs) that brackets in this level run
1917 should be resolved as neutrals. */
1918 if (bidi_it->bracket_pairing_pos != eob)
1919 {
1920 bidi_it->bracket_pairing_pos = -1;
1921 bidi_it->bracket_enclosed_type = UNKNOWN_BT;
1922 }
1923
1924 /* If reseat()'ed, don't advance, so as to start iteration from the
1925 position where we were reseated. bidi_it->bytepos can be less
1926 than BEGV_BYTE after reseat to BEGV. */
1927 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
1928 || bidi_it->first_elt)
1929 {
1930 bidi_it->first_elt = 0;
1931 if (string_p)
1932 {
1933 const unsigned char *p
1934 = (STRINGP (bidi_it->string.lstring)
1935 ? SDATA (bidi_it->string.lstring)
1936 : bidi_it->string.s);
1937
1938 if (bidi_it->charpos < 0)
1939 bidi_it->charpos = bidi_it->bytepos = 0;
1940 eassert (bidi_it->bytepos == bidi_count_bytes (p, 0, 0,
1941 bidi_it->charpos,
1942 bidi_it->string.unibyte));
1943 }
1944 else
1945 {
1946 if (bidi_it->charpos < BEGV)
1947 {
1948 bidi_it->charpos = BEGV;
1949 bidi_it->bytepos = BEGV_BYTE;
1950 }
1951 eassert (bidi_it->bytepos == CHAR_TO_BYTE (bidi_it->charpos));
1952 }
1953 /* Determine the original bidi type of the previous character,
1954 which is needed for handling isolate initiators and PDF. The
1955 type of the previous character will be non-trivial only if
1956 our caller moved through some previous text in
1957 get_visually_first_element, in which case bidi_it->prev holds
1958 the information we want. */
1959 if (bidi_it->first_elt && bidi_it->prev.type != UNKNOWN_BT)
1960 {
1961 eassert (bidi_it->prev.charpos == bidi_it->charpos - 1);
1962 prev_type = bidi_it->prev.orig_type;
1963 }
1964 }
1965 /* Don't move at end of buffer/string. */
1966 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
1967 {
1968 /* Advance to the next character, skipping characters covered by
1969 display strings (nchars > 1). */
1970 if (bidi_it->nchars <= 0)
1971 emacs_abort ();
1972 bidi_it->charpos += bidi_it->nchars;
1973 if (bidi_it->ch_len == 0)
1974 emacs_abort ();
1975 bidi_it->bytepos += bidi_it->ch_len;
1976 prev_type = bidi_it->orig_type;
1977 }
1978 else /* EOB or end of string */
1979 prev_type = NEUTRAL_B;
1980
1981 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1982 isolate_status = ISOLATE_STATUS (bidi_it, bidi_it->stack_idx);
1983 override = OVERRIDE (bidi_it, bidi_it->stack_idx);
1984 new_level = current_level;
1985
1986 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
1987 {
1988 curchar = BIDI_EOB;
1989 bidi_it->ch_len = 1;
1990 bidi_it->nchars = 1;
1991 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
1992 bidi_it->disp_prop = 0;
1993 }
1994 else
1995 {
1996 /* LRI, RLI, and FSI increment, and PDF decrements, the
1997 embedding level of the _following_ characters, so we must
1998 first look at the type of the previous character to support
1999 that. */
2000 switch (prev_type)
2001 {
2002 case RLI: /* X5a */
2003 if (current_level < BIDI_MAXDEPTH
2004 && bidi_it->invalid_levels == 0
2005 && bidi_it->invalid_isolates == 0)
2006 {
2007 new_level = ((current_level + 1) & ~1) + 1;
2008 bidi_it->isolate_level++;
2009 bidi_push_embedding_level (bidi_it, new_level,
2010 NEUTRAL_DIR, true);
2011 }
2012 else
2013 bidi_it->invalid_isolates++;
2014 break;
2015 case LRI: /* X5b */
2016 if (current_level < BIDI_MAXDEPTH - 1
2017 && bidi_it->invalid_levels == 0
2018 && bidi_it->invalid_isolates == 0)
2019 {
2020 new_level = ((current_level + 2) & ~1);
2021 bidi_it->isolate_level++;
2022 bidi_push_embedding_level (bidi_it, new_level,
2023 NEUTRAL_DIR, true);
2024 }
2025 else
2026 bidi_it->invalid_isolates++;
2027 break;
2028 case PDF: /* X7 */
2029 if (!bidi_it->invalid_isolates)
2030 {
2031 if (bidi_it->invalid_levels)
2032 bidi_it->invalid_levels--;
2033 else if (!isolate_status && bidi_it->stack_idx >= 1)
2034 new_level = bidi_pop_embedding_level (bidi_it);
2035 }
2036 break;
2037 default:
2038 eassert (prev_type != FSI);
2039 /* Nothing. */
2040 break;
2041 }
2042 /* Fetch the character at BYTEPOS. If it is covered by a
2043 display string, treat the entire run of covered characters as
2044 a single character u+FFFC. */
2045 curchar = bidi_fetch_char (bidi_it->charpos, bidi_it->bytepos,
2046 &bidi_it->disp_pos, &bidi_it->disp_prop,
2047 &bidi_it->string, bidi_it->w,
2048 bidi_it->frame_window_p,
2049 &bidi_it->ch_len, &bidi_it->nchars);
2050 }
2051 bidi_it->ch = curchar;
2052 bidi_it->resolved_level = new_level;
2053
2054 /* Don't apply directional override here, as all the types we handle
2055 below will not be affected by the override anyway, and we need
2056 the original type unaltered. The override will be applied in
2057 bidi_resolve_weak. */
2058 type = bidi_get_type (curchar, NEUTRAL_DIR);
2059 bidi_it->orig_type = type;
2060 bidi_check_type (bidi_it->orig_type);
2061
2062 bidi_it->type_after_wn = UNKNOWN_BT;
2063
2064 switch (type)
2065 {
2066 case RLE: /* X2 */
2067 case RLO: /* X4 */
2068 bidi_it->type_after_wn = type;
2069 bidi_check_type (bidi_it->type_after_wn);
2070 type = WEAK_BN; /* X9/Retaining */
2071 if (new_level < BIDI_MAXDEPTH
2072 && bidi_it->invalid_levels == 0
2073 && bidi_it->invalid_isolates == 0)
2074 {
2075 /* Compute the least odd embedding level greater than
2076 the current level. */
2077 new_level = ((new_level + 1) & ~1) + 1;
2078 if (bidi_it->type_after_wn == RLE)
2079 override = NEUTRAL_DIR;
2080 else
2081 override = R2L;
2082 bidi_push_embedding_level (bidi_it, new_level, override, false);
2083 bidi_it->resolved_level = new_level;
2084 }
2085 else
2086 {
2087 if (bidi_it->invalid_isolates == 0)
2088 bidi_it->invalid_levels++;
2089 }
2090 break;
2091 case LRE: /* X3 */
2092 case LRO: /* X5 */
2093 bidi_it->type_after_wn = type;
2094 bidi_check_type (bidi_it->type_after_wn);
2095 type = WEAK_BN; /* X9/Retaining */
2096 if (new_level < BIDI_MAXDEPTH - 1
2097 && bidi_it->invalid_levels == 0
2098 && bidi_it->invalid_isolates == 0)
2099 {
2100 /* Compute the least even embedding level greater than
2101 the current level. */
2102 new_level = ((new_level + 2) & ~1);
2103 if (bidi_it->type_after_wn == LRE)
2104 override = NEUTRAL_DIR;
2105 else
2106 override = L2R;
2107 bidi_push_embedding_level (bidi_it, new_level, override, false);
2108 bidi_it->resolved_level = new_level;
2109 }
2110 else
2111 {
2112 if (bidi_it->invalid_isolates == 0)
2113 bidi_it->invalid_levels++;
2114 }
2115 break;
2116 case FSI: /* X5c */
2117 end = string_p ? bidi_it->string.schars : ZV;
2118 disp_pos = bidi_it->disp_pos;
2119 disp_prop = bidi_it->disp_prop;
2120 nchars = bidi_it->nchars;
2121 ch_len = bidi_it->ch_len;
2122 typ1 = find_first_strong_char (bidi_it->charpos,
2123 bidi_it->bytepos, end,
2124 &disp_pos, &disp_prop,
2125 &bidi_it->string, bidi_it->w,
2126 string_p, bidi_it->frame_window_p,
2127 &ch_len, &nchars, true);
2128 if (typ1 != STRONG_R && typ1 != STRONG_AL)
2129 {
2130 type = LRI;
2131 /* Override orig_type, which will be needed when we come to
2132 examine the next character, which is the first character
2133 inside the isolate. */
2134 bidi_it->orig_type = type;
2135 goto fsi_as_lri;
2136 }
2137 else
2138 {
2139 type = RLI;
2140 bidi_it->orig_type = type;
2141 }
2142 FALLTHROUGH;
2143 case RLI: /* X5a */
2144 if (override == NEUTRAL_DIR)
2145 bidi_it->type_after_wn = type;
2146 else /* Unicode 8.0 correction. */
2147 bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
2148 bidi_check_type (bidi_it->type_after_wn);
2149 break;
2150 case LRI: /* X5b */
2151 fsi_as_lri:
2152 if (override == NEUTRAL_DIR)
2153 bidi_it->type_after_wn = type;
2154 else /* Unicode 8.0 correction. */
2155 bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
2156 bidi_check_type (bidi_it->type_after_wn);
2157 break;
2158 case PDI: /* X6a */
2159 if (bidi_it->invalid_isolates)
2160 bidi_it->invalid_isolates--;
2161 else if (bidi_it->isolate_level > 0)
2162 {
2163 bidi_it->invalid_levels = 0;
2164 while (!ISOLATE_STATUS (bidi_it, bidi_it->stack_idx))
2165 bidi_pop_embedding_level (bidi_it);
2166 eassert (bidi_it->stack_idx > 0);
2167 new_level = bidi_pop_embedding_level (bidi_it);
2168 bidi_it->isolate_level--;
2169 }
2170 bidi_it->resolved_level = new_level;
2171 /* Unicode 8.0 correction. */
2172 {
2173 bidi_dir_t stack_override = OVERRIDE (bidi_it, bidi_it->stack_idx);
2174 if (stack_override == L2R)
2175 bidi_it->type_after_wn = STRONG_L;
2176 else if (stack_override == R2L)
2177 bidi_it->type_after_wn = STRONG_R;
2178 else
2179 bidi_it->type_after_wn = type;
2180 }
2181 break;
2182 case PDF: /* X7 */
2183 bidi_it->type_after_wn = type;
2184 bidi_check_type (bidi_it->type_after_wn);
2185 type = WEAK_BN; /* X9/Retaining */
2186 break;
2187 default:
2188 /* Nothing. */
2189 break;
2190 }
2191
2192 bidi_it->type = type;
2193 bidi_check_type (bidi_it->type);
2194
2195 if (bidi_it->type == NEUTRAL_B) /* X8 */
2196 {
2197 bidi_set_paragraph_end (bidi_it);
2198 /* This is needed by bidi_resolve_weak below, and in L1. */
2199 bidi_it->type_after_wn = bidi_it->type;
2200 }
2201
2202 eassert (bidi_it->resolved_level >= 0);
2203 return bidi_it->resolved_level;
2204 }
2205
2206 /* Advance in the buffer/string, resolve weak types and return the
2207 type of the next character after weak type resolution. */
2208 static bidi_type_t
2209 bidi_resolve_weak (struct bidi_it *bidi_it)
2210 {
2211 bidi_type_t type;
2212 bidi_dir_t override;
2213 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2214 int new_level = bidi_resolve_explicit (bidi_it);
2215 int next_char;
2216 bidi_type_t type_of_next;
2217 struct bidi_it saved_it;
2218 ptrdiff_t eob
2219 = ((STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
2220 ? bidi_it->string.schars : ZV);
2221
2222 type = bidi_it->type;
2223 override = OVERRIDE (bidi_it, bidi_it->stack_idx);
2224
2225 eassert (!(type == UNKNOWN_BT
2226 || type == LRE
2227 || type == LRO
2228 || type == RLE
2229 || type == RLO
2230 || type == PDF));
2231
2232 eassert (prev_level >= 0);
2233 if (bidi_it->type == NEUTRAL_B)
2234 {
2235 /* We've got a new isolating sequence, compute the directional
2236 type of sos and initialize per-run variables (UAX#9, clause
2237 X10). */
2238 bidi_set_sos_type (bidi_it, prev_level, new_level);
2239 }
2240 if (type == NEUTRAL_S || type == NEUTRAL_WS
2241 || type == WEAK_BN || type == STRONG_AL)
2242 bidi_it->type_after_wn = type; /* needed in L1 */
2243 bidi_check_type (bidi_it->type_after_wn);
2244
2245 /* Level and directional override status are already recorded in
2246 bidi_it, and do not need any change; see X6. */
2247 if (override == R2L) /* X6 */
2248 type = STRONG_R;
2249 else if (override == L2R)
2250 type = STRONG_L;
2251 else
2252 {
2253 if (type == WEAK_NSM) /* W1 */
2254 {
2255 /* Note that we don't need to consider the case where the
2256 prev character has its type overridden by an RLO or LRO,
2257 because then either the type of this NSM would have been
2258 also overridden, or the previous character is outside the
2259 current level run, and thus not relevant to this NSM.
2260 This is why NSM gets the type_after_wn of the previous
2261 character. */
2262 /* bidi_set_sos_type sets type_after_wn to UNKNOWN_BT. */
2263 if (bidi_it->prev.type != UNKNOWN_BT
2264 /* If type_after_wn is NEUTRAL_B, this NSM is at sos. */
2265 && bidi_it->prev.type != NEUTRAL_B)
2266 {
2267 if (bidi_isolate_fmt_char (bidi_it->prev.type))
2268 {
2269 /* From W1: "Note that in an isolating run sequence,
2270 an isolate initiator followed by an NSM or any
2271 type other than PDI must be an overflow isolate
2272 initiator." */
2273 eassert (bidi_it->invalid_isolates > 0);
2274 type = NEUTRAL_ON;
2275 }
2276 else
2277 {
2278 /* This includes the Unicode 8.0 correction for N0,
2279 due to how we set prev.type in bidi_resolve_explicit,
2280 which see. */
2281 type = bidi_it->prev.type;
2282 }
2283 }
2284 else if (bidi_it->sos == R2L)
2285 type = STRONG_R;
2286 else if (bidi_it->sos == L2R)
2287 type = STRONG_L;
2288 else /* shouldn't happen! */
2289 emacs_abort ();
2290 }
2291 if (type == WEAK_EN /* W2 */
2292 && bidi_it->last_strong.type == STRONG_AL)
2293 type = WEAK_AN;
2294 else if (type == STRONG_AL) /* W3 */
2295 type = STRONG_R;
2296 else if ((type == WEAK_ES /* W4 */
2297 && bidi_it->prev.type == WEAK_EN
2298 && bidi_it->prev.orig_type == WEAK_EN)
2299 || (type == WEAK_CS
2300 && ((bidi_it->prev.type == WEAK_EN
2301 && bidi_it->prev.orig_type == WEAK_EN)
2302 || bidi_it->prev.type == WEAK_AN)))
2303 {
2304 const unsigned char *s
2305 = (STRINGP (bidi_it->string.lstring)
2306 ? SDATA (bidi_it->string.lstring)
2307 : bidi_it->string.s);
2308
2309 next_char = (bidi_it->charpos + bidi_it->nchars >= eob
2310 ? BIDI_EOB
2311 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
2312 s, bidi_it->string.unibyte));
2313 type_of_next = bidi_get_type (next_char, override);
2314
2315 if (type_of_next == WEAK_BN
2316 || bidi_explicit_dir_char (next_char))
2317 {
2318 bidi_copy_it (&saved_it, bidi_it);
2319 while (bidi_resolve_explicit (bidi_it) == new_level
2320 && bidi_it->type == WEAK_BN)
2321 type_of_next = bidi_it->type;
2322 bidi_copy_it (bidi_it, &saved_it);
2323 }
2324
2325 /* If the next character is EN, but the last strong-type
2326 character is AL, that next EN will be changed to AN when
2327 we process it in W2 above. So in that case, this ES
2328 should not be changed into EN. */
2329 if (type == WEAK_ES
2330 && type_of_next == WEAK_EN
2331 && bidi_it->last_strong.type != STRONG_AL)
2332 type = WEAK_EN;
2333 else if (type == WEAK_CS)
2334 {
2335 if (bidi_it->prev.type == WEAK_AN
2336 && (type_of_next == WEAK_AN
2337 /* If the next character is EN, but the last
2338 strong-type character is AL, EN will be later
2339 changed to AN when we process it in W2 above.
2340 So in that case, this ES should not be
2341 changed into EN. */
2342 || (type_of_next == WEAK_EN
2343 && bidi_it->last_strong.type == STRONG_AL)))
2344 type = WEAK_AN;
2345 else if (bidi_it->prev.type == WEAK_EN
2346 && type_of_next == WEAK_EN
2347 && bidi_it->last_strong.type != STRONG_AL)
2348 type = WEAK_EN;
2349 }
2350 }
2351 else if (type == WEAK_ET /* W5: ET with EN before or after it */
2352 || type == WEAK_BN) /* W5/Retaining */
2353 {
2354 if (bidi_it->prev.type == WEAK_EN) /* ET/BN w/EN before it */
2355 type = WEAK_EN;
2356 else if (bidi_it->next_en_pos > bidi_it->charpos
2357 && bidi_it->next_en_type != WEAK_BN)
2358 {
2359 if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
2360 type = WEAK_EN;
2361 }
2362 else if (type == WEAK_BN
2363 /* This condition is for the following important case:
2364
2365 . we are at level zero
2366 . either previous strong character was L,
2367 or we've seen no strong characters since sos
2368 and the base paragraph direction is L2R
2369 . this BN is NOT a bidi directional control
2370
2371 For such a situation, either this BN will be
2372 converted to EN per W5, and then to L by virtue
2373 of W7; or it will become ON per W6, and then L
2374 because of N1/N2. So we take a shortcut here
2375 and make it L right away, to avoid the
2376 potentially costly loop below. This is
2377 important when the buffer has a long series of
2378 control characters, like binary nulls, and no
2379 R2L characters at all. */
2380 && new_level == 0
2381 && !bidi_explicit_dir_char (bidi_it->ch)
2382 && ((bidi_it->last_strong.type == STRONG_L)
2383 || (bidi_it->last_strong.type == UNKNOWN_BT
2384 && bidi_it->sos == L2R)))
2385 type = STRONG_L;
2386 else if (bidi_it->next_en_pos >= 0)
2387 {
2388 /* We overstepped the last known position for ET
2389 resolution but there could be other such characters
2390 in this paragraph (when we are sure there are no more
2391 such positions, we set next_en_pos to a negative
2392 value). Try to find the next position for ET
2393 resolution. */
2394 ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
2395 const unsigned char *s = (STRINGP (bidi_it->string.lstring)
2396 ? SDATA (bidi_it->string.lstring)
2397 : bidi_it->string.s);
2398
2399 if (bidi_it->nchars <= 0)
2400 emacs_abort ();
2401 next_char
2402 = (bidi_it->charpos + bidi_it->nchars >= eob
2403 ? BIDI_EOB
2404 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
2405 bidi_it->string.unibyte));
2406 type_of_next = bidi_get_type (next_char, override);
2407
2408 if (type_of_next == WEAK_ET
2409 || type_of_next == WEAK_BN
2410 || bidi_explicit_dir_char (next_char))
2411 {
2412 bidi_copy_it (&saved_it, bidi_it);
2413 while (bidi_resolve_explicit (bidi_it) == new_level
2414 && (bidi_it->type == WEAK_BN
2415 || bidi_it->type == WEAK_ET))
2416 type_of_next = bidi_it->type;
2417 if (type == WEAK_BN
2418 && bidi_it->charpos == saved_it.charpos + saved_it.nchars)
2419 {
2420 /* If we entered the above loop with a BN that
2421 changes the level, the type of next
2422 character, which is in a different level, is
2423 not relevant to resolving this series of ET
2424 and BN. */
2425 en_pos = saved_it.charpos;
2426 type_of_next = type;
2427 }
2428 else
2429 en_pos = bidi_it->charpos;
2430 bidi_copy_it (bidi_it, &saved_it);
2431 }
2432 /* Remember this position, to speed up processing of the
2433 next ETs. */
2434 bidi_it->next_en_pos = en_pos;
2435 if (type_of_next == WEAK_EN)
2436 {
2437 /* If the last strong character is AL, the EN we've
2438 found will become AN when we get to it (W2). */
2439 if (bidi_it->last_strong.type == STRONG_AL)
2440 type_of_next = WEAK_AN;
2441 else if (type == WEAK_BN)
2442 type = NEUTRAL_ON; /* W6/Retaining */
2443 else
2444 type = WEAK_EN;
2445 }
2446 else if (type_of_next == NEUTRAL_B)
2447 /* Record the fact that there are no more ENs from
2448 here to the end of paragraph, to avoid entering the
2449 loop above ever again in this paragraph. */
2450 bidi_it->next_en_pos = -1;
2451 /* Record the type of the character where we ended our search. */
2452 bidi_it->next_en_type = type_of_next;
2453 }
2454 }
2455 }
2456
2457 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
2458 || (type == WEAK_BN
2459 && (bidi_it->prev.type == WEAK_CS /* W6/Retaining */
2460 || bidi_it->prev.type == WEAK_ES
2461 || bidi_it->prev.type == WEAK_ET)))
2462 type = NEUTRAL_ON;
2463
2464 /* Store the type we've got so far, before we clobber it with strong
2465 types in W7 and while resolving neutral types. But leave alone
2466 the original types that were recorded above, because we will need
2467 them for the L1 clause. */
2468 if (bidi_it->type_after_wn == UNKNOWN_BT)
2469 bidi_it->type_after_wn = type;
2470 bidi_check_type (bidi_it->type_after_wn);
2471
2472 if (type == WEAK_EN) /* W7 */
2473 {
2474 if ((bidi_it->last_strong.type == STRONG_L)
2475 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sos == L2R))
2476 type = STRONG_L;
2477 }
2478
2479 bidi_it->type = type;
2480 bidi_check_type (bidi_it->type);
2481 return type;
2482 }
2483
2484 /* Resolve the type of a neutral character according to the type of
2485 surrounding strong text and the current embedding level. */
2486 static bidi_type_t
2487 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
2488 {
2489 /* N1: "European and Arabic numbers act as if they were R in terms
2490 of their influence on NIs." */
2491 if (next_type == WEAK_EN || next_type == WEAK_AN)
2492 next_type = STRONG_R;
2493 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
2494 prev_type = STRONG_R;
2495
2496 if (next_type == prev_type) /* N1 */
2497 return next_type;
2498 else if ((lev & 1) == 0) /* N2 */
2499 return STRONG_L;
2500 else
2501 return STRONG_R;
2502 }
2503
2504 #define FLAG_EMBEDDING_INSIDE 1
2505 #define FLAG_OPPOSITE_INSIDE 2
2506
2507 /* A data type used in the stack maintained by
2508 bidi_find_bracket_pairs below. */
2509 typedef struct bpa_stack_entry {
2510 int close_bracket_char;
2511 int open_bracket_idx;
2512 #ifdef ENABLE_CHECKING
2513 ptrdiff_t open_bracket_pos;
2514 #endif
2515 unsigned flags : 2;
2516 } bpa_stack_entry;
2517
2518 /* Allow for the two struct bidi_it objects too, since they can be big.
2519 With MAX_ALLOCA of 16 KiB, this should allow at least 900 slots in the
2520 BPA stack, which should be more than enough for actual bidi text. */
2521 enum { MAX_BPA_STACK = max (1, ((MAX_ALLOCA - 2 * sizeof (struct bidi_it))
2522 / sizeof (bpa_stack_entry))) };
2523
2524 /* UAX#9 says to match opening brackets with the matching closing
2525 brackets or their canonical equivalents. As of Unicode 8.0, there
2526 are only 2 bracket characters that have canonical equivalence
2527 decompositions: u+2329 and u+232A. So instead of accessing the
2528 table in uni-decomposition.el, we just handle these 2 characters
2529 with this simple macro. Note that ASCII characters don't have
2530 canonical equivalents by definition. */
2531
2532 /* To find all the characters that need to be processed by
2533 CANONICAL_EQU, first find all the characters which have
2534 decompositions in UnicodeData.txt, with this Awk script:
2535
2536 awk -F ";" " {if ($6 != \"\") print $1, $6}" UnicodeData.txt
2537
2538 Then produce a list of all the bracket characters in BidiBrackets.txt:
2539
2540 awk -F "[ ;]" " {if ($1 != \"#\" && $1 != \"\") print $1}" BidiBrackets.txt
2541
2542 And finally, cross-reference these two:
2543
2544 grep -Fw -f brackets.txt decompositions.txt
2545
2546 where "decompositions.txt" was produced by the 1st script, and
2547 "brackets.txt" by the 2nd script. In the output of grep, look
2548 only for decompositions that don't begin with some compatibility
2549 formatting tag, such as "<compat>". Only decompositions that
2550 consist solely of character codepoints are relevant to bidi
2551 brackets processing. */
2552
2553 #define CANONICAL_EQU(c) \
2554 ( ASCII_CHAR_P (c) ? c \
2555 : (c) == LEFT_POINTING_ANGLE_BRACKET ? LEFT_ANGLE_BRACKET \
2556 : (c) == RIGHT_POINTING_ANGLE_BRACKET ? RIGHT_ANGLE_BRACKET \
2557 : c )
2558
2559 #ifdef ENABLE_CHECKING
2560 # define STORE_BRACKET_CHARPOS \
2561 bpa_stack[bpa_sp].open_bracket_pos = bidi_it->charpos
2562 #else
2563 # define STORE_BRACKET_CHARPOS /* nothing */
2564 #endif
2565
2566 #define PUSH_BPA_STACK \
2567 do { \
2568 int ch; \
2569 if (bpa_sp < MAX_BPA_STACK - 1 && bidi_cache_last_idx <= INT_MAX) \
2570 { \
2571 bpa_sp++; \
2572 ch = CANONICAL_EQU (bidi_it->ch); \
2573 bpa_stack[bpa_sp].close_bracket_char = bidi_mirror_char (ch); \
2574 bpa_stack[bpa_sp].open_bracket_idx = bidi_cache_last_idx; \
2575 bpa_stack[bpa_sp].flags = 0; \
2576 STORE_BRACKET_CHARPOS; \
2577 } \
2578 } while (0)
2579
2580
2581 /* This function implements BPA, the Bidi Parenthesis Algorithm,
2582 described in BD16 and N0 of UAX#9. It finds all the bracket pairs
2583 in the current isolating sequence, and records the enclosed type
2584 and the position of the matching bracket in the cache. It returns
2585 non-zero if called with the iterator on the opening bracket which
2586 has a matching closing bracket in the current isolating sequence,
2587 zero otherwise. */
2588 static bool
2589 bidi_find_bracket_pairs (struct bidi_it *bidi_it)
2590 {
2591 bidi_bracket_type_t btype;
2592 bidi_type_t type = bidi_it->type;
2593 bool retval = false;
2594 ptrdiff_t n = 0;
2595
2596 /* When scanning backwards, we don't expect any unresolved bidi
2597 bracket characters. */
2598 if (bidi_it->scan_dir != 1)
2599 emacs_abort ();
2600
2601 btype = bidi_paired_bracket_type (bidi_it->ch);
2602 if (btype == BIDI_BRACKET_OPEN)
2603 {
2604 bpa_stack_entry bpa_stack[MAX_BPA_STACK];
2605 int bpa_sp = -1;
2606 struct bidi_it saved_it;
2607 int base_level = bidi_it->level_stack[0].level;
2608 int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2609 int maxlevel = embedding_level;
2610 bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2611 struct bidi_it tem_it;
2612 bool l2r_seen = false, r2l_seen = false;
2613 ptrdiff_t pairing_pos;
2614 int idx_at_entry = bidi_cache_idx;
2615
2616 verify (MAX_BPA_STACK >= 100);
2617 bidi_copy_it (&saved_it, bidi_it);
2618 /* bidi_cache_iterator_state refuses to cache on backward scans,
2619 and bidi_cache_fetch_state doesn't bring scan_dir from the
2620 cache, so we must initialize this explicitly. */
2621 tem_it.scan_dir = 1;
2622
2623 while (1)
2624 {
2625 int old_sidx, new_sidx;
2626 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2627
2628 if (maxlevel < current_level)
2629 maxlevel = current_level;
2630 /* Mark every opening bracket character we've traversed by
2631 putting its own position into bracket_pairing_pos. This
2632 is examined in bidi_resolve_brackets to distinguish
2633 brackets that were already resolved to stay NEUTRAL_ON,
2634 and those that were not yet processed by this function
2635 (because they were skipped when we skip higher embedding
2636 levels below). */
2637 if (btype == BIDI_BRACKET_OPEN && bidi_it->bracket_pairing_pos == -1)
2638 bidi_it->bracket_pairing_pos = bidi_it->charpos;
2639 if (!bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0))
2640 {
2641 /* No more space in cache -- give up and let the opening
2642 bracket that started this be processed as a
2643 NEUTRAL_ON. */
2644 bidi_cache_reset_to (idx_at_entry - bidi_cache_start);
2645 bidi_copy_it (bidi_it, &saved_it);
2646 goto give_up;
2647 }
2648 if (btype == BIDI_BRACKET_OPEN)
2649 PUSH_BPA_STACK;
2650 else if (btype == BIDI_BRACKET_CLOSE)
2651 {
2652 int sp = bpa_sp;
2653 int curchar = CANONICAL_EQU (bidi_it->ch);
2654
2655 eassert (sp >= 0);
2656 while (sp >= 0 && bpa_stack[sp].close_bracket_char != curchar)
2657 sp--;
2658 if (sp >= 0)
2659 {
2660 /* Update and cache the corresponding opening bracket. */
2661 bidi_cache_fetch_state (bpa_stack[sp].open_bracket_idx,
2662 &tem_it);
2663 #ifdef ENABLE_CHECKING
2664 eassert (bpa_stack[sp].open_bracket_pos == tem_it.charpos);
2665 #endif
2666 /* Determine the enclosed type for this bracket
2667 pair's type resolution according to N0. */
2668 if (bpa_stack[sp].flags & FLAG_EMBEDDING_INSIDE)
2669 tem_it.bracket_enclosed_type = embedding_type; /* N0b */
2670 else if (bpa_stack[sp].flags & FLAG_OPPOSITE_INSIDE)
2671 tem_it.bracket_enclosed_type /* N0c */
2672 = (embedding_type == STRONG_L ? STRONG_R : STRONG_L);
2673 else /* N0d */
2674 tem_it.bracket_enclosed_type = UNKNOWN_BT;
2675
2676 /* Record the position of the matching closing
2677 bracket, and update the cache. */
2678 tem_it.bracket_pairing_pos = bidi_it->charpos;
2679 bidi_cache_iterator_state (&tem_it, 0, 1);
2680
2681 /* Pop the BPA stack. */
2682 bpa_sp = sp - 1;
2683 }
2684 if (bpa_sp < 0)
2685 {
2686 retval = true;
2687 break;
2688 }
2689 }
2690 else if (bidi_get_category (bidi_it->type_after_wn) != NEUTRAL)
2691 {
2692 unsigned flag = 0;
2693 int sp;
2694
2695 /* Whenever we see a strong type, update the flags of
2696 all the slots on the stack. */
2697 switch (bidi_it->type)
2698 {
2699 case STRONG_L:
2700 flag = ((embedding_level & 1) == 0
2701 ? FLAG_EMBEDDING_INSIDE
2702 : FLAG_OPPOSITE_INSIDE);
2703 l2r_seen = true;
2704 break;
2705 case STRONG_R:
2706 case WEAK_EN:
2707 case WEAK_AN:
2708 flag = ((embedding_level & 1) == 1
2709 ? FLAG_EMBEDDING_INSIDE
2710 : FLAG_OPPOSITE_INSIDE);
2711 r2l_seen = true;
2712 break;
2713 default:
2714 break;
2715 }
2716 if (flag)
2717 {
2718 for (sp = bpa_sp; sp >= 0; sp--)
2719 bpa_stack[sp].flags |= flag;
2720 }
2721 }
2722 old_sidx = bidi_it->stack_idx;
2723 type = bidi_resolve_weak (bidi_it);
2724 n++;
2725 /* Skip level runs excluded from this isolating run sequence. */
2726 new_sidx = bidi_it->stack_idx;
2727 if (bidi_it->level_stack[new_sidx].level > current_level
2728 && (ISOLATE_STATUS (bidi_it, new_sidx)
2729 || (new_sidx > old_sidx + 1
2730 && ISOLATE_STATUS (bidi_it, new_sidx - 1))))
2731 {
2732 while (bidi_it->level_stack[bidi_it->stack_idx].level
2733 > current_level)
2734 {
2735 if (maxlevel < bidi_it->level_stack[bidi_it->stack_idx].level)
2736 maxlevel = bidi_it->level_stack[bidi_it->stack_idx].level;
2737 if (!bidi_cache_iterator_state (bidi_it,
2738 type == NEUTRAL_B, 0))
2739 {
2740 /* No more space in cache -- give up and let the
2741 opening bracket that started this be
2742 processed as any other NEUTRAL_ON. */
2743 bidi_cache_reset_to (idx_at_entry - bidi_cache_start);
2744 bidi_copy_it (bidi_it, &saved_it);
2745 goto give_up;
2746 }
2747 type = bidi_resolve_weak (bidi_it);
2748 n++;
2749 }
2750 }
2751 if (type == NEUTRAL_B
2752 || (bidi_it->level_stack[bidi_it->stack_idx].level
2753 != current_level))
2754 {
2755 /* We've marched all the way to the end of this
2756 isolating run sequence, and didn't find matching
2757 closing brackets for some opening brackets. Leave
2758 their type unchanged. */
2759 pairing_pos = bidi_it->charpos;
2760 break;
2761 }
2762 if (bidi_it->type_after_wn == NEUTRAL_ON) /* Unicode 8.0 correction */
2763 btype = bidi_paired_bracket_type (bidi_it->ch);
2764 else
2765 btype = BIDI_BRACKET_NONE;
2766 }
2767
2768 /* Restore bidi_it from the cache, which should have the bracket
2769 resolution members set as determined by the above loop. */
2770 type = bidi_cache_find (saved_it.charpos, 0, bidi_it);
2771 eassert (type == NEUTRAL_ON);
2772
2773 /* The following is an optimization for bracketed text that has
2774 only one level which is equal to the paragraph's base
2775 embedding level. That is, only L2R and weak/neutral
2776 characters in a L2R paragraph, or only R2L and weak/neutral
2777 characters in a R2L paragraph. Such brackets can be resolved
2778 by bidi_resolve_neutral, which has a further shortcut for
2779 this case. So we pretend we did not resolve the brackets in
2780 this case, set up next_for_neutral for the entire bracketed
2781 text, and reset the cache to the character before the opening
2782 bracket. The upshot is to allow bidi_move_to_visually_next
2783 reset the cache when it returns this opening bracket, thus
2784 cutting significantly on the size of the cache, which is
2785 important with long lines, especially if word-wrap is non-nil
2786 (which requires the display engine to copy the cache back and
2787 forth many times). */
2788 if (maxlevel == base_level
2789 && (l2r_seen || r2l_seen) /* N0d */
2790 && ((base_level == 0 && !r2l_seen)
2791 || (base_level == 1 && !l2r_seen)))
2792 {
2793 ptrdiff_t eob
2794 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2795 ? bidi_it->string.schars : ZV);
2796
2797 if (retval)
2798 pairing_pos = bidi_it->bracket_pairing_pos;
2799
2800 /* This special value (which cannot possibly happen when
2801 brackets are resolved, since there's no character at ZV)
2802 will be noticed by bidi_resolve_explicit, and will be
2803 copied to the following iterator states, instead of being
2804 reset to -1. */
2805 bidi_it->bracket_pairing_pos = eob;
2806 /* This type value will be used for resolving the outermost
2807 closing bracket in bidi_resolve_brackets. */
2808 bidi_it->bracket_enclosed_type = embedding_type;
2809 /* bidi_cache_last_idx is set to the index of the current
2810 state, because we just called bidi_cache_find above.
2811 That state describes the outermost opening bracket, the
2812 one with which we entered this function. Force the cache
2813 to "forget" all the cached states starting from that state. */
2814 bidi_cache_reset_to (bidi_cache_last_idx - bidi_cache_start);
2815 /* Set up the next_for_neutral member, to help
2816 bidi_resolve_neutral. */
2817 bidi_it->next_for_neutral.type = embedding_type;
2818 bidi_it->next_for_neutral.charpos = pairing_pos;
2819 /* Pretend we didn't resolve this bracket. */
2820 retval = false;
2821 }
2822 }
2823
2824 give_up:
2825 /* The factor of 20 below is a heuristic that needs to be tuned. It
2826 means we consider 20 buffer positions examined by this function
2827 roughly equivalent to the display engine iterating over a single
2828 buffer position. */
2829 if (max_redisplay_ticks > 0 && n > 0)
2830 update_redisplay_ticks (n / 20 + 1, bidi_it->w);
2831 return retval;
2832 }
2833
2834 static void
2835 bidi_record_type_for_neutral (struct bidi_saved_info *info, int level,
2836 bool nextp)
2837 {
2838 int idx;
2839
2840 for (idx = bidi_cache_last_idx + 1; idx < bidi_cache_idx; idx++)
2841 {
2842 int lev = bidi_cache[idx].level_stack[bidi_cache[idx].stack_idx].level;
2843
2844 if (lev <= level)
2845 {
2846 eassert (lev == level);
2847 if (nextp)
2848 bidi_cache[idx].next_for_neutral = *info;
2849 else
2850 bidi_cache[idx].prev_for_neutral = *info;
2851 break;
2852 }
2853 }
2854 }
2855
2856 static bidi_type_t
2857 bidi_resolve_brackets (struct bidi_it *bidi_it)
2858 {
2859 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2860 bool resolve_bracket = false;
2861 bidi_type_t type = UNKNOWN_BT;
2862 int ch;
2863 struct bidi_saved_info prev_for_neutral, next_for_neutral;
2864 ptrdiff_t eob
2865 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2866 ? bidi_it->string.schars : ZV);
2867
2868 /* Record the prev_for_neutral type either from the previous
2869 character, if it was a strong or AN/EN, or from the
2870 prev_for_neutral information recorded previously. */
2871 if (bidi_it->type == STRONG_L || bidi_it->type == STRONG_R
2872 || bidi_it->type == WEAK_AN || bidi_it->type == WEAK_EN)
2873 bidi_remember_char (&prev_for_neutral, bidi_it, 1);
2874 else
2875 prev_for_neutral = bidi_it->prev_for_neutral;
2876 /* Record the next_for_neutral type information. */
2877 if (bidi_it->next_for_neutral.charpos > bidi_it->charpos)
2878 next_for_neutral = bidi_it->next_for_neutral;
2879 else
2880 next_for_neutral.charpos = -1;
2881 if (!bidi_it->first_elt)
2882 {
2883 type = bidi_cache_find (bidi_it->charpos + bidi_it->nchars, 0, bidi_it);
2884 ch = bidi_it->ch;
2885 }
2886 if (type == UNKNOWN_BT)
2887 {
2888 type = bidi_resolve_weak (bidi_it);
2889 if (type == NEUTRAL_ON)
2890 {
2891 /* bracket_pairing_pos == eob means this bracket does not
2892 need to be resolved as a bracket, but as a neutral, see
2893 the optimization trick we play near the end of
2894 bidi_find_bracket_pairs. */
2895 if (bidi_it->bracket_pairing_pos == eob)
2896 {
2897 /* If this is the outermost closing bracket of a run of
2898 characters in which we decided to resolve brackets as
2899 neutrals, use the embedding level's type, recorded in
2900 bracket_enclosed_type, to resolve the bracket. */
2901 if (bidi_it->next_for_neutral.charpos == bidi_it->charpos
2902 && bidi_paired_bracket_type (bidi_it->ch) == BIDI_BRACKET_CLOSE)
2903 type = bidi_it->bracket_enclosed_type;
2904 }
2905 else if (bidi_find_bracket_pairs (bidi_it))
2906 resolve_bracket = true;
2907 }
2908 }
2909 else if (bidi_it->bracket_pairing_pos != eob)
2910 {
2911 eassert (bidi_it->resolved_level == -1);
2912 /* If the cached state shows an increase of embedding level due
2913 to an isolate initiator, we need to update the 1st cached
2914 state of the next run of the current isolating sequence with
2915 the prev_for_neutral and next_for_neutral information, so
2916 that it will be picked up when we advance to that next run. */
2917 if (bidi_it->level_stack[bidi_it->stack_idx].level > prev_level
2918 && ISOLATE_STATUS (bidi_it, bidi_it->stack_idx))
2919 {
2920 bidi_record_type_for_neutral (&prev_for_neutral, prev_level, 0);
2921 bidi_record_type_for_neutral (&next_for_neutral, prev_level, 1);
2922 }
2923 if (type == NEUTRAL_ON
2924 && bidi_paired_bracket_type (ch) == BIDI_BRACKET_OPEN)
2925 {
2926 if (bidi_it->bracket_pairing_pos > bidi_it->charpos)
2927 {
2928 /* A cached opening bracket that wasn't completely
2929 resolved yet. */
2930 resolve_bracket = true;
2931 }
2932 else if (bidi_it->bracket_pairing_pos == -1)
2933 {
2934 /* Higher levels were not BPA-resolved yet, even if
2935 cached by bidi_find_bracket_pairs. Force application
2936 of BPA to the new level now. */
2937 if (bidi_find_bracket_pairs (bidi_it))
2938 resolve_bracket = true;
2939 }
2940 }
2941 /* Keep track of the prev_for_neutral and next_for_neutral
2942 types, needed for resolving brackets below and for resolving
2943 neutrals in bidi_resolve_neutral. */
2944 if (bidi_it->level_stack[bidi_it->stack_idx].level == prev_level)
2945 {
2946 bidi_it->prev_for_neutral = prev_for_neutral;
2947 if (next_for_neutral.charpos > 0)
2948 bidi_it->next_for_neutral = next_for_neutral;
2949 }
2950 }
2951
2952 /* If needed, resolve the bracket type according to N0. */
2953 if (resolve_bracket)
2954 {
2955 int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2956 bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2957
2958 eassert (bidi_it->bracket_pairing_pos > bidi_it->charpos);
2959 if (bidi_it->bracket_enclosed_type == embedding_type) /* N0b */
2960 type = embedding_type;
2961 else if (bidi_it->bracket_enclosed_type == STRONG_L /* N0c, N0d */
2962 || bidi_it->bracket_enclosed_type == STRONG_R)
2963 {
2964 bidi_type_t prev_type_for_neutral = bidi_it->prev_for_neutral.type;
2965
2966 if (prev_type_for_neutral == UNKNOWN_BT)
2967 prev_type_for_neutral = embedding_type;
2968 switch (prev_type_for_neutral)
2969 {
2970 case STRONG_R:
2971 case WEAK_EN:
2972 case WEAK_AN:
2973 type =
2974 (bidi_it->bracket_enclosed_type == STRONG_R) /* N0c */
2975 ? STRONG_R /* N0c1 */
2976 : embedding_type; /* N0c2 */
2977 break;
2978 case STRONG_L:
2979 type =
2980 (bidi_it->bracket_enclosed_type == STRONG_L) /* N0c */
2981 ? STRONG_L /* N0c1 */
2982 : embedding_type; /* N0c2 */
2983 break;
2984 default:
2985 /* N0d: Do not set the type for that bracket pair. */
2986 break;
2987 }
2988 }
2989 eassert (type == STRONG_L || type == STRONG_R || type == NEUTRAL_ON);
2990
2991 /* Update the type of the paired closing bracket to the same
2992 type as for the resolved opening bracket. */
2993 if (type != NEUTRAL_ON)
2994 {
2995 ptrdiff_t idx = bidi_cache_search (bidi_it->bracket_pairing_pos,
2996 -1, 1);
2997
2998 if (idx < bidi_cache_start)
2999 emacs_abort ();
3000 bidi_cache[idx].type = type;
3001 }
3002 }
3003
3004 return type;
3005 }
3006
3007 static bidi_type_t
3008 bidi_resolve_neutral (struct bidi_it *bidi_it)
3009 {
3010 bidi_type_t type = bidi_resolve_brackets (bidi_it);
3011 int current_level;
3012 bool is_neutral;
3013
3014 eassert (type == STRONG_R
3015 || type == STRONG_L
3016 || type == WEAK_BN
3017 || type == WEAK_EN
3018 || type == WEAK_AN
3019 || type == NEUTRAL_B
3020 || type == NEUTRAL_S
3021 || type == NEUTRAL_WS
3022 || type == NEUTRAL_ON
3023 || type == LRI
3024 || type == RLI
3025 || type == PDI);
3026
3027 current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
3028 eassert (current_level >= 0);
3029 is_neutral = bidi_get_category (type) == NEUTRAL;
3030
3031 if ((type != NEUTRAL_B /* Don't risk entering the long loop below if
3032 we are already at paragraph end. */
3033 && (is_neutral || bidi_isolate_fmt_char (type)))
3034 /* N1-N2/Retaining */
3035 || type == WEAK_BN)
3036 {
3037 if (bidi_it->next_for_neutral.type != UNKNOWN_BT
3038 && (bidi_it->next_for_neutral.charpos > bidi_it->charpos
3039 /* PDI defines an eos, so it's OK for it to serve as its
3040 own next_for_neutral. */
3041 || (bidi_it->next_for_neutral.charpos == bidi_it->charpos
3042 && bidi_it->type == PDI)))
3043 {
3044 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
3045 bidi_it->next_for_neutral.type,
3046 current_level);
3047 }
3048 /* The next two "else if" clauses are shortcuts for the
3049 important special case when we have a long sequence of
3050 neutral or WEAK_BN characters, such as whitespace or nulls or
3051 other control characters, on the base embedding level of the
3052 paragraph, and that sequence goes all the way to the end of
3053 the paragraph and follows a character whose resolved
3054 directionality is identical to the base embedding level.
3055 (This is what happens in a buffer with plain L2R text that
3056 happens to include long sequences of control characters.) By
3057 virtue of N1, the result of examining this long sequence will
3058 always be either STRONG_L or STRONG_R, depending on the base
3059 embedding level. So we use this fact directly instead of
3060 entering the expensive loop in the "else" clause. */
3061 else if (current_level == 0
3062 && bidi_it->prev_for_neutral.type == STRONG_L
3063 && (ASCII_CHAR_P (bidi_it->ch)
3064 || (type != WEAK_BN
3065 && !bidi_explicit_dir_char (bidi_it->ch)
3066 && !bidi_isolate_fmt_char (type))))
3067 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
3068 STRONG_L, current_level);
3069 else if (/* current level is 1 */
3070 current_level == 1
3071 /* base embedding level is also 1 */
3072 && bidi_it->level_stack[0].level == 1
3073 /* previous character is one of those considered R for
3074 the purposes of W5 */
3075 && (bidi_it->prev_for_neutral.type == STRONG_R
3076 || bidi_it->prev_for_neutral.type == WEAK_EN
3077 || bidi_it->prev_for_neutral.type == WEAK_AN)
3078 && type != WEAK_BN
3079 && !bidi_explicit_dir_char (bidi_it->ch)
3080 && !bidi_isolate_fmt_char (type))
3081 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
3082 STRONG_R, current_level);
3083 else
3084 {
3085 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
3086 the assumption of batch-style processing; see clauses W4,
3087 W5, and especially N1, which require looking far forward
3088 (as well as back) in the buffer/string. May the fleas of
3089 a thousand camels infest the armpits of those who design
3090 supposedly general-purpose algorithms by looking at their
3091 own implementations, and fail to consider other possible
3092 implementations! */
3093 struct bidi_it saved_it;
3094 bidi_type_t next_type;
3095 bool adjacent_to_neutrals = is_neutral;
3096
3097 bidi_copy_it (&saved_it, bidi_it);
3098 /* Scan the text forward until we find the first non-neutral
3099 character, and then use that to resolve the neutral we
3100 are dealing with now. We also cache the scanned iterator
3101 states, to salvage some of the effort later. */
3102 do {
3103 int old_sidx, new_sidx;
3104
3105 /* Paragraph separators have their levels fully resolved
3106 at this point, so cache them as resolved. */
3107 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
3108 old_sidx = bidi_it->stack_idx;
3109 type = bidi_resolve_brackets (bidi_it);
3110 /* Skip level runs excluded from this isolating run sequence. */
3111 new_sidx = bidi_it->stack_idx;
3112 if (bidi_it->level_stack[new_sidx].level > current_level
3113 && (ISOLATE_STATUS (bidi_it, new_sidx)
3114 /* This is for when we have an isolate initiator
3115 immediately followed by an embedding or
3116 override initiator, in which case we get the
3117 level stack pushed twice by the single call to
3118 bidi_resolve_weak above. */
3119 || (new_sidx > old_sidx + 1
3120 && ISOLATE_STATUS (bidi_it, new_sidx - 1))))
3121 {
3122 while (bidi_it->level_stack[bidi_it->stack_idx].level
3123 > current_level)
3124 {
3125 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
3126 type = bidi_resolve_brackets (bidi_it);
3127 }
3128 }
3129 if (!adjacent_to_neutrals
3130 && (bidi_get_category (type) == NEUTRAL
3131 || bidi_isolate_fmt_char (type)))
3132 adjacent_to_neutrals = true;
3133 } while (!(type == NEUTRAL_B
3134 || (type != WEAK_BN
3135 && bidi_get_category (type) != NEUTRAL
3136 && !bidi_isolate_fmt_char (type))
3137 /* This is all per level run, so stop when we
3138 reach the end of this level run. */
3139 || (bidi_it->level_stack[bidi_it->stack_idx].level
3140 != current_level)));
3141
3142 /* Record the character we stopped at. */
3143 bidi_remember_char (&saved_it.next_for_neutral, bidi_it, 1);
3144
3145 if ((bidi_it->level_stack[bidi_it->stack_idx].level != current_level)
3146 || type == NEUTRAL_B)
3147 {
3148 /* Marched all the way to the end of this level run. We
3149 need to use the eos type, whose information is stored
3150 by bidi_set_sos_type in the prev_for_neutral
3151 member. */
3152 if (adjacent_to_neutrals)
3153 next_type = bidi_it->prev_for_neutral.type;
3154 else
3155 {
3156 /* This is a BN which does not adjoin neutrals.
3157 Leave its type alone. */
3158 bidi_copy_it (bidi_it, &saved_it);
3159 return bidi_it->type;
3160 }
3161 }
3162 else
3163 {
3164 switch (type)
3165 {
3166 case STRONG_L:
3167 case STRONG_R:
3168 case STRONG_AL:
3169 /* Actually, STRONG_AL cannot happen here, because
3170 bidi_resolve_weak converts it to STRONG_R, per W3. */
3171 eassert (type != STRONG_AL);
3172 next_type = type;
3173 break;
3174 case WEAK_EN:
3175 case WEAK_AN:
3176 /* N1: "European and Arabic numbers act as if they
3177 were R in terms of their influence on NIs." */
3178 next_type = STRONG_R;
3179 break;
3180 default:
3181 emacs_abort ();
3182 break;
3183 }
3184 }
3185 /* Resolve the type of all the NIs found during the above loop. */
3186 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
3187 next_type, current_level);
3188 /* Update next_for_neutral with the resolved type, so we
3189 could use it for all the other NIs up to the place where
3190 we exited the loop. */
3191 saved_it.next_for_neutral.type = next_type;
3192 bidi_check_type (type);
3193 /* Update the character which caused us to enter the above loop. */
3194 saved_it.type = type;
3195 bidi_check_type (next_type);
3196 bidi_copy_it (bidi_it, &saved_it);
3197 }
3198 }
3199 return type;
3200 }
3201
3202 /* Given an iterator state in BIDI_IT, advance one character position
3203 in the buffer/string to the next character (in the logical order),
3204 resolve the bidi type of that next character, and return that
3205 type. */
3206 static bidi_type_t
3207 bidi_type_of_next_char (struct bidi_it *bidi_it)
3208 {
3209 bidi_type_t type;
3210
3211 /* This should always be called during a forward scan. */
3212 if (bidi_it->scan_dir != 1)
3213 emacs_abort ();
3214
3215 type = bidi_resolve_neutral (bidi_it);
3216
3217 return type;
3218 }
3219
3220 /* Given an iterator state BIDI_IT, advance one character position in
3221 the buffer/string to the next character (in the current scan
3222 direction), resolve the embedding and implicit levels of that next
3223 character, and return the resulting level. */
3224 static int
3225 bidi_level_of_next_char (struct bidi_it *bidi_it)
3226 {
3227 bidi_type_t type = UNKNOWN_BT;
3228 int level;
3229 ptrdiff_t next_char_pos = -2;
3230
3231 if (bidi_it->scan_dir == 1)
3232 {
3233 ptrdiff_t eob
3234 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3235 ? bidi_it->string.schars : ZV);
3236
3237 /* There's no sense in trying to advance if we've already hit
3238 the end of text. */
3239 if (bidi_it->charpos >= eob)
3240 {
3241 eassert (bidi_it->resolved_level >= 0);
3242 return bidi_it->resolved_level;
3243 }
3244 }
3245
3246 /* Perhaps the character we want is already cached as fully resolved.
3247 If it is, the call to bidi_cache_find below will return a type
3248 other than UNKNOWN_BT. */
3249 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
3250 {
3251 int bob = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3252 ? 0 : 1);
3253
3254 if (bidi_it->scan_dir > 0)
3255 {
3256 if (bidi_it->nchars <= 0)
3257 emacs_abort ();
3258 next_char_pos = bidi_it->charpos + bidi_it->nchars;
3259 }
3260 else if (bidi_it->charpos >= bob)
3261 /* Implementation note: we allow next_char_pos to be as low as
3262 0 for buffers or -1 for strings, and that is okay because
3263 that's the "position" of the sentinel iterator state we
3264 cached at the beginning of the iteration. */
3265 next_char_pos = bidi_it->charpos - 1;
3266 if (next_char_pos >= bob - 1)
3267 type = bidi_cache_find (next_char_pos, 1, bidi_it);
3268 if (type != UNKNOWN_BT)
3269 {
3270 /* We asked the cache for fully resolved states. */
3271 eassert (bidi_it->resolved_level >= 0);
3272 return bidi_it->resolved_level;
3273 }
3274 }
3275
3276 if (bidi_it->scan_dir == -1)
3277 /* If we are going backwards, the iterator state is already cached
3278 from previous scans, and should be fully resolved. */
3279 emacs_abort ();
3280
3281 if (type == UNKNOWN_BT)
3282 type = bidi_type_of_next_char (bidi_it);
3283
3284 if (type == NEUTRAL_B)
3285 {
3286 eassert (bidi_it->resolved_level >= 0);
3287 return bidi_it->resolved_level;
3288 }
3289
3290 level = bidi_it->level_stack[bidi_it->stack_idx].level;
3291
3292 eassert ((type == STRONG_R
3293 || type == STRONG_L
3294 || type == WEAK_BN
3295 || type == WEAK_EN
3296 || type == WEAK_AN));
3297 bidi_it->type = type;
3298 bidi_check_type (bidi_it->type);
3299
3300 /* For L1 below, we need to know, for each WS character, whether
3301 it belongs to a sequence of WS characters preceding a newline
3302 or a TAB or a paragraph separator. */
3303 if ((bidi_it->orig_type == NEUTRAL_WS
3304 || (bidi_it->orig_type == WEAK_BN
3305 /* If this BN character is already at base level, we don't
3306 need to consider resetting it, since I1 and I2 below
3307 will not change the level, so avoid the potentially
3308 costly loop below. */
3309 && level != bidi_it->level_stack[0].level)
3310 || bidi_isolate_fmt_char (bidi_it->orig_type))
3311 /* This means the informaition about WS resolution is not valid. */
3312 && bidi_it->next_for_ws.charpos < bidi_it->charpos)
3313 {
3314 int ch;
3315 ptrdiff_t clen = bidi_it->ch_len;
3316 ptrdiff_t bpos = bidi_it->bytepos;
3317 ptrdiff_t cpos = bidi_it->charpos;
3318 ptrdiff_t disp_pos = bidi_it->disp_pos;
3319 ptrdiff_t nc = bidi_it->nchars;
3320 struct bidi_string_data bs = bidi_it->string;
3321 bidi_type_t chtype;
3322 bool fwp = bidi_it->frame_window_p;
3323 int dpp = bidi_it->disp_prop;
3324
3325 if (bidi_it->nchars <= 0)
3326 emacs_abort ();
3327 do {
3328 ch = bidi_fetch_char (cpos += nc, bpos += clen, &disp_pos, &dpp, &bs,
3329 bidi_it->w, fwp, &clen, &nc);
3330 chtype = bidi_get_type (ch, NEUTRAL_DIR);
3331 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
3332 || bidi_isolate_fmt_char (chtype)
3333 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
3334 bidi_it->next_for_ws.type = chtype;
3335 bidi_check_type (bidi_it->next_for_ws.type);
3336 bidi_it->next_for_ws.charpos = cpos;
3337 }
3338
3339 /* Update the cache, but only if this state was already cached. */
3340 bidi_cache_iterator_state (bidi_it, 1, 1);
3341
3342 /* Resolve implicit levels. */
3343 if (bidi_it->orig_type == NEUTRAL_B /* L1 */
3344 || bidi_it->orig_type == NEUTRAL_S
3345 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
3346 || ((bidi_it->orig_type == NEUTRAL_WS
3347 || bidi_it->orig_type == WEAK_BN /* L1/Retaining */
3348 || bidi_isolate_fmt_char (bidi_it->orig_type)
3349 || bidi_explicit_dir_char (bidi_it->ch))
3350 && (bidi_it->next_for_ws.type == NEUTRAL_B
3351 || bidi_it->next_for_ws.type == NEUTRAL_S)))
3352 level = bidi_it->level_stack[0].level;
3353 else if ((level & 1) == 0) /* I1 */
3354 {
3355 if (type == STRONG_R)
3356 level++;
3357 else if (type == WEAK_EN || type == WEAK_AN)
3358 level += 2;
3359 }
3360 else /* I2 */
3361 {
3362 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
3363 level++;
3364 }
3365
3366 bidi_it->resolved_level = level;
3367 return level;
3368 }
3369
3370 /* Move to the other edge of a level given by LEVEL. If END_FLAG,
3371 we are at the end of a level, and we need to prepare to
3372 resume the scan of the lower level.
3373
3374 If this level's other edge is cached, we simply jump to it, filling
3375 the iterator structure with the iterator state on the other edge.
3376 Otherwise, we walk the buffer or string until we come back to the
3377 same level as LEVEL.
3378
3379 Note: we are not talking here about a ``level run'' in the UAX#9
3380 sense of the term, but rather about a ``level'' which includes
3381 all the levels higher than it. In other words, given the levels
3382 like this:
3383
3384 11111112222222333333334443343222222111111112223322111
3385 A B C
3386
3387 and assuming we are at point A scanning left to right, this
3388 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
3389 at point B. */
3390 static void
3391 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, bool end_flag)
3392 {
3393 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
3394 ptrdiff_t idx;
3395
3396 /* Try the cache first. */
3397 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
3398 >= bidi_cache_start)
3399 bidi_cache_fetch_state (idx, bidi_it);
3400 else
3401 {
3402 int new_level;
3403 ptrdiff_t pos0 = bidi_it->charpos;
3404
3405 /* If we are at end of level, its edges must be cached. */
3406 if (end_flag)
3407 emacs_abort ();
3408
3409 if (!bidi_cache_iterator_state (bidi_it, 1, 0))
3410 {
3411 /* Can't happen: if the cache needs to grow, it means we
3412 were at base embedding level, so the cache should have
3413 been either empty or already large enough to cover this
3414 character position. */
3415 emacs_abort ();
3416 }
3417 do {
3418 new_level = bidi_level_of_next_char (bidi_it);
3419 /* If the cache is full, perform an emergency return by
3420 pretending that the level ended. */
3421 if (!bidi_cache_iterator_state (bidi_it, 1, 0))
3422 {
3423 new_level = level - 1;
3424 /* Since the cache should only grow when we are scanning
3425 forward looking for the edge of the level that is one
3426 above the base embedding level, we can only have this
3427 contingency when LEVEL - 1 is the base embedding
3428 level. */
3429 eassert (new_level == bidi_it->level_stack[0].level);
3430 /* Plan B, for when the cache overflows: Back up to the
3431 previous character by fetching the last cached state,
3432 and force the resolved level of that character be the
3433 base embedding level. */
3434 bidi_cache_fetch_state (bidi_cache_idx - 1, bidi_it);
3435 bidi_it->resolved_level = new_level;
3436 bidi_cache_iterator_state (bidi_it, 1, 1);
3437 }
3438 } while (new_level >= level);
3439 /* The factor of 50 below is a heuristic that needs to be
3440 tuned. It means we consider 50 buffer positions examined by
3441 the above call roughly equivalent to the display engine
3442 iterating over a single buffer position. */
3443 if (max_redisplay_ticks > 0 && bidi_it->charpos > pos0)
3444 update_redisplay_ticks ((bidi_it->charpos - pos0) / 50 + 1, bidi_it->w);
3445 }
3446 }
3447
3448 void
3449 bidi_move_to_visually_next (struct bidi_it *bidi_it)
3450 {
3451 int old_level, new_level, next_level;
3452 struct bidi_it sentinel;
3453
3454 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
3455 emacs_abort ();
3456
3457 if (bidi_it->scan_dir == 0)
3458 {
3459 bidi_it->scan_dir = 1; /* default to logical order */
3460 }
3461
3462 /* If we just passed a newline, initialize for the next line. */
3463 if (!bidi_it->first_elt
3464 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3465 bidi_line_init (bidi_it);
3466
3467 /* Prepare the sentinel iterator state, and cache it. When we bump
3468 into it, scanning backwards, we'll know that the last non-base
3469 level is exhausted. */
3470 if (bidi_cache_idx == bidi_cache_start)
3471 {
3472 bidi_copy_it (&sentinel, bidi_it);
3473 if (bidi_it->first_elt)
3474 {
3475 sentinel.charpos--; /* cached charpos needs to be monotonic */
3476 sentinel.bytepos--;
3477 sentinel.ch = '\n'; /* doesn't matter, but why not? */
3478 sentinel.ch_len = 1;
3479 sentinel.nchars = 1;
3480 }
3481 bidi_cache_iterator_state (&sentinel, 1, 0);
3482 }
3483
3484 old_level = bidi_it->resolved_level;
3485 new_level = bidi_level_of_next_char (bidi_it);
3486
3487 /* Reordering of resolved levels (clause L2) is implemented by
3488 jumping to the other edge of the level and flipping direction of
3489 scanning the text whenever we find a level change. */
3490 if (new_level != old_level)
3491 {
3492 bool ascending = new_level > old_level;
3493 int level_to_search = ascending ? old_level + 1 : old_level;
3494 int incr = ascending ? 1 : -1;
3495 int expected_next_level = old_level + incr;
3496
3497 /* Jump (or walk) to the other edge of this level. */
3498 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3499 /* Switch scan direction and peek at the next character in the
3500 new direction. */
3501 bidi_it->scan_dir = -bidi_it->scan_dir;
3502
3503 /* The following loop handles the case where the resolved level
3504 jumps by more than one. This is typical for numbers inside a
3505 run of text with left-to-right embedding direction, but can
3506 also happen in other situations. In those cases the decision
3507 where to continue after a level change, and in what direction,
3508 is tricky. For example, given a text like below:
3509
3510 abcdefgh
3511 11336622
3512
3513 (where the numbers below the text show the resolved levels),
3514 the result of reordering according to UAX#9 should be this:
3515
3516 efdcghba
3517
3518 This is implemented by the loop below which flips direction
3519 and jumps to the other edge of the level each time it finds
3520 the new level not to be the expected one. The expected level
3521 is always one more or one less than the previous one. */
3522 next_level = bidi_peek_at_next_level (bidi_it);
3523 while (next_level != expected_next_level)
3524 {
3525 /* If next_level is -1, it means we have an unresolved level
3526 in the cache, which at this point should not happen. If
3527 it does, we will infloop. */
3528 eassert (next_level >= 0);
3529 /* If next_level is not consistent with incr, we might
3530 infloop. */
3531 eassert (incr > 0
3532 ? next_level > expected_next_level
3533 : next_level < expected_next_level);
3534 expected_next_level += incr;
3535 level_to_search += incr;
3536 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3537 bidi_it->scan_dir = -bidi_it->scan_dir;
3538 next_level = bidi_peek_at_next_level (bidi_it);
3539 }
3540
3541 /* Finally, deliver the next character in the new direction. */
3542 next_level = bidi_level_of_next_char (bidi_it);
3543 }
3544
3545 /* Take note when we have just processed the newline that precedes
3546 the end of the paragraph. The next time we are about to be
3547 called, set_iterator_to_next will automatically reinit the
3548 paragraph direction, if needed. We do this at the newline before
3549 the paragraph separator, because the next character might not be
3550 the first character of the next paragraph, due to the bidi
3551 reordering, whereas we _must_ know the paragraph base direction
3552 _before_ we process the paragraph's text, since the base
3553 direction affects the reordering. */
3554 if (bidi_it->scan_dir == 1
3555 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3556 {
3557 /* The paragraph direction of the entire string, once
3558 determined, is in effect for the entire string. Setting the
3559 separator limit to the end of the string prevents
3560 bidi_paragraph_init from being called automatically on this
3561 string. */
3562 if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3563 bidi_it->separator_limit = bidi_it->string.schars;
3564 else if (bidi_it->bytepos < ZV_BYTE)
3565 {
3566 ptrdiff_t sep_len
3567 = bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
3568 bidi_it->bytepos + bidi_it->ch_len);
3569 if (bidi_it->nchars <= 0)
3570 emacs_abort ();
3571 if (sep_len >= 0)
3572 {
3573 bidi_it->new_paragraph = 1;
3574 /* Record the buffer position of the last character of
3575 the paragraph separator. If the paragraph separator
3576 is an empty string (e.g., the regex is "^"), the
3577 newline that precedes the end of the paragraph is
3578 that last character. */
3579 if (sep_len > 0)
3580 bidi_it->separator_limit
3581 = bidi_it->charpos + bidi_it->nchars + sep_len;
3582 else
3583 bidi_it->separator_limit = bidi_it->charpos;
3584 }
3585 }
3586 }
3587
3588 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
3589 {
3590 /* If we are at paragraph's base embedding level and beyond the
3591 last cached position, the cache's job is done and we can
3592 discard it. */
3593 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
3594 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
3595 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
3596 bidi_cache_reset ();
3597 /* Also reset the cache if it overflowed and we have just
3598 emergency-exited using Plan B. */
3599 else if (bidi_it->resolved_level == bidi_it->level_stack[0].level
3600 && bidi_cache_idx >= bidi_cache_size
3601 && bidi_it->charpos == bidi_cache[bidi_cache_idx - 1].charpos)
3602 bidi_cache_reset ();
3603 /* But as long as we are caching during forward scan, we must
3604 cache each state, or else the cache integrity will be
3605 compromised: it assumes cached states correspond to buffer
3606 positions 1:1. */
3607 else
3608 bidi_cache_iterator_state (bidi_it, 1, 0);
3609 }
3610
3611 eassert (bidi_it->resolved_level >= 0
3612 && bidi_it->resolved_level <= BIDI_MAXDEPTH + 2);
3613 }
3614
3615 /* Utility function for looking for strong directional characters
3616 whose bidi type was overridden by directional override or embedding
3617 or isolate control characters. */
3618 ptrdiff_t
3619 bidi_find_first_overridden (struct bidi_it *bidi_it)
3620 {
3621 ptrdiff_t eob
3622 = STRINGP (bidi_it->string.lstring) ? bidi_it->string.schars : ZV;
3623 ptrdiff_t found_pos = eob;
3624 /* Maximum bidi levels we allow for L2R and R2L characters. Note
3625 that these are levels after resolving explicit embeddings,
3626 overrides, and isolates, i.e. before resolving implicit levels. */
3627 int max_l2r = bidi_it->paragraph_dir == L2R ? 0 : 2;
3628 int max_r2l = 1;
3629 /* Same for WEAK and NEUTRAL_ON types. */
3630 int max_weak = bidi_it->paragraph_dir == L2R ? 1 : 2;
3631
3632 do
3633 {
3634 /* Need to call bidi_resolve_weak, not bidi_resolve_explicit,
3635 because the directional overrides are applied by the
3636 former. */
3637 bidi_type_t type = bidi_resolve_weak (bidi_it);
3638 unsigned level = bidi_it->level_stack[bidi_it->stack_idx].level;
3639 bidi_category_t category = bidi_get_category (bidi_it->orig_type);
3640
3641 /* Detect strong L or R types that have been overridden by
3642 explicit overrides. */
3643 if ((type == STRONG_R && bidi_it->orig_type == STRONG_L)
3644 || (type == STRONG_L
3645 && (bidi_it->orig_type == STRONG_R
3646 || bidi_it->orig_type == STRONG_AL))
3647 /* Detect strong L or R types or WEAK_EN types that were
3648 pushed into higher embedding levels (and will thus
3649 reorder) by explicit embeddings and isolates. */
3650 || ((bidi_it->orig_type == STRONG_L
3651 || bidi_it->orig_type == WEAK_EN)
3652 && level > max_l2r)
3653 || ((bidi_it->orig_type == STRONG_R
3654 || bidi_it->orig_type == STRONG_AL)
3655 && level > max_r2l)
3656 /* Detect other weak or neutral types whose level was
3657 tweaked by explicit embeddings and isolates. */
3658 || ((category == WEAK || bidi_it->orig_type == NEUTRAL_ON)
3659 && level > max_weak))
3660 found_pos = bidi_it->charpos;
3661 } while (found_pos == eob
3662 && bidi_it->charpos < eob
3663 && bidi_it->ch != BIDI_EOB
3664 && bidi_it->ch != '\n');
3665
3666 return found_pos;
3667 }
3668
3669 /* This is meant to be called from within the debugger, whenever you
3670 wish to examine the cache contents. */
3671 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
3672 void
3673 bidi_dump_cached_states (void)
3674 {
3675 ptrdiff_t i;
3676 int ndigits = 1;
3677
3678 if (bidi_cache_idx == 0)
3679 {
3680 fputs ("The cache is empty.\n", stderr);
3681 return;
3682 }
3683 fprintf (stderr, "Total of %"pD"d state%s in cache:\n",
3684 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
3685
3686 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
3687 ndigits++;
3688 fputs ("ch ", stderr);
3689 for (i = 0; i < bidi_cache_idx; i++)
3690 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
3691 fputs ("\nlvl ", stderr);
3692 for (i = 0; i < bidi_cache_idx; i++)
3693 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
3694 fputs ("\npos ", stderr);
3695 for (i = 0; i < bidi_cache_idx; i++)
3696 fprintf (stderr, "%*"pD"d", ndigits, bidi_cache[i].charpos);
3697 putc ('\n', stderr);
3698 }