1 /* Analyze differences between two vectors.
2
3 Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2023 Free Software
4 Foundation, Inc.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19
20 /* The basic idea is to consider two vectors as similar if, when
21 transforming the first vector into the second vector through a
22 sequence of edits (inserts and deletes of one element each),
23 this sequence is short - or equivalently, if the ordered list
24 of elements that are untouched by these edits is long. For a
25 good introduction to the subject, read about the "Levenshtein
26 distance" in Wikipedia.
27
28 The basic algorithm is described in:
29 "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
30 Algorithmica Vol. 1, 1986, pp. 251-266,
31 <https://doi.org/10.1007/BF01840446>.
32 See especially section 4.2, which describes the variation used below.
33
34 The basic algorithm was independently discovered as described in:
35 "Algorithms for Approximate String Matching", Esko Ukkonen,
36 Information and Control Vol. 64, 1985, pp. 100-118,
37 <https://doi.org/10.1016/S0019-9958(85)80046-2>.
38
39 Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
40 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
41 at the price of producing suboptimal output for large inputs with
42 many differences. */
43
44 /* Before including this file, you need to define:
45 ELEMENT The element type of the vectors being compared.
46 EQUAL A two-argument macro that tests two elements for
47 equality.
48 OFFSET A signed integer type sufficient to hold the
49 difference between two indices. Usually
50 something like ptrdiff_t.
51 OFFSET_MAX (Optional) The maximum value of OFFSET (e.g.,
52 PTRDIFF_MAX). If omitted, it is inferred in a
53 way portable to the vast majority of C platforms,
54 as they lack padding bits.
55 EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'.
56 NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
57 NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
58 NOTE_ORDERED (Optional) A boolean expression saying that
59 NOTE_DELETE and NOTE_INSERT calls must be
60 issued in offset order.
61 EARLY_ABORT(ctxt) (Optional) A boolean expression that triggers an
62 early abort of the computation.
63 USE_HEURISTIC (Optional) Define if you want to support the
64 heuristic for large vectors.
65
66 It is also possible to use this file with abstract arrays. In this case,
67 xvec and yvec are not represented in memory. They only exist conceptually.
68 In this case, the list of defines above is amended as follows:
69 ELEMENT Undefined.
70 EQUAL Undefined.
71 XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
72 A three-argument macro: References xvec[xoff] and
73 yvec[yoff] and tests these elements for equality.
74
75 Before including this file, you also need to include:
76 #include <limits.h>
77 #include "minmax.h"
78 */
79
80 /* Maximum value of type OFFSET. */
81 #ifndef OFFSET_MAX
82 # define OFFSET_MAX \
83 ((((OFFSET) 1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
84 #endif
85
86 /* Default to no early abort. */
87 #ifndef EARLY_ABORT
88 # define EARLY_ABORT(ctxt) false
89 #endif
90
91 #ifndef NOTE_ORDERED
92 # define NOTE_ORDERED false
93 #endif
94
95 /* Use this to suppress gcc's "...may be used before initialized" warnings.
96 Beware: The Code argument must not contain commas. */
97 #if __GNUC__ + (__GNUC_MINOR__ >= 7) > 4
98 # pragma GCC diagnostic push
99 #endif
100 #ifndef IF_LINT
101 # if defined GCC_LINT || defined lint
102 # define IF_LINT(Code) Code
103 # else
104 # define IF_LINT(Code) /* empty */
105 # if __GNUC__ + (__GNUC_MINOR__ >= 7) > 4
106 # pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
107 # endif
108 # endif
109 #endif
110
111 /*
112 * Context of comparison operation.
113 */
114 struct context
115 {
116 #ifdef ELEMENT
117 /* Vectors being compared. */
118 ELEMENT const *xvec;
119 ELEMENT const *yvec;
120 #endif
121
122 /* Extra fields. */
123 EXTRA_CONTEXT_FIELDS
124
125 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
126 furthest along the given diagonal in the forward search of the edit
127 matrix. */
128 OFFSET *fdiag;
129
130 /* Vector, indexed by diagonal, containing the X coordinate of the point
131 furthest along the given diagonal in the backward search of the edit
132 matrix. */
133 OFFSET *bdiag;
134
135 #ifdef USE_HEURISTIC
136 /* This corresponds to the diff --speed-large-files flag. With this
137 heuristic, for vectors with a constant small density of changes,
138 the algorithm is linear in the vector size. */
139 bool heuristic;
140 #endif
141
142 /* Edit scripts longer than this are too expensive to compute. */
143 OFFSET too_expensive;
144
145 /* Snakes bigger than this are considered "big". */
146 #define SNAKE_LIMIT 20
147 };
148
149 struct partition
150 {
151 /* Midpoints of this partition. */
152 OFFSET xmid;
153 OFFSET ymid;
154
155 /* True if low half will be analyzed minimally. */
156 bool lo_minimal;
157
158 /* Likewise for high half. */
159 bool hi_minimal;
160 };
161
162
163 /* Find the midpoint of the shortest edit script for a specified portion
164 of the two vectors.
165
166 Scan from the beginnings of the vectors, and simultaneously from the ends,
167 doing a breadth-first search through the space of edit-sequence.
168 When the two searches meet, we have found the midpoint of the shortest
169 edit sequence.
170
171 If FIND_MINIMAL is true, find the minimal edit script regardless of
172 expense. Otherwise, if the search is too expensive, use heuristics to
173 stop the search and report a suboptimal answer.
174
175 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
176 XMID - YMID equals the number of inserted elements minus the number
177 of deleted elements (counting only elements before the midpoint).
178
179 Set PART->lo_minimal to true iff the minimal edit script for the
180 left half of the partition is known; similarly for PART->hi_minimal.
181
182 This function assumes that the first elements of the specified portions
183 of the two vectors do not match, and likewise that the last elements do not
184 match. The caller must trim matching elements from the beginning and end
185 of the portions it is going to specify.
186
187 If we return the "wrong" partitions, the worst this can do is cause
188 suboptimal diff output. It cannot cause incorrect diff output. */
189
190 static void
191 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
192 struct partition *part, struct context *ctxt)
193 {
194 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
195 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */
196 #ifdef ELEMENT
197 ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
198 ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
199 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
200 #else
201 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
202 #endif
203 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */
204 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */
205 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */
206 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
207 OFFSET fmin = fmid;
208 OFFSET fmax = fmid; /* Limits of top-down search. */
209 OFFSET bmin = bmid;
210 OFFSET bmax = bmid; /* Limits of bottom-up search. */
211 OFFSET c; /* Cost. */
212 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
213 diagonal with respect to the northwest. */
214
215 fd[fmid] = xoff;
216 bd[bmid] = xlim;
217
218 for (c = 1;; ++c)
219 {
220 OFFSET d; /* Active diagonal. */
221 bool big_snake = false;
222
223 /* Extend the top-down search by an edit step in each diagonal. */
224 if (fmin > dmin)
225 fd[--fmin - 1] = -1;
226 else
227 ++fmin;
228 if (fmax < dmax)
229 fd[++fmax + 1] = -1;
230 else
231 --fmax;
232 for (d = fmax; d >= fmin; d -= 2)
233 {
234 OFFSET x;
235 OFFSET y;
236 OFFSET tlo = fd[d - 1];
237 OFFSET thi = fd[d + 1];
238 OFFSET x0 = tlo < thi ? thi : tlo + 1;
239
240 for (x = x0, y = x0 - d;
241 x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
242 x++, y++)
243 continue;
244 if (x - x0 > SNAKE_LIMIT)
245 big_snake = true;
246 fd[d] = x;
247 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
248 {
249 part->xmid = x;
250 part->ymid = y;
251 part->lo_minimal = part->hi_minimal = true;
252 return;
253 }
254 }
255
256 /* Similarly extend the bottom-up search. */
257 if (bmin > dmin)
258 bd[--bmin - 1] = OFFSET_MAX;
259 else
260 ++bmin;
261 if (bmax < dmax)
262 bd[++bmax + 1] = OFFSET_MAX;
263 else
264 --bmax;
265 for (d = bmax; d >= bmin; d -= 2)
266 {
267 OFFSET x;
268 OFFSET y;
269 OFFSET tlo = bd[d - 1];
270 OFFSET thi = bd[d + 1];
271 OFFSET x0 = tlo < thi ? tlo : thi - 1;
272
273 for (x = x0, y = x0 - d;
274 xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
275 x--, y--)
276 continue;
277 if (x0 - x > SNAKE_LIMIT)
278 big_snake = true;
279 bd[d] = x;
280 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
281 {
282 part->xmid = x;
283 part->ymid = y;
284 part->lo_minimal = part->hi_minimal = true;
285 return;
286 }
287 }
288
289 if (find_minimal)
290 continue;
291
292 #ifdef USE_HEURISTIC
293 bool heuristic = ctxt->heuristic;
294 #else
295 bool heuristic = false;
296 #endif
297
298 /* Heuristic: check occasionally for a diagonal that has made lots
299 of progress compared with the edit distance. If we have any
300 such, find the one that has made the most progress and return it
301 as if it had succeeded.
302
303 With this heuristic, for vectors with a constant small density
304 of changes, the algorithm is linear in the vector size. */
305
306 if (200 < c && big_snake && heuristic)
307 {
308 {
309 OFFSET best = 0;
310
311 for (d = fmax; d >= fmin; d -= 2)
312 {
313 OFFSET dd = d - fmid;
314 OFFSET x = fd[d];
315 OFFSET y = x - d;
316 OFFSET v = (x - xoff) * 2 - dd;
317
318 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
319 {
320 if (v > best
321 && xoff + SNAKE_LIMIT <= x && x < xlim
322 && yoff + SNAKE_LIMIT <= y && y < ylim)
323 {
324 /* We have a good enough best diagonal; now insist
325 that it end with a significant snake. */
326 int k;
327
328 for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
329 if (k == SNAKE_LIMIT)
330 {
331 best = v;
332 part->xmid = x;
333 part->ymid = y;
334 break;
335 }
336 }
337 }
338 }
339 if (best > 0)
340 {
341 part->lo_minimal = true;
342 part->hi_minimal = false;
343 return;
344 }
345 }
346
347 {
348 OFFSET best = 0;
349
350 for (d = bmax; d >= bmin; d -= 2)
351 {
352 OFFSET dd = d - bmid;
353 OFFSET x = bd[d];
354 OFFSET y = x - d;
355 OFFSET v = (xlim - x) * 2 + dd;
356
357 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
358 {
359 if (v > best
360 && xoff < x && x <= xlim - SNAKE_LIMIT
361 && yoff < y && y <= ylim - SNAKE_LIMIT)
362 {
363 /* We have a good enough best diagonal; now insist
364 that it end with a significant snake. */
365 int k;
366
367 for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
368 if (k == SNAKE_LIMIT - 1)
369 {
370 best = v;
371 part->xmid = x;
372 part->ymid = y;
373 break;
374 }
375 }
376 }
377 }
378 if (best > 0)
379 {
380 part->lo_minimal = false;
381 part->hi_minimal = true;
382 return;
383 }
384 }
385 }
386
387 /* Heuristic: if we've gone well beyond the call of duty, give up
388 and report halfway between our best results so far. */
389 if (c >= ctxt->too_expensive)
390 {
391 OFFSET fxybest;
392 OFFSET fxbest IF_LINT (= 0);
393 OFFSET bxybest;
394 OFFSET bxbest IF_LINT (= 0);
395
396 /* Find forward diagonal that maximizes X + Y. */
397 fxybest = -1;
398 for (d = fmax; d >= fmin; d -= 2)
399 {
400 OFFSET x = MIN (fd[d], xlim);
401 OFFSET y = x - d;
402 if (ylim < y)
403 {
404 x = ylim + d;
405 y = ylim;
406 }
407 if (fxybest < x + y)
408 {
409 fxybest = x + y;
410 fxbest = x;
411 }
412 }
413
414 /* Find backward diagonal that minimizes X + Y. */
415 bxybest = OFFSET_MAX;
416 for (d = bmax; d >= bmin; d -= 2)
417 {
418 OFFSET x = MAX (xoff, bd[d]);
419 OFFSET y = x - d;
420 if (y < yoff)
421 {
422 x = yoff + d;
423 y = yoff;
424 }
425 if (x + y < bxybest)
426 {
427 bxybest = x + y;
428 bxbest = x;
429 }
430 }
431
432 /* Use the better of the two diagonals. */
433 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
434 {
435 part->xmid = fxbest;
436 part->ymid = fxybest - fxbest;
437 part->lo_minimal = true;
438 part->hi_minimal = false;
439 }
440 else
441 {
442 part->xmid = bxbest;
443 part->ymid = bxybest - bxbest;
444 part->lo_minimal = false;
445 part->hi_minimal = true;
446 }
447 return;
448 }
449 }
450 #undef XREF_YREF_EQUAL
451 }
452
453
454 /* Compare in detail contiguous subsequences of the two vectors
455 which are known, as a whole, to match each other.
456
457 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
458
459 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors
460 are origin-0.
461
462 If FIND_MINIMAL, find a minimal difference no matter how
463 expensive it is.
464
465 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
466
467 Return false if terminated normally, or true if terminated through early
468 abort. */
469
470 static bool
471 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
472 bool find_minimal, struct context *ctxt)
473 {
474 #ifdef ELEMENT
475 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
476 ELEMENT const *yv = ctxt->yvec;
477 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
478 #else
479 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
480 #endif
481
482 while (true)
483 {
484 /* Slide down the bottom initial diagonal. */
485 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
486 {
487 xoff++;
488 yoff++;
489 }
490
491 /* Slide up the top initial diagonal. */
492 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
493 {
494 xlim--;
495 ylim--;
496 }
497
498 /* Handle simple cases. */
499 if (xoff == xlim)
500 {
501 while (yoff < ylim)
502 {
503 NOTE_INSERT (ctxt, yoff);
504 if (EARLY_ABORT (ctxt))
505 return true;
506 yoff++;
507 }
508 break;
509 }
510 if (yoff == ylim)
511 {
512 while (xoff < xlim)
513 {
514 NOTE_DELETE (ctxt, xoff);
515 if (EARLY_ABORT (ctxt))
516 return true;
517 xoff++;
518 }
519 break;
520 }
521
522 struct partition part;
523
524 /* Find a point of correspondence in the middle of the vectors. */
525 diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
526
527 /* Use the partitions to split this problem into subproblems. */
528 OFFSET xoff1, xlim1, yoff1, ylim1, xoff2, xlim2, yoff2, ylim2;
529 bool find_minimal1, find_minimal2;
530 if (!NOTE_ORDERED
531 && ((xlim + ylim) - (part.xmid + part.ymid)
532 < (part.xmid + part.ymid) - (xoff + yoff)))
533 {
534 /* The second problem is smaller and the caller doesn't
535 care about order, so do the second problem first to
536 lessen recursion. */
537 xoff1 = part.xmid; xlim1 = xlim;
538 yoff1 = part.ymid; ylim1 = ylim;
539 find_minimal1 = part.hi_minimal;
540
541 xoff2 = xoff; xlim2 = part.xmid;
542 yoff2 = yoff; ylim2 = part.ymid;
543 find_minimal2 = part.lo_minimal;
544 }
545 else
546 {
547 xoff1 = xoff; xlim1 = part.xmid;
548 yoff1 = yoff; ylim1 = part.ymid;
549 find_minimal1 = part.lo_minimal;
550
551 xoff2 = part.xmid; xlim2 = xlim;
552 yoff2 = part.ymid; ylim2 = ylim;
553 find_minimal2 = part.hi_minimal;
554 }
555
556 /* Recurse to do one subproblem. */
557 bool early = compareseq (xoff1, xlim1, yoff1, ylim1, find_minimal1, ctxt);
558 if (early)
559 return early;
560
561 /* Iterate to do the other subproblem. */
562 xoff = xoff2; xlim = xlim2;
563 yoff = yoff2; ylim = ylim2;
564 find_minimal = find_minimal2;
565 }
566
567 return false;
568 #undef XREF_YREF_EQUAL
569 }
570
571 #if __GNUC__ + (__GNUC_MINOR__ >= 7) > 4
572 # pragma GCC diagnostic pop
573 #endif
574
575 #undef ELEMENT
576 #undef EQUAL
577 #undef OFFSET
578 #undef EXTRA_CONTEXT_FIELDS
579 #undef NOTE_DELETE
580 #undef NOTE_INSERT
581 #undef EARLY_ABORT
582 #undef USE_HEURISTIC
583 #undef XVECREF_YVECREF_EQUAL
584 #undef OFFSET_MAX