1 /* Analyze differences between two vectors.
2
3 Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2023 Free
4 Software Foundation, Inc.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19
20 /* The basic idea is to consider two vectors as similar if, when
21 transforming the first vector into the second vector through a
22 sequence of edits (inserts and deletes of one element each),
23 this sequence is short - or equivalently, if the ordered list
24 of elements that are untouched by these edits is long. For a
25 good introduction to the subject, read about the "Levenshtein
26 distance" in Wikipedia.
27
28 The basic algorithm is described in:
29 "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
30 Algorithmica Vol. 1, 1986, pp. 251-266,
31 <https://doi.org/10.1007/BF01840446>.
32 See especially section 4.2, which describes the variation used below.
33
34 The basic algorithm was independently discovered as described in:
35 "Algorithms for Approximate String Matching", Esko Ukkonen,
36 Information and Control Vol. 64, 1985, pp. 100-118,
37 <https://doi.org/10.1016/S0019-9958(85)80046-2>.
38
39 Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
40 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
41 at the price of producing suboptimal output for large inputs with
42 many differences. */
43
44 /* Before including this file, you need to define:
45 ELEMENT The element type of the vectors being compared.
46 EQUAL A two-argument macro that tests two elements for
47 equality.
48 OFFSET A signed integer type sufficient to hold the
49 difference between two indices. Usually
50 something like ptrdiff_t.
51 EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'.
52 NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
53 NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
54 NOTE_ORDERED (Optional) A boolean expression saying that
55 NOTE_DELETE and NOTE_INSERT calls must be
56 issued in offset order.
57 EARLY_ABORT(ctxt) (Optional) A boolean expression that triggers an
58 early abort of the computation.
59 USE_HEURISTIC (Optional) Define if you want to support the
60 heuristic for large vectors.
61
62 It is also possible to use this file with abstract arrays. In this case,
63 xvec and yvec are not represented in memory. They only exist conceptually.
64 In this case, the list of defines above is amended as follows:
65 ELEMENT Undefined.
66 EQUAL Undefined.
67 XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
68 A three-argument macro: References xvec[xoff] and
69 yvec[yoff] and tests these elements for equality.
70
71 Before including this file, you also need to include:
72 #include <limits.h>
73 #include "minmax.h"
74 */
75
76 /* Maximum value of type OFFSET. */
77 #define OFFSET_MAX \
78 ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
79
80 /* Default to no early abort. */
81 #ifndef EARLY_ABORT
82 # define EARLY_ABORT(ctxt) false
83 #endif
84
85 #ifndef NOTE_ORDERED
86 # define NOTE_ORDERED false
87 #endif
88
89 /* Use this to suppress gcc's "...may be used before initialized" warnings.
90 Beware: The Code argument must not contain commas. */
91 #ifndef IF_LINT
92 # if defined GCC_LINT || defined lint
93 # define IF_LINT(Code) Code
94 # else
95 # define IF_LINT(Code) /* empty */
96 # endif
97 #endif
98
99 /*
100 * Context of comparison operation.
101 */
102 struct context
103 {
104 #ifdef ELEMENT
105 /* Vectors being compared. */
106 ELEMENT const *xvec;
107 ELEMENT const *yvec;
108 #endif
109
110 /* Extra fields. */
111 EXTRA_CONTEXT_FIELDS
112
113 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
114 furthest along the given diagonal in the forward search of the edit
115 matrix. */
116 OFFSET *fdiag;
117
118 /* Vector, indexed by diagonal, containing the X coordinate of the point
119 furthest along the given diagonal in the backward search of the edit
120 matrix. */
121 OFFSET *bdiag;
122
123 #ifdef USE_HEURISTIC
124 /* This corresponds to the diff --speed-large-files flag. With this
125 heuristic, for vectors with a constant small density of changes,
126 the algorithm is linear in the vector size. */
127 bool heuristic;
128 #endif
129
130 /* Edit scripts longer than this are too expensive to compute. */
131 OFFSET too_expensive;
132
133 /* Snakes bigger than this are considered "big". */
134 #define SNAKE_LIMIT 20
135 };
136
137 struct partition
138 {
139 /* Midpoints of this partition. */
140 OFFSET xmid;
141 OFFSET ymid;
142
143 /* True if low half will be analyzed minimally. */
144 bool lo_minimal;
145
146 /* Likewise for high half. */
147 bool hi_minimal;
148 };
149
150
151 /* Find the midpoint of the shortest edit script for a specified portion
152 of the two vectors.
153
154 Scan from the beginnings of the vectors, and simultaneously from the ends,
155 doing a breadth-first search through the space of edit-sequence.
156 When the two searches meet, we have found the midpoint of the shortest
157 edit sequence.
158
159 If FIND_MINIMAL is true, find the minimal edit script regardless of
160 expense. Otherwise, if the search is too expensive, use heuristics to
161 stop the search and report a suboptimal answer.
162
163 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
164 XMID - YMID equals the number of inserted elements minus the number
165 of deleted elements (counting only elements before the midpoint).
166
167 Set PART->lo_minimal to true iff the minimal edit script for the
168 left half of the partition is known; similarly for PART->hi_minimal.
169
170 This function assumes that the first elements of the specified portions
171 of the two vectors do not match, and likewise that the last elements do not
172 match. The caller must trim matching elements from the beginning and end
173 of the portions it is going to specify.
174
175 If we return the "wrong" partitions, the worst this can do is cause
176 suboptimal diff output. It cannot cause incorrect diff output. */
177
178 static void
179 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
180 struct partition *part, struct context *ctxt)
181 {
182 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
183 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */
184 #ifdef ELEMENT
185 ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
186 ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
187 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
188 #else
189 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
190 #endif
191 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */
192 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */
193 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */
194 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
195 OFFSET fmin = fmid;
196 OFFSET fmax = fmid; /* Limits of top-down search. */
197 OFFSET bmin = bmid;
198 OFFSET bmax = bmid; /* Limits of bottom-up search. */
199 OFFSET c; /* Cost. */
200 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
201 diagonal with respect to the northwest. */
202
203 fd[fmid] = xoff;
204 bd[bmid] = xlim;
205
206 for (c = 1;; ++c)
207 {
208 OFFSET d; /* Active diagonal. */
209 bool big_snake = false;
210
211 /* Extend the top-down search by an edit step in each diagonal. */
212 if (fmin > dmin)
213 fd[--fmin - 1] = -1;
214 else
215 ++fmin;
216 if (fmax < dmax)
217 fd[++fmax + 1] = -1;
218 else
219 --fmax;
220 for (d = fmax; d >= fmin; d -= 2)
221 {
222 OFFSET x;
223 OFFSET y;
224 OFFSET tlo = fd[d - 1];
225 OFFSET thi = fd[d + 1];
226 OFFSET x0 = tlo < thi ? thi : tlo + 1;
227
228 for (x = x0, y = x0 - d;
229 x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
230 x++, y++)
231 continue;
232 if (x - x0 > SNAKE_LIMIT)
233 big_snake = true;
234 fd[d] = x;
235 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
236 {
237 part->xmid = x;
238 part->ymid = y;
239 part->lo_minimal = part->hi_minimal = true;
240 return;
241 }
242 }
243
244 /* Similarly extend the bottom-up search. */
245 if (bmin > dmin)
246 bd[--bmin - 1] = OFFSET_MAX;
247 else
248 ++bmin;
249 if (bmax < dmax)
250 bd[++bmax + 1] = OFFSET_MAX;
251 else
252 --bmax;
253 for (d = bmax; d >= bmin; d -= 2)
254 {
255 OFFSET x;
256 OFFSET y;
257 OFFSET tlo = bd[d - 1];
258 OFFSET thi = bd[d + 1];
259 OFFSET x0 = tlo < thi ? tlo : thi - 1;
260
261 for (x = x0, y = x0 - d;
262 xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
263 x--, y--)
264 continue;
265 if (x0 - x > SNAKE_LIMIT)
266 big_snake = true;
267 bd[d] = x;
268 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
269 {
270 part->xmid = x;
271 part->ymid = y;
272 part->lo_minimal = part->hi_minimal = true;
273 return;
274 }
275 }
276
277 if (find_minimal)
278 continue;
279
280 #ifdef USE_HEURISTIC
281 bool heuristic = ctxt->heuristic;
282 #else
283 bool heuristic = false;
284 #endif
285
286 /* Heuristic: check occasionally for a diagonal that has made lots
287 of progress compared with the edit distance. If we have any
288 such, find the one that has made the most progress and return it
289 as if it had succeeded.
290
291 With this heuristic, for vectors with a constant small density
292 of changes, the algorithm is linear in the vector size. */
293
294 if (200 < c && big_snake && heuristic)
295 {
296 {
297 OFFSET best = 0;
298
299 for (d = fmax; d >= fmin; d -= 2)
300 {
301 OFFSET dd = d - fmid;
302 OFFSET x = fd[d];
303 OFFSET y = x - d;
304 OFFSET v = (x - xoff) * 2 - dd;
305
306 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
307 {
308 if (v > best
309 && xoff + SNAKE_LIMIT <= x && x < xlim
310 && yoff + SNAKE_LIMIT <= y && y < ylim)
311 {
312 /* We have a good enough best diagonal; now insist
313 that it end with a significant snake. */
314 int k;
315
316 for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
317 if (k == SNAKE_LIMIT)
318 {
319 best = v;
320 part->xmid = x;
321 part->ymid = y;
322 break;
323 }
324 }
325 }
326 }
327 if (best > 0)
328 {
329 part->lo_minimal = true;
330 part->hi_minimal = false;
331 return;
332 }
333 }
334
335 {
336 OFFSET best = 0;
337
338 for (d = bmax; d >= bmin; d -= 2)
339 {
340 OFFSET dd = d - bmid;
341 OFFSET x = bd[d];
342 OFFSET y = x - d;
343 OFFSET v = (xlim - x) * 2 + dd;
344
345 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
346 {
347 if (v > best
348 && xoff < x && x <= xlim - SNAKE_LIMIT
349 && yoff < y && y <= ylim - SNAKE_LIMIT)
350 {
351 /* We have a good enough best diagonal; now insist
352 that it end with a significant snake. */
353 int k;
354
355 for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
356 if (k == SNAKE_LIMIT - 1)
357 {
358 best = v;
359 part->xmid = x;
360 part->ymid = y;
361 break;
362 }
363 }
364 }
365 }
366 if (best > 0)
367 {
368 part->lo_minimal = false;
369 part->hi_minimal = true;
370 return;
371 }
372 }
373 }
374
375 /* Heuristic: if we've gone well beyond the call of duty, give up
376 and report halfway between our best results so far. */
377 if (c >= ctxt->too_expensive)
378 {
379 OFFSET fxybest;
380 OFFSET fxbest IF_LINT (= 0);
381 OFFSET bxybest;
382 OFFSET bxbest IF_LINT (= 0);
383
384 /* Find forward diagonal that maximizes X + Y. */
385 fxybest = -1;
386 for (d = fmax; d >= fmin; d -= 2)
387 {
388 OFFSET x = MIN (fd[d], xlim);
389 OFFSET y = x - d;
390 if (ylim < y)
391 {
392 x = ylim + d;
393 y = ylim;
394 }
395 if (fxybest < x + y)
396 {
397 fxybest = x + y;
398 fxbest = x;
399 }
400 }
401
402 /* Find backward diagonal that minimizes X + Y. */
403 bxybest = OFFSET_MAX;
404 for (d = bmax; d >= bmin; d -= 2)
405 {
406 OFFSET x = MAX (xoff, bd[d]);
407 OFFSET y = x - d;
408 if (y < yoff)
409 {
410 x = yoff + d;
411 y = yoff;
412 }
413 if (x + y < bxybest)
414 {
415 bxybest = x + y;
416 bxbest = x;
417 }
418 }
419
420 /* Use the better of the two diagonals. */
421 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
422 {
423 part->xmid = fxbest;
424 part->ymid = fxybest - fxbest;
425 part->lo_minimal = true;
426 part->hi_minimal = false;
427 }
428 else
429 {
430 part->xmid = bxbest;
431 part->ymid = bxybest - bxbest;
432 part->lo_minimal = false;
433 part->hi_minimal = true;
434 }
435 return;
436 }
437 }
438 #undef XREF_YREF_EQUAL
439 }
440
441
442 /* Compare in detail contiguous subsequences of the two vectors
443 which are known, as a whole, to match each other.
444
445 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
446
447 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors
448 are origin-0.
449
450 If FIND_MINIMAL, find a minimal difference no matter how
451 expensive it is.
452
453 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
454
455 Return false if terminated normally, or true if terminated through early
456 abort. */
457
458 static bool
459 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
460 bool find_minimal, struct context *ctxt)
461 {
462 #ifdef ELEMENT
463 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
464 ELEMENT const *yv = ctxt->yvec;
465 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
466 #else
467 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
468 #endif
469
470 while (true)
471 {
472 /* Slide down the bottom initial diagonal. */
473 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
474 {
475 xoff++;
476 yoff++;
477 }
478
479 /* Slide up the top initial diagonal. */
480 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
481 {
482 xlim--;
483 ylim--;
484 }
485
486 /* Handle simple cases. */
487 if (xoff == xlim)
488 {
489 while (yoff < ylim)
490 {
491 NOTE_INSERT (ctxt, yoff);
492 if (EARLY_ABORT (ctxt))
493 return true;
494 yoff++;
495 }
496 break;
497 }
498 if (yoff == ylim)
499 {
500 while (xoff < xlim)
501 {
502 NOTE_DELETE (ctxt, xoff);
503 if (EARLY_ABORT (ctxt))
504 return true;
505 xoff++;
506 }
507 break;
508 }
509
510 struct partition part;
511
512 /* Find a point of correspondence in the middle of the vectors. */
513 diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
514
515 /* Use the partitions to split this problem into subproblems. */
516 OFFSET xoff1, xlim1, yoff1, ylim1, xoff2, xlim2, yoff2, ylim2;
517 bool find_minimal1, find_minimal2;
518 if (!NOTE_ORDERED
519 && ((xlim + ylim) - (part.xmid + part.ymid)
520 < (part.xmid + part.ymid) - (xoff + yoff)))
521 {
522 /* The second problem is smaller and the caller doesn't
523 care about order, so do the second problem first to
524 lessen recursion. */
525 xoff1 = part.xmid; xlim1 = xlim;
526 yoff1 = part.ymid; ylim1 = ylim;
527 find_minimal1 = part.hi_minimal;
528
529 xoff2 = xoff; xlim2 = part.xmid;
530 yoff2 = yoff; ylim2 = part.ymid;
531 find_minimal2 = part.lo_minimal;
532 }
533 else
534 {
535 xoff1 = xoff; xlim1 = part.xmid;
536 yoff1 = yoff; ylim1 = part.ymid;
537 find_minimal1 = part.lo_minimal;
538
539 xoff2 = part.xmid; xlim2 = xlim;
540 yoff2 = part.ymid; ylim2 = ylim;
541 find_minimal2 = part.hi_minimal;
542 }
543
544 /* Recurse to do one subproblem. */
545 bool early = compareseq (xoff1, xlim1, yoff1, ylim1, find_minimal1, ctxt);
546 if (early)
547 return early;
548
549 /* Iterate to do the other subproblem. */
550 xoff = xoff2; xlim = xlim2;
551 yoff = yoff2; ylim = ylim2;
552 find_minimal = find_minimal2;
553 }
554
555 return false;
556 #undef XREF_YREF_EQUAL
557 }
558
559 #undef ELEMENT
560 #undef EQUAL
561 #undef OFFSET
562 #undef EXTRA_CONTEXT_FIELDS
563 #undef NOTE_DELETE
564 #undef NOTE_INSERT
565 #undef EARLY_ABORT
566 #undef USE_HEURISTIC
567 #undef XVECREF_YVECREF_EQUAL
568 #undef OFFSET_MAX