root/lib/diffseq.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes following definitions.
  1. diag
  2. compareseq

     1 /* Analyze differences between two vectors.
     2 
     3    Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2023 Free
     4    Software Foundation, Inc.
     5 
     6    This program is free software: you can redistribute it and/or modify
     7    it under the terms of the GNU General Public License as published by
     8    the Free Software Foundation, either version 3 of the License, or
     9    (at your option) any later version.
    10 
    11    This program is distributed in the hope that it will be useful,
    12    but WITHOUT ANY WARRANTY; without even the implied warranty of
    13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14    GNU General Public License for more details.
    15 
    16    You should have received a copy of the GNU General Public License
    17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
    18 
    19 
    20 /* The basic idea is to consider two vectors as similar if, when
    21    transforming the first vector into the second vector through a
    22    sequence of edits (inserts and deletes of one element each),
    23    this sequence is short - or equivalently, if the ordered list
    24    of elements that are untouched by these edits is long.  For a
    25    good introduction to the subject, read about the "Levenshtein
    26    distance" in Wikipedia.
    27 
    28    The basic algorithm is described in:
    29    "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
    30    Algorithmica Vol. 1, 1986, pp. 251-266,
    31    <https://doi.org/10.1007/BF01840446>.
    32    See especially section 4.2, which describes the variation used below.
    33 
    34    The basic algorithm was independently discovered as described in:
    35    "Algorithms for Approximate String Matching", Esko Ukkonen,
    36    Information and Control Vol. 64, 1985, pp. 100-118,
    37    <https://doi.org/10.1016/S0019-9958(85)80046-2>.
    38 
    39    Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
    40    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
    41    at the price of producing suboptimal output for large inputs with
    42    many differences.  */
    43 
    44 /* Before including this file, you need to define:
    45      ELEMENT                 The element type of the vectors being compared.
    46      EQUAL                   A two-argument macro that tests two elements for
    47                              equality.
    48      OFFSET                  A signed integer type sufficient to hold the
    49                              difference between two indices.  Usually
    50                              something like ptrdiff_t.
    51      EXTRA_CONTEXT_FIELDS    Declarations of fields for 'struct context'.
    52      NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
    53      NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
    54      NOTE_ORDERED            (Optional) A boolean expression saying that
    55                              NOTE_DELETE and NOTE_INSERT calls must be
    56                              issued in offset order.
    57      EARLY_ABORT(ctxt)       (Optional) A boolean expression that triggers an
    58                              early abort of the computation.
    59      USE_HEURISTIC           (Optional) Define if you want to support the
    60                              heuristic for large vectors.
    61 
    62    It is also possible to use this file with abstract arrays.  In this case,
    63    xvec and yvec are not represented in memory.  They only exist conceptually.
    64    In this case, the list of defines above is amended as follows:
    65      ELEMENT                 Undefined.
    66      EQUAL                   Undefined.
    67      XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
    68                              A three-argument macro: References xvec[xoff] and
    69                              yvec[yoff] and tests these elements for equality.
    70 
    71    Before including this file, you also need to include:
    72      #include <limits.h>
    73      #include "minmax.h"
    74  */
    75 
    76 /* Maximum value of type OFFSET.  */
    77 #define OFFSET_MAX \
    78   ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
    79 
    80 /* Default to no early abort.  */
    81 #ifndef EARLY_ABORT
    82 # define EARLY_ABORT(ctxt) false
    83 #endif
    84 
    85 #ifndef NOTE_ORDERED
    86 # define NOTE_ORDERED false
    87 #endif
    88 
    89 /* Use this to suppress gcc's "...may be used before initialized" warnings.
    90    Beware: The Code argument must not contain commas.  */
    91 #ifndef IF_LINT
    92 # if defined GCC_LINT || defined lint
    93 #  define IF_LINT(Code) Code
    94 # else
    95 #  define IF_LINT(Code) /* empty */
    96 # endif
    97 #endif
    98 
    99 /*
   100  * Context of comparison operation.
   101  */
   102 struct context
   103 {
   104   #ifdef ELEMENT
   105   /* Vectors being compared.  */
   106   ELEMENT const *xvec;
   107   ELEMENT const *yvec;
   108   #endif
   109 
   110   /* Extra fields.  */
   111   EXTRA_CONTEXT_FIELDS
   112 
   113   /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
   114      furthest along the given diagonal in the forward search of the edit
   115      matrix.  */
   116   OFFSET *fdiag;
   117 
   118   /* Vector, indexed by diagonal, containing the X coordinate of the point
   119      furthest along the given diagonal in the backward search of the edit
   120      matrix.  */
   121   OFFSET *bdiag;
   122 
   123   #ifdef USE_HEURISTIC
   124   /* This corresponds to the diff --speed-large-files flag.  With this
   125      heuristic, for vectors with a constant small density of changes,
   126      the algorithm is linear in the vector size.  */
   127   bool heuristic;
   128   #endif
   129 
   130   /* Edit scripts longer than this are too expensive to compute.  */
   131   OFFSET too_expensive;
   132 
   133   /* Snakes bigger than this are considered "big".  */
   134   #define SNAKE_LIMIT 20
   135 };
   136 
   137 struct partition
   138 {
   139   /* Midpoints of this partition.  */
   140   OFFSET xmid;
   141   OFFSET ymid;
   142 
   143   /* True if low half will be analyzed minimally.  */
   144   bool lo_minimal;
   145 
   146   /* Likewise for high half.  */
   147   bool hi_minimal;
   148 };
   149 
   150 
   151 /* Find the midpoint of the shortest edit script for a specified portion
   152    of the two vectors.
   153 
   154    Scan from the beginnings of the vectors, and simultaneously from the ends,
   155    doing a breadth-first search through the space of edit-sequence.
   156    When the two searches meet, we have found the midpoint of the shortest
   157    edit sequence.
   158 
   159    If FIND_MINIMAL is true, find the minimal edit script regardless of
   160    expense.  Otherwise, if the search is too expensive, use heuristics to
   161    stop the search and report a suboptimal answer.
   162 
   163    Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
   164    XMID - YMID equals the number of inserted elements minus the number
   165    of deleted elements (counting only elements before the midpoint).
   166 
   167    Set PART->lo_minimal to true iff the minimal edit script for the
   168    left half of the partition is known; similarly for PART->hi_minimal.
   169 
   170    This function assumes that the first elements of the specified portions
   171    of the two vectors do not match, and likewise that the last elements do not
   172    match.  The caller must trim matching elements from the beginning and end
   173    of the portions it is going to specify.
   174 
   175    If we return the "wrong" partitions, the worst this can do is cause
   176    suboptimal diff output.  It cannot cause incorrect diff output.  */
   177 
   178 static void
   179 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
   180       struct partition *part, struct context *ctxt)
   181 {
   182   OFFSET *const fd = ctxt->fdiag;       /* Give the compiler a chance. */
   183   OFFSET *const bd = ctxt->bdiag;       /* Additional help for the compiler. */
   184 #ifdef ELEMENT
   185   ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
   186   ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
   187   #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
   188 #else
   189   #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
   190 #endif
   191   const OFFSET dmin = xoff - ylim;      /* Minimum valid diagonal. */
   192   const OFFSET dmax = xlim - yoff;      /* Maximum valid diagonal. */
   193   const OFFSET fmid = xoff - yoff;      /* Center diagonal of top-down search. */
   194   const OFFSET bmid = xlim - ylim;      /* Center diagonal of bottom-up search. */
   195   OFFSET fmin = fmid;
   196   OFFSET fmax = fmid;           /* Limits of top-down search. */
   197   OFFSET bmin = bmid;
   198   OFFSET bmax = bmid;           /* Limits of bottom-up search. */
   199   OFFSET c;                     /* Cost. */
   200   bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
   201                                    diagonal with respect to the northwest. */
   202 
   203   fd[fmid] = xoff;
   204   bd[bmid] = xlim;
   205 
   206   for (c = 1;; ++c)
   207     {
   208       OFFSET d;                 /* Active diagonal. */
   209       bool big_snake = false;
   210 
   211       /* Extend the top-down search by an edit step in each diagonal. */
   212       if (fmin > dmin)
   213         fd[--fmin - 1] = -1;
   214       else
   215         ++fmin;
   216       if (fmax < dmax)
   217         fd[++fmax + 1] = -1;
   218       else
   219         --fmax;
   220       for (d = fmax; d >= fmin; d -= 2)
   221         {
   222           OFFSET x;
   223           OFFSET y;
   224           OFFSET tlo = fd[d - 1];
   225           OFFSET thi = fd[d + 1];
   226           OFFSET x0 = tlo < thi ? thi : tlo + 1;
   227 
   228           for (x = x0, y = x0 - d;
   229                x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
   230                x++, y++)
   231             continue;
   232           if (x - x0 > SNAKE_LIMIT)
   233             big_snake = true;
   234           fd[d] = x;
   235           if (odd && bmin <= d && d <= bmax && bd[d] <= x)
   236             {
   237               part->xmid = x;
   238               part->ymid = y;
   239               part->lo_minimal = part->hi_minimal = true;
   240               return;
   241             }
   242         }
   243 
   244       /* Similarly extend the bottom-up search.  */
   245       if (bmin > dmin)
   246         bd[--bmin - 1] = OFFSET_MAX;
   247       else
   248         ++bmin;
   249       if (bmax < dmax)
   250         bd[++bmax + 1] = OFFSET_MAX;
   251       else
   252         --bmax;
   253       for (d = bmax; d >= bmin; d -= 2)
   254         {
   255           OFFSET x;
   256           OFFSET y;
   257           OFFSET tlo = bd[d - 1];
   258           OFFSET thi = bd[d + 1];
   259           OFFSET x0 = tlo < thi ? tlo : thi - 1;
   260 
   261           for (x = x0, y = x0 - d;
   262                xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
   263                x--, y--)
   264             continue;
   265           if (x0 - x > SNAKE_LIMIT)
   266             big_snake = true;
   267           bd[d] = x;
   268           if (!odd && fmin <= d && d <= fmax && x <= fd[d])
   269             {
   270               part->xmid = x;
   271               part->ymid = y;
   272               part->lo_minimal = part->hi_minimal = true;
   273               return;
   274             }
   275         }
   276 
   277       if (find_minimal)
   278         continue;
   279 
   280 #ifdef USE_HEURISTIC
   281       bool heuristic = ctxt->heuristic;
   282 #else
   283       bool heuristic = false;
   284 #endif
   285 
   286       /* Heuristic: check occasionally for a diagonal that has made lots
   287          of progress compared with the edit distance.  If we have any
   288          such, find the one that has made the most progress and return it
   289          as if it had succeeded.
   290 
   291          With this heuristic, for vectors with a constant small density
   292          of changes, the algorithm is linear in the vector size.  */
   293 
   294       if (200 < c && big_snake && heuristic)
   295         {
   296           {
   297             OFFSET best = 0;
   298 
   299             for (d = fmax; d >= fmin; d -= 2)
   300               {
   301                 OFFSET dd = d - fmid;
   302                 OFFSET x = fd[d];
   303                 OFFSET y = x - d;
   304                 OFFSET v = (x - xoff) * 2 - dd;
   305 
   306                 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
   307                   {
   308                     if (v > best
   309                         && xoff + SNAKE_LIMIT <= x && x < xlim
   310                         && yoff + SNAKE_LIMIT <= y && y < ylim)
   311                       {
   312                         /* We have a good enough best diagonal; now insist
   313                            that it end with a significant snake.  */
   314                         int k;
   315 
   316                         for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
   317                           if (k == SNAKE_LIMIT)
   318                             {
   319                               best = v;
   320                               part->xmid = x;
   321                               part->ymid = y;
   322                               break;
   323                             }
   324                       }
   325                   }
   326               }
   327             if (best > 0)
   328               {
   329                 part->lo_minimal = true;
   330                 part->hi_minimal = false;
   331                 return;
   332               }
   333           }
   334 
   335           {
   336             OFFSET best = 0;
   337 
   338             for (d = bmax; d >= bmin; d -= 2)
   339               {
   340                 OFFSET dd = d - bmid;
   341                 OFFSET x = bd[d];
   342                 OFFSET y = x - d;
   343                 OFFSET v = (xlim - x) * 2 + dd;
   344 
   345                 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
   346                   {
   347                     if (v > best
   348                         && xoff < x && x <= xlim - SNAKE_LIMIT
   349                         && yoff < y && y <= ylim - SNAKE_LIMIT)
   350                       {
   351                         /* We have a good enough best diagonal; now insist
   352                            that it end with a significant snake.  */
   353                         int k;
   354 
   355                         for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
   356                           if (k == SNAKE_LIMIT - 1)
   357                             {
   358                               best = v;
   359                               part->xmid = x;
   360                               part->ymid = y;
   361                               break;
   362                             }
   363                       }
   364                   }
   365               }
   366             if (best > 0)
   367               {
   368                 part->lo_minimal = false;
   369                 part->hi_minimal = true;
   370                 return;
   371               }
   372           }
   373         }
   374 
   375       /* Heuristic: if we've gone well beyond the call of duty, give up
   376          and report halfway between our best results so far.  */
   377       if (c >= ctxt->too_expensive)
   378         {
   379           OFFSET fxybest;
   380           OFFSET fxbest IF_LINT (= 0);
   381           OFFSET bxybest;
   382           OFFSET bxbest IF_LINT (= 0);
   383 
   384           /* Find forward diagonal that maximizes X + Y.  */
   385           fxybest = -1;
   386           for (d = fmax; d >= fmin; d -= 2)
   387             {
   388               OFFSET x = MIN (fd[d], xlim);
   389               OFFSET y = x - d;
   390               if (ylim < y)
   391                 {
   392                   x = ylim + d;
   393                   y = ylim;
   394                 }
   395               if (fxybest < x + y)
   396                 {
   397                   fxybest = x + y;
   398                   fxbest = x;
   399                 }
   400             }
   401 
   402           /* Find backward diagonal that minimizes X + Y.  */
   403           bxybest = OFFSET_MAX;
   404           for (d = bmax; d >= bmin; d -= 2)
   405             {
   406               OFFSET x = MAX (xoff, bd[d]);
   407               OFFSET y = x - d;
   408               if (y < yoff)
   409                 {
   410                   x = yoff + d;
   411                   y = yoff;
   412                 }
   413               if (x + y < bxybest)
   414                 {
   415                   bxybest = x + y;
   416                   bxbest = x;
   417                 }
   418             }
   419 
   420           /* Use the better of the two diagonals.  */
   421           if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
   422             {
   423               part->xmid = fxbest;
   424               part->ymid = fxybest - fxbest;
   425               part->lo_minimal = true;
   426               part->hi_minimal = false;
   427             }
   428           else
   429             {
   430               part->xmid = bxbest;
   431               part->ymid = bxybest - bxbest;
   432               part->lo_minimal = false;
   433               part->hi_minimal = true;
   434             }
   435           return;
   436         }
   437     }
   438   #undef XREF_YREF_EQUAL
   439 }
   440 
   441 
   442 /* Compare in detail contiguous subsequences of the two vectors
   443    which are known, as a whole, to match each other.
   444 
   445    The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
   446 
   447    Note that XLIM, YLIM are exclusive bounds.  All indices into the vectors
   448    are origin-0.
   449 
   450    If FIND_MINIMAL, find a minimal difference no matter how
   451    expensive it is.
   452 
   453    The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
   454 
   455    Return false if terminated normally, or true if terminated through early
   456    abort.  */
   457 
   458 static bool
   459 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
   460             bool find_minimal, struct context *ctxt)
   461 {
   462 #ifdef ELEMENT
   463   ELEMENT const *xv = ctxt->xvec; /* Help the compiler.  */
   464   ELEMENT const *yv = ctxt->yvec;
   465   #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
   466 #else
   467   #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
   468 #endif
   469 
   470   while (true)
   471     {
   472       /* Slide down the bottom initial diagonal.  */
   473       while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
   474         {
   475           xoff++;
   476           yoff++;
   477         }
   478 
   479       /* Slide up the top initial diagonal. */
   480       while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
   481         {
   482           xlim--;
   483           ylim--;
   484         }
   485 
   486       /* Handle simple cases. */
   487       if (xoff == xlim)
   488         {
   489           while (yoff < ylim)
   490             {
   491               NOTE_INSERT (ctxt, yoff);
   492               if (EARLY_ABORT (ctxt))
   493                 return true;
   494               yoff++;
   495             }
   496           break;
   497         }
   498       if (yoff == ylim)
   499         {
   500           while (xoff < xlim)
   501             {
   502               NOTE_DELETE (ctxt, xoff);
   503               if (EARLY_ABORT (ctxt))
   504                 return true;
   505               xoff++;
   506             }
   507           break;
   508         }
   509 
   510       struct partition part;
   511 
   512       /* Find a point of correspondence in the middle of the vectors.  */
   513       diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
   514 
   515       /* Use the partitions to split this problem into subproblems.  */
   516       OFFSET xoff1, xlim1, yoff1, ylim1, xoff2, xlim2, yoff2, ylim2;
   517       bool find_minimal1, find_minimal2;
   518       if (!NOTE_ORDERED
   519           && ((xlim + ylim) - (part.xmid + part.ymid)
   520               < (part.xmid + part.ymid) - (xoff + yoff)))
   521         {
   522           /* The second problem is smaller and the caller doesn't
   523              care about order, so do the second problem first to
   524              lessen recursion.  */
   525           xoff1 = part.xmid; xlim1 = xlim;
   526           yoff1 = part.ymid; ylim1 = ylim;
   527           find_minimal1 = part.hi_minimal;
   528 
   529           xoff2 = xoff; xlim2 = part.xmid;
   530           yoff2 = yoff; ylim2 = part.ymid;
   531           find_minimal2 = part.lo_minimal;
   532         }
   533       else
   534         {
   535           xoff1 = xoff; xlim1 = part.xmid;
   536           yoff1 = yoff; ylim1 = part.ymid;
   537           find_minimal1 = part.lo_minimal;
   538 
   539           xoff2 = part.xmid; xlim2 = xlim;
   540           yoff2 = part.ymid; ylim2 = ylim;
   541           find_minimal2 = part.hi_minimal;
   542         }
   543 
   544       /* Recurse to do one subproblem.  */
   545       bool early = compareseq (xoff1, xlim1, yoff1, ylim1, find_minimal1, ctxt);
   546       if (early)
   547         return early;
   548 
   549       /* Iterate to do the other subproblem.  */
   550       xoff = xoff2; xlim = xlim2;
   551       yoff = yoff2; ylim = ylim2;
   552       find_minimal = find_minimal2;
   553     }
   554 
   555   return false;
   556   #undef XREF_YREF_EQUAL
   557 }
   558 
   559 #undef ELEMENT
   560 #undef EQUAL
   561 #undef OFFSET
   562 #undef EXTRA_CONTEXT_FIELDS
   563 #undef NOTE_DELETE
   564 #undef NOTE_INSERT
   565 #undef EARLY_ABORT
   566 #undef USE_HEURISTIC
   567 #undef XVECREF_YVECREF_EQUAL
   568 #undef OFFSET_MAX

/* [<][>][^][v][top][bottom][index][help] */