root/lib/diffseq.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes following definitions.
  1. diag
  2. compareseq

     1 /* Analyze differences between two vectors.
     2 
     3    Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2023 Free Software
     4    Foundation, Inc.
     5 
     6    This program is free software: you can redistribute it and/or modify
     7    it under the terms of the GNU General Public License as published by
     8    the Free Software Foundation, either version 3 of the License, or
     9    (at your option) any later version.
    10 
    11    This program is distributed in the hope that it will be useful,
    12    but WITHOUT ANY WARRANTY; without even the implied warranty of
    13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14    GNU General Public License for more details.
    15 
    16    You should have received a copy of the GNU General Public License
    17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
    18 
    19 
    20 /* The basic idea is to consider two vectors as similar if, when
    21    transforming the first vector into the second vector through a
    22    sequence of edits (inserts and deletes of one element each),
    23    this sequence is short - or equivalently, if the ordered list
    24    of elements that are untouched by these edits is long.  For a
    25    good introduction to the subject, read about the "Levenshtein
    26    distance" in Wikipedia.
    27 
    28    The basic algorithm is described in:
    29    "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
    30    Algorithmica Vol. 1, 1986, pp. 251-266,
    31    <https://doi.org/10.1007/BF01840446>.
    32    See especially section 4.2, which describes the variation used below.
    33 
    34    The basic algorithm was independently discovered as described in:
    35    "Algorithms for Approximate String Matching", Esko Ukkonen,
    36    Information and Control Vol. 64, 1985, pp. 100-118,
    37    <https://doi.org/10.1016/S0019-9958(85)80046-2>.
    38 
    39    Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
    40    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
    41    at the price of producing suboptimal output for large inputs with
    42    many differences.  */
    43 
    44 /* Before including this file, you need to define:
    45      ELEMENT                 The element type of the vectors being compared.
    46      EQUAL                   A two-argument macro that tests two elements for
    47                              equality.
    48      OFFSET                  A signed integer type sufficient to hold the
    49                              difference between two indices.  Usually
    50                              something like ptrdiff_t.
    51      OFFSET_MAX              (Optional) The maximum value of OFFSET (e.g.,
    52                              PTRDIFF_MAX).  If omitted, it is inferred in a
    53                              way portable to the vast majority of C platforms,
    54                              as they lack padding bits.
    55      EXTRA_CONTEXT_FIELDS    Declarations of fields for 'struct context'.
    56      NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
    57      NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
    58      NOTE_ORDERED            (Optional) A boolean expression saying that
    59                              NOTE_DELETE and NOTE_INSERT calls must be
    60                              issued in offset order.
    61      EARLY_ABORT(ctxt)       (Optional) A boolean expression that triggers an
    62                              early abort of the computation.
    63      USE_HEURISTIC           (Optional) Define if you want to support the
    64                              heuristic for large vectors.
    65 
    66    It is also possible to use this file with abstract arrays.  In this case,
    67    xvec and yvec are not represented in memory.  They only exist conceptually.
    68    In this case, the list of defines above is amended as follows:
    69      ELEMENT                 Undefined.
    70      EQUAL                   Undefined.
    71      XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
    72                              A three-argument macro: References xvec[xoff] and
    73                              yvec[yoff] and tests these elements for equality.
    74 
    75    Before including this file, you also need to include:
    76      #include <limits.h>
    77      #include "minmax.h"
    78  */
    79 
    80 /* Maximum value of type OFFSET.  */
    81 #ifndef OFFSET_MAX
    82 # define OFFSET_MAX \
    83    ((((OFFSET) 1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
    84 #endif
    85 
    86 /* Default to no early abort.  */
    87 #ifndef EARLY_ABORT
    88 # define EARLY_ABORT(ctxt) false
    89 #endif
    90 
    91 #ifndef NOTE_ORDERED
    92 # define NOTE_ORDERED false
    93 #endif
    94 
    95 /* Use this to suppress gcc's "...may be used before initialized" warnings.
    96    Beware: The Code argument must not contain commas.  */
    97 #if __GNUC__ + (__GNUC_MINOR__ >= 7) > 4
    98 # pragma GCC diagnostic push
    99 #endif
   100 #ifndef IF_LINT
   101 # if defined GCC_LINT || defined lint
   102 #  define IF_LINT(Code) Code
   103 # else
   104 #  define IF_LINT(Code) /* empty */
   105 #  if __GNUC__ + (__GNUC_MINOR__ >= 7) > 4
   106 #   pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
   107 #  endif
   108 # endif
   109 #endif
   110 
   111 /*
   112  * Context of comparison operation.
   113  */
   114 struct context
   115 {
   116   #ifdef ELEMENT
   117   /* Vectors being compared.  */
   118   ELEMENT const *xvec;
   119   ELEMENT const *yvec;
   120   #endif
   121 
   122   /* Extra fields.  */
   123   EXTRA_CONTEXT_FIELDS
   124 
   125   /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
   126      furthest along the given diagonal in the forward search of the edit
   127      matrix.  */
   128   OFFSET *fdiag;
   129 
   130   /* Vector, indexed by diagonal, containing the X coordinate of the point
   131      furthest along the given diagonal in the backward search of the edit
   132      matrix.  */
   133   OFFSET *bdiag;
   134 
   135   #ifdef USE_HEURISTIC
   136   /* This corresponds to the diff --speed-large-files flag.  With this
   137      heuristic, for vectors with a constant small density of changes,
   138      the algorithm is linear in the vector size.  */
   139   bool heuristic;
   140   #endif
   141 
   142   /* Edit scripts longer than this are too expensive to compute.  */
   143   OFFSET too_expensive;
   144 
   145   /* Snakes bigger than this are considered "big".  */
   146   #define SNAKE_LIMIT 20
   147 };
   148 
   149 struct partition
   150 {
   151   /* Midpoints of this partition.  */
   152   OFFSET xmid;
   153   OFFSET ymid;
   154 
   155   /* True if low half will be analyzed minimally.  */
   156   bool lo_minimal;
   157 
   158   /* Likewise for high half.  */
   159   bool hi_minimal;
   160 };
   161 
   162 
   163 /* Find the midpoint of the shortest edit script for a specified portion
   164    of the two vectors.
   165 
   166    Scan from the beginnings of the vectors, and simultaneously from the ends,
   167    doing a breadth-first search through the space of edit-sequence.
   168    When the two searches meet, we have found the midpoint of the shortest
   169    edit sequence.
   170 
   171    If FIND_MINIMAL is true, find the minimal edit script regardless of
   172    expense.  Otherwise, if the search is too expensive, use heuristics to
   173    stop the search and report a suboptimal answer.
   174 
   175    Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
   176    XMID - YMID equals the number of inserted elements minus the number
   177    of deleted elements (counting only elements before the midpoint).
   178 
   179    Set PART->lo_minimal to true iff the minimal edit script for the
   180    left half of the partition is known; similarly for PART->hi_minimal.
   181 
   182    This function assumes that the first elements of the specified portions
   183    of the two vectors do not match, and likewise that the last elements do not
   184    match.  The caller must trim matching elements from the beginning and end
   185    of the portions it is going to specify.
   186 
   187    If we return the "wrong" partitions, the worst this can do is cause
   188    suboptimal diff output.  It cannot cause incorrect diff output.  */
   189 
   190 static void
   191 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
   192       struct partition *part, struct context *ctxt)
   193 {
   194   OFFSET *const fd = ctxt->fdiag;       /* Give the compiler a chance. */
   195   OFFSET *const bd = ctxt->bdiag;       /* Additional help for the compiler. */
   196 #ifdef ELEMENT
   197   ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
   198   ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
   199   #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
   200 #else
   201   #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
   202 #endif
   203   const OFFSET dmin = xoff - ylim;      /* Minimum valid diagonal. */
   204   const OFFSET dmax = xlim - yoff;      /* Maximum valid diagonal. */
   205   const OFFSET fmid = xoff - yoff;      /* Center diagonal of top-down search. */
   206   const OFFSET bmid = xlim - ylim;      /* Center diagonal of bottom-up search. */
   207   OFFSET fmin = fmid;
   208   OFFSET fmax = fmid;           /* Limits of top-down search. */
   209   OFFSET bmin = bmid;
   210   OFFSET bmax = bmid;           /* Limits of bottom-up search. */
   211   OFFSET c;                     /* Cost. */
   212   bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
   213                                    diagonal with respect to the northwest. */
   214 
   215   fd[fmid] = xoff;
   216   bd[bmid] = xlim;
   217 
   218   for (c = 1;; ++c)
   219     {
   220       OFFSET d;                 /* Active diagonal. */
   221       bool big_snake = false;
   222 
   223       /* Extend the top-down search by an edit step in each diagonal. */
   224       if (fmin > dmin)
   225         fd[--fmin - 1] = -1;
   226       else
   227         ++fmin;
   228       if (fmax < dmax)
   229         fd[++fmax + 1] = -1;
   230       else
   231         --fmax;
   232       for (d = fmax; d >= fmin; d -= 2)
   233         {
   234           OFFSET x;
   235           OFFSET y;
   236           OFFSET tlo = fd[d - 1];
   237           OFFSET thi = fd[d + 1];
   238           OFFSET x0 = tlo < thi ? thi : tlo + 1;
   239 
   240           for (x = x0, y = x0 - d;
   241                x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
   242                x++, y++)
   243             continue;
   244           if (x - x0 > SNAKE_LIMIT)
   245             big_snake = true;
   246           fd[d] = x;
   247           if (odd && bmin <= d && d <= bmax && bd[d] <= x)
   248             {
   249               part->xmid = x;
   250               part->ymid = y;
   251               part->lo_minimal = part->hi_minimal = true;
   252               return;
   253             }
   254         }
   255 
   256       /* Similarly extend the bottom-up search.  */
   257       if (bmin > dmin)
   258         bd[--bmin - 1] = OFFSET_MAX;
   259       else
   260         ++bmin;
   261       if (bmax < dmax)
   262         bd[++bmax + 1] = OFFSET_MAX;
   263       else
   264         --bmax;
   265       for (d = bmax; d >= bmin; d -= 2)
   266         {
   267           OFFSET x;
   268           OFFSET y;
   269           OFFSET tlo = bd[d - 1];
   270           OFFSET thi = bd[d + 1];
   271           OFFSET x0 = tlo < thi ? tlo : thi - 1;
   272 
   273           for (x = x0, y = x0 - d;
   274                xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
   275                x--, y--)
   276             continue;
   277           if (x0 - x > SNAKE_LIMIT)
   278             big_snake = true;
   279           bd[d] = x;
   280           if (!odd && fmin <= d && d <= fmax && x <= fd[d])
   281             {
   282               part->xmid = x;
   283               part->ymid = y;
   284               part->lo_minimal = part->hi_minimal = true;
   285               return;
   286             }
   287         }
   288 
   289       if (find_minimal)
   290         continue;
   291 
   292 #ifdef USE_HEURISTIC
   293       bool heuristic = ctxt->heuristic;
   294 #else
   295       bool heuristic = false;
   296 #endif
   297 
   298       /* Heuristic: check occasionally for a diagonal that has made lots
   299          of progress compared with the edit distance.  If we have any
   300          such, find the one that has made the most progress and return it
   301          as if it had succeeded.
   302 
   303          With this heuristic, for vectors with a constant small density
   304          of changes, the algorithm is linear in the vector size.  */
   305 
   306       if (200 < c && big_snake && heuristic)
   307         {
   308           {
   309             OFFSET best = 0;
   310 
   311             for (d = fmax; d >= fmin; d -= 2)
   312               {
   313                 OFFSET dd = d - fmid;
   314                 OFFSET x = fd[d];
   315                 OFFSET y = x - d;
   316                 OFFSET v = (x - xoff) * 2 - dd;
   317 
   318                 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
   319                   {
   320                     if (v > best
   321                         && xoff + SNAKE_LIMIT <= x && x < xlim
   322                         && yoff + SNAKE_LIMIT <= y && y < ylim)
   323                       {
   324                         /* We have a good enough best diagonal; now insist
   325                            that it end with a significant snake.  */
   326                         int k;
   327 
   328                         for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
   329                           if (k == SNAKE_LIMIT)
   330                             {
   331                               best = v;
   332                               part->xmid = x;
   333                               part->ymid = y;
   334                               break;
   335                             }
   336                       }
   337                   }
   338               }
   339             if (best > 0)
   340               {
   341                 part->lo_minimal = true;
   342                 part->hi_minimal = false;
   343                 return;
   344               }
   345           }
   346 
   347           {
   348             OFFSET best = 0;
   349 
   350             for (d = bmax; d >= bmin; d -= 2)
   351               {
   352                 OFFSET dd = d - bmid;
   353                 OFFSET x = bd[d];
   354                 OFFSET y = x - d;
   355                 OFFSET v = (xlim - x) * 2 + dd;
   356 
   357                 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
   358                   {
   359                     if (v > best
   360                         && xoff < x && x <= xlim - SNAKE_LIMIT
   361                         && yoff < y && y <= ylim - SNAKE_LIMIT)
   362                       {
   363                         /* We have a good enough best diagonal; now insist
   364                            that it end with a significant snake.  */
   365                         int k;
   366 
   367                         for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
   368                           if (k == SNAKE_LIMIT - 1)
   369                             {
   370                               best = v;
   371                               part->xmid = x;
   372                               part->ymid = y;
   373                               break;
   374                             }
   375                       }
   376                   }
   377               }
   378             if (best > 0)
   379               {
   380                 part->lo_minimal = false;
   381                 part->hi_minimal = true;
   382                 return;
   383               }
   384           }
   385         }
   386 
   387       /* Heuristic: if we've gone well beyond the call of duty, give up
   388          and report halfway between our best results so far.  */
   389       if (c >= ctxt->too_expensive)
   390         {
   391           OFFSET fxybest;
   392           OFFSET fxbest IF_LINT (= 0);
   393           OFFSET bxybest;
   394           OFFSET bxbest IF_LINT (= 0);
   395 
   396           /* Find forward diagonal that maximizes X + Y.  */
   397           fxybest = -1;
   398           for (d = fmax; d >= fmin; d -= 2)
   399             {
   400               OFFSET x = MIN (fd[d], xlim);
   401               OFFSET y = x - d;
   402               if (ylim < y)
   403                 {
   404                   x = ylim + d;
   405                   y = ylim;
   406                 }
   407               if (fxybest < x + y)
   408                 {
   409                   fxybest = x + y;
   410                   fxbest = x;
   411                 }
   412             }
   413 
   414           /* Find backward diagonal that minimizes X + Y.  */
   415           bxybest = OFFSET_MAX;
   416           for (d = bmax; d >= bmin; d -= 2)
   417             {
   418               OFFSET x = MAX (xoff, bd[d]);
   419               OFFSET y = x - d;
   420               if (y < yoff)
   421                 {
   422                   x = yoff + d;
   423                   y = yoff;
   424                 }
   425               if (x + y < bxybest)
   426                 {
   427                   bxybest = x + y;
   428                   bxbest = x;
   429                 }
   430             }
   431 
   432           /* Use the better of the two diagonals.  */
   433           if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
   434             {
   435               part->xmid = fxbest;
   436               part->ymid = fxybest - fxbest;
   437               part->lo_minimal = true;
   438               part->hi_minimal = false;
   439             }
   440           else
   441             {
   442               part->xmid = bxbest;
   443               part->ymid = bxybest - bxbest;
   444               part->lo_minimal = false;
   445               part->hi_minimal = true;
   446             }
   447           return;
   448         }
   449     }
   450   #undef XREF_YREF_EQUAL
   451 }
   452 
   453 
   454 /* Compare in detail contiguous subsequences of the two vectors
   455    which are known, as a whole, to match each other.
   456 
   457    The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
   458 
   459    Note that XLIM, YLIM are exclusive bounds.  All indices into the vectors
   460    are origin-0.
   461 
   462    If FIND_MINIMAL, find a minimal difference no matter how
   463    expensive it is.
   464 
   465    The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
   466 
   467    Return false if terminated normally, or true if terminated through early
   468    abort.  */
   469 
   470 static bool
   471 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
   472             bool find_minimal, struct context *ctxt)
   473 {
   474 #ifdef ELEMENT
   475   ELEMENT const *xv = ctxt->xvec; /* Help the compiler.  */
   476   ELEMENT const *yv = ctxt->yvec;
   477   #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
   478 #else
   479   #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
   480 #endif
   481 
   482   while (true)
   483     {
   484       /* Slide down the bottom initial diagonal.  */
   485       while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
   486         {
   487           xoff++;
   488           yoff++;
   489         }
   490 
   491       /* Slide up the top initial diagonal. */
   492       while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
   493         {
   494           xlim--;
   495           ylim--;
   496         }
   497 
   498       /* Handle simple cases. */
   499       if (xoff == xlim)
   500         {
   501           while (yoff < ylim)
   502             {
   503               NOTE_INSERT (ctxt, yoff);
   504               if (EARLY_ABORT (ctxt))
   505                 return true;
   506               yoff++;
   507             }
   508           break;
   509         }
   510       if (yoff == ylim)
   511         {
   512           while (xoff < xlim)
   513             {
   514               NOTE_DELETE (ctxt, xoff);
   515               if (EARLY_ABORT (ctxt))
   516                 return true;
   517               xoff++;
   518             }
   519           break;
   520         }
   521 
   522       struct partition part;
   523 
   524       /* Find a point of correspondence in the middle of the vectors.  */
   525       diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
   526 
   527       /* Use the partitions to split this problem into subproblems.  */
   528       OFFSET xoff1, xlim1, yoff1, ylim1, xoff2, xlim2, yoff2, ylim2;
   529       bool find_minimal1, find_minimal2;
   530       if (!NOTE_ORDERED
   531           && ((xlim + ylim) - (part.xmid + part.ymid)
   532               < (part.xmid + part.ymid) - (xoff + yoff)))
   533         {
   534           /* The second problem is smaller and the caller doesn't
   535              care about order, so do the second problem first to
   536              lessen recursion.  */
   537           xoff1 = part.xmid; xlim1 = xlim;
   538           yoff1 = part.ymid; ylim1 = ylim;
   539           find_minimal1 = part.hi_minimal;
   540 
   541           xoff2 = xoff; xlim2 = part.xmid;
   542           yoff2 = yoff; ylim2 = part.ymid;
   543           find_minimal2 = part.lo_minimal;
   544         }
   545       else
   546         {
   547           xoff1 = xoff; xlim1 = part.xmid;
   548           yoff1 = yoff; ylim1 = part.ymid;
   549           find_minimal1 = part.lo_minimal;
   550 
   551           xoff2 = part.xmid; xlim2 = xlim;
   552           yoff2 = part.ymid; ylim2 = ylim;
   553           find_minimal2 = part.hi_minimal;
   554         }
   555 
   556       /* Recurse to do one subproblem.  */
   557       bool early = compareseq (xoff1, xlim1, yoff1, ylim1, find_minimal1, ctxt);
   558       if (early)
   559         return early;
   560 
   561       /* Iterate to do the other subproblem.  */
   562       xoff = xoff2; xlim = xlim2;
   563       yoff = yoff2; ylim = ylim2;
   564       find_minimal = find_minimal2;
   565     }
   566 
   567   return false;
   568   #undef XREF_YREF_EQUAL
   569 }
   570 
   571 #if __GNUC__ + (__GNUC_MINOR__ >= 7) > 4
   572 # pragma GCC diagnostic pop
   573 #endif
   574 
   575 #undef ELEMENT
   576 #undef EQUAL
   577 #undef OFFSET
   578 #undef EXTRA_CONTEXT_FIELDS
   579 #undef NOTE_DELETE
   580 #undef NOTE_INSERT
   581 #undef EARLY_ABORT
   582 #undef USE_HEURISTIC
   583 #undef XVECREF_YVECREF_EQUAL
   584 #undef OFFSET_MAX

/* [<][>][^][v][top][bottom][index][help] */