root/lib/diffseq.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes following definitions.
  1. diag
  2. compareseq

     1 /* Analyze differences between two vectors.
     2 
     3    Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2023 Free Software
     4    Foundation, Inc.
     5 
     6    This program is free software: you can redistribute it and/or modify
     7    it under the terms of the GNU General Public License as published by
     8    the Free Software Foundation, either version 3 of the License, or
     9    (at your option) any later version.
    10 
    11    This program is distributed in the hope that it will be useful,
    12    but WITHOUT ANY WARRANTY; without even the implied warranty of
    13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14    GNU General Public License for more details.
    15 
    16    You should have received a copy of the GNU General Public License
    17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
    18 
    19 
    20 /* The basic idea is to consider two vectors as similar if, when
    21    transforming the first vector into the second vector through a
    22    sequence of edits (inserts and deletes of one element each),
    23    this sequence is short - or equivalently, if the ordered list
    24    of elements that are untouched by these edits is long.  For a
    25    good introduction to the subject, read about the "Levenshtein
    26    distance" in Wikipedia.
    27 
    28    The basic algorithm is described in:
    29    "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
    30    Algorithmica Vol. 1, 1986, pp. 251-266,
    31    <https://doi.org/10.1007/BF01840446>.
    32    See especially section 4.2, which describes the variation used below.
    33 
    34    The basic algorithm was independently discovered as described in:
    35    "Algorithms for Approximate String Matching", Esko Ukkonen,
    36    Information and Control Vol. 64, 1985, pp. 100-118,
    37    <https://doi.org/10.1016/S0019-9958(85)80046-2>.
    38 
    39    Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
    40    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
    41    at the price of producing suboptimal output for large inputs with
    42    many differences.  */
    43 
    44 /* Before including this file, you need to define:
    45      ELEMENT                 The element type of the vectors being compared.
    46      EQUAL                   A two-argument macro that tests two elements for
    47                              equality.
    48      OFFSET                  A signed integer type sufficient to hold the
    49                              difference between two indices.  Usually
    50                              something like ptrdiff_t.
    51      OFFSET_MAX              (Optional) The maximum value of OFFSET (e.g.,
    52                              PTRDIFF_MAX).  If omitted, it is inferred in a
    53                              way portable to the vast majority of C platforms,
    54                              as they lack padding bits.
    55      EXTRA_CONTEXT_FIELDS    Declarations of fields for 'struct context'.
    56      NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
    57      NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
    58      NOTE_ORDERED            (Optional) A boolean expression saying that
    59                              NOTE_DELETE and NOTE_INSERT calls must be
    60                              issued in offset order.
    61      EARLY_ABORT(ctxt)       (Optional) A boolean expression that triggers an
    62                              early abort of the computation.
    63      USE_HEURISTIC           (Optional) Define if you want to support the
    64                              heuristic for large vectors.
    65 
    66    It is also possible to use this file with abstract arrays.  In this case,
    67    xvec and yvec are not represented in memory.  They only exist conceptually.
    68    In this case, the list of defines above is amended as follows:
    69      ELEMENT                 Undefined.
    70      EQUAL                   Undefined.
    71      XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
    72                              A three-argument macro: References xvec[xoff] and
    73                              yvec[yoff] and tests these elements for equality.
    74 
    75    Before including this file, you also need to include:
    76      #include <limits.h>
    77      #include "minmax.h"
    78  */
    79 
    80 /* Maximum value of type OFFSET.  */
    81 #ifndef OFFSET_MAX
    82 # define OFFSET_MAX \
    83    ((((OFFSET) 1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
    84 #endif
    85 
    86 /* Default to no early abort.  */
    87 #ifndef EARLY_ABORT
    88 # define EARLY_ABORT(ctxt) false
    89 #endif
    90 
    91 #ifndef NOTE_ORDERED
    92 # define NOTE_ORDERED false
    93 #endif
    94 
    95 /* Suppress gcc's "...may be used before initialized" warnings,
    96    generated by GCC versions up to at least GCC 13.2.  */
    97 #if __GNUC__ + (__GNUC_MINOR__ >= 7) > 4
    98 # pragma GCC diagnostic push
    99 # pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
   100 #endif
   101 
   102 /*
   103  * Context of comparison operation.
   104  */
   105 struct context
   106 {
   107   #ifdef ELEMENT
   108   /* Vectors being compared.  */
   109   ELEMENT const *xvec;
   110   ELEMENT const *yvec;
   111   #endif
   112 
   113   /* Extra fields.  */
   114   EXTRA_CONTEXT_FIELDS
   115 
   116   /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
   117      furthest along the given diagonal in the forward search of the edit
   118      matrix.  */
   119   OFFSET *fdiag;
   120 
   121   /* Vector, indexed by diagonal, containing the X coordinate of the point
   122      furthest along the given diagonal in the backward search of the edit
   123      matrix.  */
   124   OFFSET *bdiag;
   125 
   126   #ifdef USE_HEURISTIC
   127   /* This corresponds to the diff --speed-large-files flag.  With this
   128      heuristic, for vectors with a constant small density of changes,
   129      the algorithm is linear in the vector size.  */
   130   bool heuristic;
   131   #endif
   132 
   133   /* Edit scripts longer than this are too expensive to compute.  */
   134   OFFSET too_expensive;
   135 
   136   /* Snakes bigger than this are considered "big".  */
   137   #define SNAKE_LIMIT 20
   138 };
   139 
   140 struct partition
   141 {
   142   /* Midpoints of this partition.  */
   143   OFFSET xmid;
   144   OFFSET ymid;
   145 
   146   /* True if low half will be analyzed minimally.  */
   147   bool lo_minimal;
   148 
   149   /* Likewise for high half.  */
   150   bool hi_minimal;
   151 };
   152 
   153 
   154 /* Find the midpoint of the shortest edit script for a specified portion
   155    of the two vectors.
   156 
   157    Scan from the beginnings of the vectors, and simultaneously from the ends,
   158    doing a breadth-first search through the space of edit-sequence.
   159    When the two searches meet, we have found the midpoint of the shortest
   160    edit sequence.
   161 
   162    If FIND_MINIMAL is true, find the minimal edit script regardless of
   163    expense.  Otherwise, if the search is too expensive, use heuristics to
   164    stop the search and report a suboptimal answer.
   165 
   166    Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
   167    XMID - YMID equals the number of inserted elements minus the number
   168    of deleted elements (counting only elements before the midpoint).
   169 
   170    Set PART->lo_minimal to true iff the minimal edit script for the
   171    left half of the partition is known; similarly for PART->hi_minimal.
   172 
   173    This function assumes that the first elements of the specified portions
   174    of the two vectors do not match, and likewise that the last elements do not
   175    match.  The caller must trim matching elements from the beginning and end
   176    of the portions it is going to specify.
   177 
   178    If we return the "wrong" partitions, the worst this can do is cause
   179    suboptimal diff output.  It cannot cause incorrect diff output.  */
   180 
   181 static void
   182 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
   183       struct partition *part, struct context *ctxt)
   184 {
   185   OFFSET *const fd = ctxt->fdiag;       /* Give the compiler a chance. */
   186   OFFSET *const bd = ctxt->bdiag;       /* Additional help for the compiler. */
   187 #ifdef ELEMENT
   188   ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
   189   ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
   190   #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
   191 #else
   192   #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
   193 #endif
   194   const OFFSET dmin = xoff - ylim;      /* Minimum valid diagonal. */
   195   const OFFSET dmax = xlim - yoff;      /* Maximum valid diagonal. */
   196   const OFFSET fmid = xoff - yoff;      /* Center diagonal of top-down search. */
   197   const OFFSET bmid = xlim - ylim;      /* Center diagonal of bottom-up search. */
   198   OFFSET fmin = fmid;
   199   OFFSET fmax = fmid;           /* Limits of top-down search. */
   200   OFFSET bmin = bmid;
   201   OFFSET bmax = bmid;           /* Limits of bottom-up search. */
   202   OFFSET c;                     /* Cost. */
   203   bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
   204                                    diagonal with respect to the northwest. */
   205 
   206   fd[fmid] = xoff;
   207   bd[bmid] = xlim;
   208 
   209   for (c = 1;; ++c)
   210     {
   211       OFFSET d;                 /* Active diagonal. */
   212       bool big_snake = false;
   213 
   214       /* Extend the top-down search by an edit step in each diagonal. */
   215       if (fmin > dmin)
   216         fd[--fmin - 1] = -1;
   217       else
   218         ++fmin;
   219       if (fmax < dmax)
   220         fd[++fmax + 1] = -1;
   221       else
   222         --fmax;
   223       for (d = fmax; d >= fmin; d -= 2)
   224         {
   225           OFFSET x;
   226           OFFSET y;
   227           OFFSET tlo = fd[d - 1];
   228           OFFSET thi = fd[d + 1];
   229           OFFSET x0 = tlo < thi ? thi : tlo + 1;
   230 
   231           for (x = x0, y = x0 - d;
   232                x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
   233                x++, y++)
   234             continue;
   235           if (x - x0 > SNAKE_LIMIT)
   236             big_snake = true;
   237           fd[d] = x;
   238           if (odd && bmin <= d && d <= bmax && bd[d] <= x)
   239             {
   240               part->xmid = x;
   241               part->ymid = y;
   242               part->lo_minimal = part->hi_minimal = true;
   243               return;
   244             }
   245         }
   246 
   247       /* Similarly extend the bottom-up search.  */
   248       if (bmin > dmin)
   249         bd[--bmin - 1] = OFFSET_MAX;
   250       else
   251         ++bmin;
   252       if (bmax < dmax)
   253         bd[++bmax + 1] = OFFSET_MAX;
   254       else
   255         --bmax;
   256       for (d = bmax; d >= bmin; d -= 2)
   257         {
   258           OFFSET x;
   259           OFFSET y;
   260           OFFSET tlo = bd[d - 1];
   261           OFFSET thi = bd[d + 1];
   262           OFFSET x0 = tlo < thi ? tlo : thi - 1;
   263 
   264           for (x = x0, y = x0 - d;
   265                xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
   266                x--, y--)
   267             continue;
   268           if (x0 - x > SNAKE_LIMIT)
   269             big_snake = true;
   270           bd[d] = x;
   271           if (!odd && fmin <= d && d <= fmax && x <= fd[d])
   272             {
   273               part->xmid = x;
   274               part->ymid = y;
   275               part->lo_minimal = part->hi_minimal = true;
   276               return;
   277             }
   278         }
   279 
   280       if (find_minimal)
   281         continue;
   282 
   283 #ifdef USE_HEURISTIC
   284       bool heuristic = ctxt->heuristic;
   285 #else
   286       bool heuristic = false;
   287 #endif
   288 
   289       /* Heuristic: check occasionally for a diagonal that has made lots
   290          of progress compared with the edit distance.  If we have any
   291          such, find the one that has made the most progress and return it
   292          as if it had succeeded.
   293 
   294          With this heuristic, for vectors with a constant small density
   295          of changes, the algorithm is linear in the vector size.  */
   296 
   297       if (200 < c && big_snake && heuristic)
   298         {
   299           {
   300             OFFSET best = 0;
   301 
   302             for (d = fmax; d >= fmin; d -= 2)
   303               {
   304                 OFFSET dd = d - fmid;
   305                 OFFSET x = fd[d];
   306                 OFFSET y = x - d;
   307                 OFFSET v = (x - xoff) * 2 - dd;
   308 
   309                 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
   310                   {
   311                     if (v > best
   312                         && xoff + SNAKE_LIMIT <= x && x < xlim
   313                         && yoff + SNAKE_LIMIT <= y && y < ylim)
   314                       {
   315                         /* We have a good enough best diagonal; now insist
   316                            that it end with a significant snake.  */
   317                         int k;
   318 
   319                         for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
   320                           if (k == SNAKE_LIMIT)
   321                             {
   322                               best = v;
   323                               part->xmid = x;
   324                               part->ymid = y;
   325                               break;
   326                             }
   327                       }
   328                   }
   329               }
   330             if (best > 0)
   331               {
   332                 part->lo_minimal = true;
   333                 part->hi_minimal = false;
   334                 return;
   335               }
   336           }
   337 
   338           {
   339             OFFSET best = 0;
   340 
   341             for (d = bmax; d >= bmin; d -= 2)
   342               {
   343                 OFFSET dd = d - bmid;
   344                 OFFSET x = bd[d];
   345                 OFFSET y = x - d;
   346                 OFFSET v = (xlim - x) * 2 + dd;
   347 
   348                 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
   349                   {
   350                     if (v > best
   351                         && xoff < x && x <= xlim - SNAKE_LIMIT
   352                         && yoff < y && y <= ylim - SNAKE_LIMIT)
   353                       {
   354                         /* We have a good enough best diagonal; now insist
   355                            that it end with a significant snake.  */
   356                         int k;
   357 
   358                         for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
   359                           if (k == SNAKE_LIMIT - 1)
   360                             {
   361                               best = v;
   362                               part->xmid = x;
   363                               part->ymid = y;
   364                               break;
   365                             }
   366                       }
   367                   }
   368               }
   369             if (best > 0)
   370               {
   371                 part->lo_minimal = false;
   372                 part->hi_minimal = true;
   373                 return;
   374               }
   375           }
   376         }
   377 
   378       /* Heuristic: if we've gone well beyond the call of duty, give up
   379          and report halfway between our best results so far.  */
   380       if (c >= ctxt->too_expensive)
   381         {
   382           /* Find forward diagonal that maximizes X + Y.  */
   383           OFFSET fxybest = -1, fxbest;
   384           for (d = fmax; d >= fmin; d -= 2)
   385             {
   386               OFFSET x = MIN (fd[d], xlim);
   387               OFFSET y = x - d;
   388               if (ylim < y)
   389                 {
   390                   x = ylim + d;
   391                   y = ylim;
   392                 }
   393               if (fxybest < x + y)
   394                 {
   395                   fxybest = x + y;
   396                   fxbest = x;
   397                 }
   398             }
   399 
   400           /* Find backward diagonal that minimizes X + Y.  */
   401           OFFSET bxybest = OFFSET_MAX, bxbest;
   402           for (d = bmax; d >= bmin; d -= 2)
   403             {
   404               OFFSET x = MAX (xoff, bd[d]);
   405               OFFSET y = x - d;
   406               if (y < yoff)
   407                 {
   408                   x = yoff + d;
   409                   y = yoff;
   410                 }
   411               if (x + y < bxybest)
   412                 {
   413                   bxybest = x + y;
   414                   bxbest = x;
   415                 }
   416             }
   417 
   418           /* Use the better of the two diagonals.  */
   419           if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
   420             {
   421               part->xmid = fxbest;
   422               part->ymid = fxybest - fxbest;
   423               part->lo_minimal = true;
   424               part->hi_minimal = false;
   425             }
   426           else
   427             {
   428               part->xmid = bxbest;
   429               part->ymid = bxybest - bxbest;
   430               part->lo_minimal = false;
   431               part->hi_minimal = true;
   432             }
   433           return;
   434         }
   435     }
   436   #undef XREF_YREF_EQUAL
   437 }
   438 
   439 
   440 /* Compare in detail contiguous subsequences of the two vectors
   441    which are known, as a whole, to match each other.
   442 
   443    The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
   444 
   445    Note that XLIM, YLIM are exclusive bounds.  All indices into the vectors
   446    are origin-0.
   447 
   448    If FIND_MINIMAL, find a minimal difference no matter how
   449    expensive it is.
   450 
   451    The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
   452 
   453    Return false if terminated normally, or true if terminated through early
   454    abort.  */
   455 
   456 static bool
   457 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
   458             bool find_minimal, struct context *ctxt)
   459 {
   460 #ifdef ELEMENT
   461   ELEMENT const *xv = ctxt->xvec; /* Help the compiler.  */
   462   ELEMENT const *yv = ctxt->yvec;
   463   #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
   464 #else
   465   #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
   466 #endif
   467 
   468   while (true)
   469     {
   470       /* Slide down the bottom initial diagonal.  */
   471       while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
   472         {
   473           xoff++;
   474           yoff++;
   475         }
   476 
   477       /* Slide up the top initial diagonal. */
   478       while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
   479         {
   480           xlim--;
   481           ylim--;
   482         }
   483 
   484       /* Handle simple cases. */
   485       if (xoff == xlim)
   486         {
   487           while (yoff < ylim)
   488             {
   489               NOTE_INSERT (ctxt, yoff);
   490               if (EARLY_ABORT (ctxt))
   491                 return true;
   492               yoff++;
   493             }
   494           break;
   495         }
   496       if (yoff == ylim)
   497         {
   498           while (xoff < xlim)
   499             {
   500               NOTE_DELETE (ctxt, xoff);
   501               if (EARLY_ABORT (ctxt))
   502                 return true;
   503               xoff++;
   504             }
   505           break;
   506         }
   507 
   508       struct partition part;
   509 
   510       /* Find a point of correspondence in the middle of the vectors.  */
   511       diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
   512 
   513       /* Use the partitions to split this problem into subproblems.  */
   514       OFFSET xoff1, xlim1, yoff1, ylim1, xoff2, xlim2, yoff2, ylim2;
   515       bool find_minimal1, find_minimal2;
   516       if (!NOTE_ORDERED
   517           && ((xlim + ylim) - (part.xmid + part.ymid)
   518               < (part.xmid + part.ymid) - (xoff + yoff)))
   519         {
   520           /* The second problem is smaller and the caller doesn't
   521              care about order, so do the second problem first to
   522              lessen recursion.  */
   523           xoff1 = part.xmid; xlim1 = xlim;
   524           yoff1 = part.ymid; ylim1 = ylim;
   525           find_minimal1 = part.hi_minimal;
   526 
   527           xoff2 = xoff; xlim2 = part.xmid;
   528           yoff2 = yoff; ylim2 = part.ymid;
   529           find_minimal2 = part.lo_minimal;
   530         }
   531       else
   532         {
   533           xoff1 = xoff; xlim1 = part.xmid;
   534           yoff1 = yoff; ylim1 = part.ymid;
   535           find_minimal1 = part.lo_minimal;
   536 
   537           xoff2 = part.xmid; xlim2 = xlim;
   538           yoff2 = part.ymid; ylim2 = ylim;
   539           find_minimal2 = part.hi_minimal;
   540         }
   541 
   542       /* Recurse to do one subproblem.  */
   543       bool early = compareseq (xoff1, xlim1, yoff1, ylim1, find_minimal1, ctxt);
   544       if (early)
   545         return early;
   546 
   547       /* Iterate to do the other subproblem.  */
   548       xoff = xoff2; xlim = xlim2;
   549       yoff = yoff2; ylim = ylim2;
   550       find_minimal = find_minimal2;
   551     }
   552 
   553   return false;
   554   #undef XREF_YREF_EQUAL
   555 }
   556 
   557 #if __GNUC__ + (__GNUC_MINOR__ >= 7) > 4
   558 # pragma GCC diagnostic pop
   559 #endif
   560 
   561 #undef ELEMENT
   562 #undef EQUAL
   563 #undef OFFSET
   564 #undef EXTRA_CONTEXT_FIELDS
   565 #undef NOTE_DELETE
   566 #undef NOTE_INSERT
   567 #undef EARLY_ABORT
   568 #undef USE_HEURISTIC
   569 #undef XVECREF_YVECREF_EQUAL
   570 #undef OFFSET_MAX

/* [<][>][^][v][top][bottom][index][help] */