001 /*--------------------------------------------------------------------------+ 002 $Id: Diff.java 26283 2010-02-18 11:18:57Z juergens $ 003 | | 004 | Copyright 2005-2010 Technische Universitaet Muenchen | 005 | | 006 | Licensed under the Apache License, Version 2.0 (the "License"); | 007 | you may not use this file except in compliance with the License. | 008 | You may obtain a copy of the License at | 009 | | 010 | http://www.apache.org/licenses/LICENSE-2.0 | 011 | | 012 | Unless required by applicable law or agreed to in writing, software | 013 | distributed under the License is distributed on an "AS IS" BASIS, | 014 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 015 | See the License for the specific language governing permissions and | 016 | limitations under the License. | 017 +--------------------------------------------------------------------------*/ 018 package edu.tum.cs.commons.algo; 019 020 import java.util.ArrayList; 021 import java.util.Arrays; 022 import java.util.List; 023 024 import edu.tum.cs.commons.assertion.CCSMPre; 025 import edu.tum.cs.commons.string.StringUtils; 026 027 /** 028 * Implementation of the diff algorithm described in: E.W. Myers: "An O(ND) 029 * Difference Algorithm and Its Variations". 030 * <p> 031 * Let N be the sum of the concatenated input strings and D the size of the 032 * delta (i.e. the number of changes required to transform one string into the 033 * other). Then the time complexity is O(ND) and the space complexity is O(D^2). 034 * 035 * @param <T> 036 * The type of objects for which the diff is constructed. 037 * 038 * @author hummelb 039 * @author $Author: juergens $ 040 * @version $Rev: 26283 $ 041 * @levd.rating GREEN Hash: 2B0182DC254FAFBACB0727A1A353E51C 042 */ 043 public class Diff<T> { 044 045 /** The first list of objects. */ 046 private final List<T> a; 047 048 /** The second list of objects. */ 049 private final List<T> b; 050 051 /** Length of {@link #a}. */ 052 private final int n; 053 054 /** Length of {@link #b}. */ 055 private final int m; 056 057 /** The maximal possible difference between {@link #a} and {@link #b}. */ 058 private final int max; 059 060 /** 061 * The array for storing the positions on each diagonal. This is an 062 * "unrolled" version compared to the original paper, i.e. we create a new 063 * array for each iteration of the main loop. 064 */ 065 private final int[][] v; 066 067 /** 068 * This array stores from where we came during the 069 * {@link #calculateDeltaSize()} method. Its structure is the same as 070 * {@link #v}. 071 */ 072 private final boolean[][] from; 073 074 /** 075 * Hidden constructor. Use one of the {@link #computeDelta(List, List)} or 076 * {@link #computeDelta(Object[], Object[])} methods instead. 077 */ 078 private Diff(List<T> a, List<T> b) { 079 this.a = a; 080 this.b = b; 081 082 n = a.size(); 083 m = b.size(); 084 max = n + m; 085 v = new int[max + 1][]; 086 from = new boolean[max + 1][]; 087 } 088 089 /** Performs the actual computations. */ 090 private Delta<T> computeDelta() { 091 return constructDelta(calculateDeltaSize()); 092 } 093 094 /** Constructs the actual delta. */ 095 private Delta<T> constructDelta(int size) { 096 int d = size; 097 int k = -size; 098 while (v[size][size + k] < n || v[size][d + k] - k < m) { 099 ++k; 100 } 101 102 Delta<T> delta = new Delta<T>(size, n, m); 103 104 int difference = n - m; 105 while (d > 0) { 106 if (from[d][d + k]) { 107 ++k; 108 } else { 109 --k; 110 } 111 --d; 112 113 int x = v[d][d + k]; 114 int y = x - k; 115 116 int newDifference = x - y; 117 if (newDifference > difference) { 118 delta.position[d] = y + 1; 119 delta.t[d] = b.get(y); 120 } else { 121 delta.position[d] = -x - 1; 122 delta.t[d] = a.get(x); 123 } 124 difference = newDifference; 125 } 126 return delta; 127 } 128 129 /** 130 * Calculates the size of the delta (i.e. the number of additions and 131 * deletions. Additionally the {@link #v} and {@link #from} arrays are 132 * filled. 133 */ 134 private int calculateDeltaSize() { 135 int size = -1; 136 for (int d = 0; size < 0 && d <= max; ++d) { 137 v[d] = new int[2 * d + 1]; 138 from[d] = new boolean[2 * d + 1]; 139 for (int k = -d; k <= d; k += 2) { 140 int x = 0; 141 if (d > 0) { 142 if (k == -d 143 || k != d 144 && v[d - 1][d - 1 + k - 1] < v[d - 1][d - 1 + k + 1]) { 145 x = v[d - 1][d - 1 + k + 1]; 146 from[d][d + k] = true; 147 } else { 148 x = v[d - 1][d - 1 + k - 1] + 1; 149 from[d][d + k] = false; 150 } 151 } 152 int y = x - k; 153 while (x < n && y < m && a.get(x).equals(b.get(y))) { 154 ++x; 155 ++y; 156 } 157 v[d][d + k] = x; 158 if (x >= n && y >= m) { 159 size = d; 160 } 161 } 162 } 163 return size; 164 } 165 166 /** 167 * Applies the diff algorithm on the supplied arrays and returns the delta 168 * between them. 169 * 170 * @param a 171 * the first "word", i.e., array of objects to produce a delta 172 * for. 173 * @param b 174 * the second "word", i.e., array of objects to produce a delta 175 * for. 176 * @return a delta containing the differences between a and b. 177 */ 178 public static <T> Delta<T> computeDelta(T[] a, T[] b) { 179 return computeDelta(Arrays.asList(a), Arrays.asList(b)); 180 } 181 182 /** 183 * Applies the diff algorithm on the supplied lists and returns the delta 184 * between them. 185 * 186 * @param a 187 * the first "word", i.e., list of objects to produce a delta 188 * for. 189 * @param b 190 * the second "word", i.e., list of objects to produce a delta 191 * for. 192 * @return a delta containing the differences between a and b. 193 */ 194 public static <T> Delta<T> computeDelta(List<T> a, List<T> b) { 195 return new Diff<T>(a, b).computeDelta(); 196 } 197 198 /** 199 * Objects of this class describe the additions and deletions used to 200 * transform between two words. 201 */ 202 public static class Delta<T> { 203 204 /** The size of the first word. */ 205 private final int n; 206 207 /** The size of the second word. */ 208 private final int m; 209 210 /** 211 * This array stores the position at which a string is changed. If it is 212 * positive, it indicates an addition (i.e. the position is for the 213 * second string). Otherwise it is a deletion (i.e. the (negated) 214 * position is for the first string). To allow storing a sign for 215 * position 0, all positions are incremented before (so this has to be 216 * compensated for). 217 */ 218 private final int[] position; 219 220 /** 221 * This array stores the characters which are added or deleted 222 * (interpretation depends on {@link #position}). 223 */ 224 private final T[] t; 225 226 /** Create new delta of given size. */ 227 @SuppressWarnings("unchecked") 228 private Delta(int size, int n, int m) { 229 this.n = n; 230 this.m = m; 231 position = new int[size]; 232 t = (T[]) new Object[size]; 233 } 234 235 /** 236 * Returns the size of the delta, i.e. the number of additions and 237 * deletions. 238 */ 239 public int getSize() { 240 return position.length; 241 } 242 243 /** Returns the size of the first word the delta was created for. */ 244 public int getN() { 245 return n; 246 } 247 248 /** Returns the size of the second word the delta was created for. */ 249 public int getM() { 250 return m; 251 } 252 253 /** Returns the i-th element stored as addition or deletion. */ 254 public T getT(int i) { 255 return t[i]; 256 } 257 258 /** 259 * Returns the i-th element of the change positions. If it is positive, 260 * it indicates an addition (i.e. the position is for the second 261 * string). Otherwise it is a deletion (i.e. the (negated) position is 262 * for the first string). To allow storing a sign for position 0, all 263 * positions are incremented before (so this has to be compensated for). 264 */ 265 public int getPosition(int i) { 266 return position[i]; 267 } 268 269 /** 270 * Applies the forward patch, i.e. if the first string is inserted, then 271 * the second string is returned. The input word must be of length n, 272 * the output word will be of length m. 273 */ 274 public List<T> forwardPatch(List<T> a) { 275 CCSMPre.isTrue(a.size() == n, "Input word must be of size " + n); 276 return doPatch(a, new ArrayList<T>(m), 1); 277 } 278 279 /** 280 * Applies the backward patch, i.e. if the second string is inserted, 281 * then the first string is returned. The input word must be of length 282 * m, the output word will be of length n. 283 */ 284 public List<T> backwardPatch(List<T> b) { 285 CCSMPre.isTrue(b.size() == m, "Input word must be of size " + m); 286 return doPatch(b, new ArrayList<T>(n), -1); 287 } 288 289 /** 290 * Performs the patching from a to b put pre-multiplying the positions 291 * with the given factor. 292 */ 293 private List<T> doPatch(List<T> a, List<T> b, int positionFactor) { 294 int posA = 0; 295 int posB = 0; 296 297 for (int j = 0; j < position.length; ++j) { 298 int k = position[j] * positionFactor; 299 if (k > 0) { 300 // add character 301 k = k - 1; 302 while (posB < k) { 303 b.add(a.get(posA)); 304 ++posA; 305 ++posB; 306 } 307 b.add(t[j]); 308 ++posB; 309 } else { 310 // delete character 311 k = -k - 1; 312 while (posA < k) { 313 b.add(a.get(posA)); 314 ++posA; 315 ++posB; 316 } 317 ++posA; 318 } 319 } 320 while (posA < a.size()) { 321 b.add(a.get(posA)); 322 ++posA; 323 ++posB; 324 } 325 326 return b; 327 } 328 329 /** {@inheritDoc} */ 330 @Override 331 public String toString() { 332 StringBuilder sb = new StringBuilder(); 333 for (int i = 0; i < position.length; ++i) { 334 sb.append(Math.abs(position[i]) - 1); 335 if (position[i] > 0) { 336 sb.append("+ "); 337 } else { 338 sb.append("- "); 339 } 340 sb.append(t[i] + StringUtils.CR); 341 } 342 return sb.toString(); 343 } 344 } 345 }