| /* |
| * Copyright (C) 2010, Google Inc. and others |
| * |
| * This program and the accompanying materials are made available under the |
| * terms of the Eclipse Distribution License v. 1.0 which is available at |
| * https://www.eclipse.org/org/documents/edl-v10.php. |
| * |
| * SPDX-License-Identifier: BSD-3-Clause |
| */ |
| |
| package org.eclipse.jgit.diff; |
| |
| import static org.eclipse.jgit.diff.DiffEntry.Side.NEW; |
| import static org.eclipse.jgit.diff.DiffEntry.Side.OLD; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.BitSet; |
| import java.util.List; |
| |
| import org.eclipse.jgit.diff.DiffEntry.ChangeType; |
| import org.eclipse.jgit.diff.SimilarityIndex.TableFullException; |
| import org.eclipse.jgit.errors.CancelledException; |
| import org.eclipse.jgit.internal.JGitText; |
| import org.eclipse.jgit.lib.FileMode; |
| import org.eclipse.jgit.lib.NullProgressMonitor; |
| import org.eclipse.jgit.lib.ProgressMonitor; |
| |
| class SimilarityRenameDetector { |
| /** |
| * Number of bits we need to express an index into src or dst list. |
| * <p> |
| * This must be 28, giving us a limit of 2^28 entries in either list, which |
| * is an insane limit of 536,870,912 file names being considered in a single |
| * rename pass. The other 8 bits are used to store the score, while staying |
| * under 127 so the long doesn't go negative. |
| */ |
| private static final int BITS_PER_INDEX = 28; |
| |
| private static final int INDEX_MASK = (1 << BITS_PER_INDEX) - 1; |
| |
| private static final int SCORE_SHIFT = 2 * BITS_PER_INDEX; |
| |
| private ContentSource.Pair reader; |
| |
| /** |
| * All sources to consider for copies or renames. |
| * <p> |
| * A source is typically a {@link ChangeType#DELETE} change, but could be |
| * another type when trying to perform copy detection concurrently with |
| * rename detection. |
| */ |
| private List<DiffEntry> srcs; |
| |
| /** |
| * All destinations to consider looking for a rename. |
| * <p> |
| * A destination is typically an {@link ChangeType#ADD}, as the name has |
| * just come into existence, and we want to discover where its initial |
| * content came from. |
| */ |
| private List<DiffEntry> dsts; |
| |
| /** |
| * Matrix of all examined file pairs, and their scores. |
| * <p> |
| * The upper 8 bits of each long stores the score, but the score is bounded |
| * to be in the range (0, 128] so that the highest bit is never set, and all |
| * entries are therefore positive. |
| * <p> |
| * List indexes to an element of {@link #srcs} and {@link #dsts} are encoded |
| * as the lower two groups of 28 bits, respectively, but the encoding is |
| * inverted, so that 0 is expressed as {@code (1 << 28) - 1}. This sorts |
| * lower list indices later in the matrix, giving precedence to files whose |
| * names sort earlier in the tree. |
| */ |
| private long[] matrix; |
| |
| /** Score a pair must exceed to be considered a rename. */ |
| private int renameScore = 60; |
| |
| /** Set if any {@link SimilarityIndex.TableFullException} occurs. */ |
| private boolean tableOverflow; |
| |
| private List<DiffEntry> out; |
| |
| SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs, |
| List<DiffEntry> dsts) { |
| this.reader = reader; |
| this.srcs = srcs; |
| this.dsts = dsts; |
| } |
| |
| void setRenameScore(int score) { |
| renameScore = score; |
| } |
| |
| void compute(ProgressMonitor pm) throws IOException, CancelledException { |
| if (pm == null) |
| pm = NullProgressMonitor.INSTANCE; |
| |
| pm.beginTask(JGitText.get().renamesFindingByContent, // |
| 2 * srcs.size() * dsts.size()); |
| |
| int mNext = buildMatrix(pm); |
| out = new ArrayList<>(Math.min(mNext, dsts.size())); |
| |
| // Match rename pairs on a first come, first serve basis until |
| // we have looked at everything that is above our minimum score. |
| // |
| for (--mNext; mNext >= 0; mNext--) { |
| if (pm.isCancelled()) { |
| // TODO(ms): use org.eclipse.jgit.api.errors.CanceledException |
| // in next major version |
| throw new CancelledException(JGitText.get().renameCancelled); |
| } |
| long ent = matrix[mNext]; |
| int sIdx = srcFile(ent); |
| int dIdx = dstFile(ent); |
| DiffEntry s = srcs.get(sIdx); |
| DiffEntry d = dsts.get(dIdx); |
| |
| if (d == null) { |
| pm.update(1); |
| continue; // was already matched earlier |
| } |
| |
| ChangeType type; |
| if (s.changeType == ChangeType.DELETE) { |
| // First use of this source file. Tag it as a rename so we |
| // later know it is already been used as a rename, other |
| // matches (if any) will claim themselves as copies instead. |
| // |
| s.changeType = ChangeType.RENAME; |
| type = ChangeType.RENAME; |
| } else { |
| type = ChangeType.COPY; |
| } |
| |
| out.add(DiffEntry.pair(type, s, d, score(ent))); |
| dsts.set(dIdx, null); // Claim the destination was matched. |
| pm.update(1); |
| } |
| |
| srcs = compactSrcList(srcs); |
| dsts = compactDstList(dsts); |
| pm.endTask(); |
| } |
| |
| List<DiffEntry> getMatches() { |
| return out; |
| } |
| |
| List<DiffEntry> getLeftOverSources() { |
| return srcs; |
| } |
| |
| List<DiffEntry> getLeftOverDestinations() { |
| return dsts; |
| } |
| |
| boolean isTableOverflow() { |
| return tableOverflow; |
| } |
| |
| private static List<DiffEntry> compactSrcList(List<DiffEntry> in) { |
| ArrayList<DiffEntry> r = new ArrayList<>(in.size()); |
| for (DiffEntry e : in) { |
| if (e.changeType == ChangeType.DELETE) |
| r.add(e); |
| } |
| return r; |
| } |
| |
| private static List<DiffEntry> compactDstList(List<DiffEntry> in) { |
| ArrayList<DiffEntry> r = new ArrayList<>(in.size()); |
| for (DiffEntry e : in) { |
| if (e != null) |
| r.add(e); |
| } |
| return r; |
| } |
| |
| private int buildMatrix(ProgressMonitor pm) |
| throws IOException, CancelledException { |
| // Allocate for the worst-case scenario where every pair has a |
| // score that we need to consider. We might not need that many. |
| // |
| matrix = new long[srcs.size() * dsts.size()]; |
| |
| long[] srcSizes = new long[srcs.size()]; |
| long[] dstSizes = new long[dsts.size()]; |
| BitSet dstTooLarge = null; |
| |
| // Consider each pair of files, if the score is above the minimum |
| // threshold we need record that scoring in the matrix so we can |
| // later find the best matches. |
| // |
| int mNext = 0; |
| SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) { |
| DiffEntry srcEnt = srcs.get(srcIdx); |
| if (!isFile(srcEnt.oldMode)) { |
| pm.update(dsts.size()); |
| continue; |
| } |
| |
| SimilarityIndex s = null; |
| |
| for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) { |
| if (pm.isCancelled()) { |
| // TODO(ms): use |
| // org.eclipse.jgit.api.errors.CanceledException in next |
| // major version |
| throw new CancelledException( |
| JGitText.get().renameCancelled); |
| } |
| |
| DiffEntry dstEnt = dsts.get(dstIdx); |
| |
| if (!isFile(dstEnt.newMode)) { |
| pm.update(1); |
| continue; |
| } |
| |
| if (!RenameDetector.sameType(srcEnt.oldMode, dstEnt.newMode)) { |
| pm.update(1); |
| continue; |
| } |
| |
| if (dstTooLarge != null && dstTooLarge.get(dstIdx)) { |
| pm.update(1); |
| continue; |
| } |
| |
| long srcSize = srcSizes[srcIdx]; |
| if (srcSize == 0) { |
| srcSize = size(OLD, srcEnt) + 1; |
| srcSizes[srcIdx] = srcSize; |
| } |
| |
| long dstSize = dstSizes[dstIdx]; |
| if (dstSize == 0) { |
| dstSize = size(NEW, dstEnt) + 1; |
| dstSizes[dstIdx] = dstSize; |
| } |
| |
| long max = Math.max(srcSize, dstSize); |
| long min = Math.min(srcSize, dstSize); |
| if (min * 100 / max < renameScore) { |
| // Cannot possibly match, as the file sizes are so different |
| pm.update(1); |
| continue; |
| } |
| |
| if (s == null) { |
| try { |
| s = hash(OLD, srcEnt); |
| } catch (TableFullException tableFull) { |
| tableOverflow = true; |
| continue SRC; |
| } |
| } |
| |
| SimilarityIndex d; |
| try { |
| d = hash(NEW, dstEnt); |
| } catch (TableFullException tableFull) { |
| if (dstTooLarge == null) |
| dstTooLarge = new BitSet(dsts.size()); |
| dstTooLarge.set(dstIdx); |
| tableOverflow = true; |
| pm.update(1); |
| continue; |
| } |
| |
| int contentScore = s.score(d, 10000); |
| |
| // nameScore returns a value between 0 and 100, but we want it |
| // to be in the same range as the content score. This allows it |
| // to be dropped into the pretty formula for the final score. |
| int nameScore = nameScore(srcEnt.oldPath, dstEnt.newPath) * 100; |
| |
| int score = (contentScore * 99 + nameScore * 1) / 10000; |
| |
| if (score < renameScore) { |
| pm.update(1); |
| continue; |
| } |
| |
| matrix[mNext++] = encode(score, srcIdx, dstIdx); |
| pm.update(1); |
| } |
| } |
| |
| // Sort everything in the range we populated, which might be the |
| // entire matrix, or just a smaller slice if we had some bad low |
| // scoring pairs. |
| // |
| Arrays.sort(matrix, 0, mNext); |
| return mNext; |
| } |
| |
| static int nameScore(String a, String b) { |
| int aDirLen = a.lastIndexOf('/') + 1; |
| int bDirLen = b.lastIndexOf('/') + 1; |
| |
| int dirMin = Math.min(aDirLen, bDirLen); |
| int dirMax = Math.max(aDirLen, bDirLen); |
| |
| final int dirScoreLtr; |
| final int dirScoreRtl; |
| |
| if (dirMax == 0) { |
| dirScoreLtr = 100; |
| dirScoreRtl = 100; |
| } else { |
| int dirSim = 0; |
| for (; dirSim < dirMin; dirSim++) { |
| if (a.charAt(dirSim) != b.charAt(dirSim)) |
| break; |
| } |
| dirScoreLtr = (dirSim * 100) / dirMax; |
| |
| if (dirScoreLtr == 100) { |
| dirScoreRtl = 100; |
| } else { |
| for (dirSim = 0; dirSim < dirMin; dirSim++) { |
| if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1 |
| - dirSim)) |
| break; |
| } |
| dirScoreRtl = (dirSim * 100) / dirMax; |
| } |
| } |
| |
| int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen); |
| int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen); |
| |
| int fileSim = 0; |
| for (; fileSim < fileMin; fileSim++) { |
| if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1 |
| - fileSim)) |
| break; |
| } |
| int fileScore = (fileSim * 100) / fileMax; |
| |
| return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100; |
| } |
| |
| private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent) |
| throws IOException, TableFullException { |
| SimilarityIndex r = new SimilarityIndex(); |
| r.hash(reader.open(side, ent)); |
| r.sort(); |
| return r; |
| } |
| |
| private long size(DiffEntry.Side side, DiffEntry ent) throws IOException { |
| return reader.size(side, ent); |
| } |
| |
| private static int score(long value) { |
| return (int) (value >>> SCORE_SHIFT); |
| } |
| |
| static int srcFile(long value) { |
| return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK); |
| } |
| |
| static int dstFile(long value) { |
| return decodeFile(((int) value) & INDEX_MASK); |
| } |
| |
| static long encode(int score, int srcIdx, int dstIdx) { |
| return (((long) score) << SCORE_SHIFT) // |
| | (encodeFile(srcIdx) << BITS_PER_INDEX) // |
| | encodeFile(dstIdx); |
| } |
| |
| private static long encodeFile(int idx) { |
| // We invert the index so that the first file in the list sorts |
| // later in the table. This permits us to break ties favoring |
| // earlier names over later ones. |
| // |
| return INDEX_MASK - idx; |
| } |
| |
| private static int decodeFile(int v) { |
| return INDEX_MASK - v; |
| } |
| |
| private static boolean isFile(FileMode mode) { |
| return (mode.getBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE; |
| } |
| } |