Git file diff: fix when JGit returns multiple entries for same file path

In some cases, JGit returns two {ADDED, DELETED} diff entries for the
same file path instead of a single {MODIFIED} entry. This happens, e.g.
when a file is modified between patchsets with a change in file mode
(for example converting a symlink to a regular file). In the old diff
cache world, We used to handle this case outside of the cache (see [1])
by combining the two entries into a single entry with change type =
REWRITTEN.

The implementation in the new diff cache did not handle this case: in
GitFileDiffCacheImpl, we used to collect the diff entries into a map,
keyed by file path, hence resulting in duplicate key exception in the
toMap collector.

In this change, we fix this issue
  * We keep the git_modified_files cache (the one that computes the
  modified files between two git trees) as is, allowing it to return two
  {ADDED, DELETED} ModifiedFile entries for the same file.

  * We update the modified_files cache (which wraps the
  git_modified_files cache) so that it merges ADDED/DELETED entries for
  the same file into a single REWRITE entry.

  * We update the git_file_diff cache (the one that computes the
  detailed git diff for a single file) so that it merges ADDED/DELETED
  entries for the same file into a single REWRITE entry. The logic
  is added in this cache and not in file_diff cache (the one that wraps
  the git_file_diff cache and computes the edits due to rebase) since
  the git_file_diff is requested with one key per file, so we expect it
  to return a single result value for that key.

We unignore a test that covers this case (for new diff cache) in
RevisionDiffIT to assert correctness.

We increase the version of the modified_files cache since its logic was
changed. The git_file_diff cache does not require increasing the cache
version since this issue resulted in an exception in this cache and the
entries are not yet cached.

[1] https://gerrit.googlesource.com/gerrit/+/refs/heads/master/java/com/google/gerrit/server/change/FileInfoJsonOldImpl.java#112

Change-Id: If2cab046051d96a2ce669ba573b36e44e3ec64e7
diff --git a/java/com/google/gerrit/server/patch/diff/ModifiedFilesCacheImpl.java b/java/com/google/gerrit/server/patch/diff/ModifiedFilesCacheImpl.java
index b779bf7..e4fd728 100644
--- a/java/com/google/gerrit/server/patch/diff/ModifiedFilesCacheImpl.java
+++ b/java/com/google/gerrit/server/patch/diff/ModifiedFilesCacheImpl.java
@@ -18,11 +18,14 @@
 
 import com.google.common.cache.CacheLoader;
 import com.google.common.cache.LoadingCache;
+import com.google.common.collect.ArrayListMultimap;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.ListMultimap;
 import com.google.common.collect.Sets;
 import com.google.common.collect.Streams;
 import com.google.common.flogger.FluentLogger;
+import com.google.gerrit.entities.Patch.ChangeType;
 import com.google.gerrit.server.cache.CacheModule;
 import com.google.gerrit.server.git.GitRepositoryManager;
 import com.google.gerrit.server.patch.DiffNotAvailableException;
@@ -37,6 +40,7 @@
 import com.google.inject.TypeLiteral;
 import com.google.inject.name.Named;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Stream;
@@ -82,7 +86,7 @@
             .valueSerializer(GitModifiedFilesCacheImpl.ValueSerializer.INSTANCE)
             .maximumWeight(10 << 20)
             .weigher(ModifiedFilesWeigher.class)
-            .version(1)
+            .version(2)
             .loader(ModifiedFilesLoader.class);
       }
     };
@@ -139,7 +143,7 @@
               .bTree(bTree)
               .renameScore(key.renameScore())
               .build();
-      List<ModifiedFile> modifiedFiles = gitCache.get(gitKey);
+      List<ModifiedFile> modifiedFiles = mergeRewrittenEntries(gitCache.get(gitKey));
       if (key.aCommit().equals(ObjectId.zeroId())) {
         return ImmutableList.copyOf(modifiedFiles);
       }
@@ -202,5 +206,61 @@
       // value as the set of file paths shouldn't contain it.
       return touchedFilePaths.contains(oldFilePath) || touchedFilePaths.contains(newFilePath);
     }
+
+    /**
+     * Return the {@code modifiedFiles} input list while merging {@link ChangeType#ADDED} and {@link
+     * ChangeType#DELETED} entries for the same file into a single {@link ChangeType#REWRITE} entry.
+     *
+     * <p>Background: In some cases, JGit returns two diff entries (ADDED + DELETED) for the same
+     * file path. This happens e.g. when a file's mode is changed between patchsets, for example
+     * converting a symlink file to a regular file. We identify this case and return a single
+     * modified file with changeType = {@link ChangeType#REWRITE}.
+     */
+    private static List<ModifiedFile> mergeRewrittenEntries(List<ModifiedFile> modifiedFiles) {
+      List<ModifiedFile> result = new ArrayList<>();
+
+      // Handle ADDED and DELETED entries separately.
+      ListMultimap<String, ModifiedFile> byPath = ArrayListMultimap.create();
+      modifiedFiles.stream()
+          .filter(ModifiedFilesLoader::isAddedOrDeleted)
+          .forEach(
+              f -> {
+                if (f.oldPath().isPresent()) {
+                  byPath.get(f.oldPath().get()).add(f);
+                }
+                if (f.newPath().isPresent()) {
+                  byPath.get(f.newPath().get()).add(f);
+                }
+              });
+      for (String path : byPath.keySet()) {
+        List<ModifiedFile> entries = byPath.get(path);
+        if (entries.size() == 1) {
+          result.add(entries.get(0));
+        } else if (entries.size() == 2) {
+          result.add(getAddedEntry(entries).toBuilder().changeType(ChangeType.REWRITE).build());
+        } else {
+          // JGit error. Not expected to happen.
+          logger.atWarning().log(
+              "Found %d ADDED and DELETED entries for the same file path: %s."
+                  + " Adding the first entry only to the result.",
+              entries.size(), entries);
+          result.add(entries.get(0));
+        }
+      }
+
+      // Add the remaining non ADDED/DELETED entries to the result
+      modifiedFiles.stream().filter(f -> !isAddedOrDeleted(f)).forEach(result::add);
+      return result;
+    }
+
+    private static boolean isAddedOrDeleted(ModifiedFile f) {
+      return f.changeType() == ChangeType.ADDED || f.changeType() == ChangeType.DELETED;
+    }
+
+    private static ModifiedFile getAddedEntry(List<ModifiedFile> modifiedFiles) {
+      return modifiedFiles.get(0).changeType() == ChangeType.ADDED
+          ? modifiedFiles.get(0)
+          : modifiedFiles.get(1);
+    }
   }
 }
diff --git a/java/com/google/gerrit/server/patch/gitdiff/ModifiedFile.java b/java/com/google/gerrit/server/patch/gitdiff/ModifiedFile.java
index 9512094..f4e7ca3 100644
--- a/java/com/google/gerrit/server/patch/gitdiff/ModifiedFile.java
+++ b/java/com/google/gerrit/server/patch/gitdiff/ModifiedFile.java
@@ -51,6 +51,8 @@
     return new AutoValue_ModifiedFile.Builder();
   }
 
+  public abstract Builder toBuilder();
+
   /** Computes this object's weight, which is its size in bytes. */
   public int weight() {
     int weight = 1; // the changeType field
diff --git a/java/com/google/gerrit/server/patch/gitfilediff/GitFileDiff.java b/java/com/google/gerrit/server/patch/gitfilediff/GitFileDiff.java
index 2f2d29b..a502a46 100644
--- a/java/com/google/gerrit/server/patch/gitfilediff/GitFileDiff.java
+++ b/java/com/google/gerrit/server/patch/gitfilediff/GitFileDiff.java
@@ -187,6 +187,10 @@
     return result;
   }
 
+  public String getDefaultPath() {
+    return oldPath().isPresent() ? oldPath().get() : newPath().get();
+  }
+
   public static Builder builder() {
     return new AutoValue_GitFileDiff.Builder();
   }
diff --git a/java/com/google/gerrit/server/patch/gitfilediff/GitFileDiffCacheImpl.java b/java/com/google/gerrit/server/patch/gitfilediff/GitFileDiffCacheImpl.java
index 2ce6925..77b8938 100644
--- a/java/com/google/gerrit/server/patch/gitfilediff/GitFileDiffCacheImpl.java
+++ b/java/com/google/gerrit/server/patch/gitfilediff/GitFileDiffCacheImpl.java
@@ -24,7 +24,11 @@
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Iterables;
+import com.google.common.collect.ListMultimap;
+import com.google.common.collect.MultimapBuilder;
+import com.google.common.collect.Multimaps;
 import com.google.common.collect.Streams;
+import com.google.gerrit.entities.Patch;
 import com.google.gerrit.entities.Project;
 import com.google.gerrit.extensions.client.DiffPreferencesInfo.Whitespace;
 import com.google.gerrit.server.cache.CacheModule;
@@ -41,6 +45,7 @@
 import com.google.inject.Singleton;
 import com.google.inject.name.Named;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.Map;
@@ -105,6 +110,9 @@
 
   private final LoadingCache<GitFileDiffCacheKey, GitFileDiff> cache;
 
+  private static final ImmutableSet<Patch.ChangeType> ADDED_AND_DELETED =
+      ImmutableSet.of(Patch.ChangeType.ADDED, Patch.ChangeType.DELETED);
+
   @Inject
   public GitFileDiffCacheImpl(
       @Named(GIT_DIFF) LoadingCache<GitFileDiffCacheKey, GitFileDiff> cache) {
@@ -163,7 +171,7 @@
     }
 
     @Override
-    public GitFileDiff load(GitFileDiffCacheKey key) throws IOException {
+    public GitFileDiff load(GitFileDiffCacheKey key) throws IOException, DiffNotAvailableException {
       try (TraceTimer timer =
           TraceContext.newTimer(
               "Loading a single key from git file diff cache",
@@ -177,7 +185,8 @@
 
     @Override
     public Map<GitFileDiffCacheKey, GitFileDiff> loadAll(
-        Iterable<? extends GitFileDiffCacheKey> keys) throws IOException {
+        Iterable<? extends GitFileDiffCacheKey> keys)
+        throws IOException, DiffNotAvailableException {
       try (TraceTimer timer =
           TraceContext.newTimer("Loading multiple keys from git file diff cache")) {
         ImmutableMap.Builder<GitFileDiffCacheKey, GitFileDiff> result =
@@ -215,13 +224,14 @@
      */
     private Map<GitFileDiffCacheKey, GitFileDiff> loadAllImpl(
         Repository repo, ObjectReader reader, DiffOptions options, List<GitFileDiffCacheKey> keys)
-        throws IOException {
+        throws IOException, DiffNotAvailableException {
       ImmutableMap.Builder<GitFileDiffCacheKey, GitFileDiff> result =
           ImmutableMap.builderWithExpectedSize(keys.size());
       Map<GitFileDiffCacheKey, String> filePaths =
           keys.stream().collect(Collectors.toMap(identity(), GitFileDiffCacheKey::newFilePath));
       DiffFormatter formatter = createDiffFormatter(options, repo, reader);
-      Map<String, DiffEntry> diffEntries = loadDiffEntries(formatter, options, filePaths.values());
+      ListMultimap<String, DiffEntry> diffEntries =
+          loadDiffEntries(formatter, options, filePaths.values());
       for (GitFileDiffCacheKey key : filePaths.keySet()) {
         String newFilePath = filePaths.get(key);
         if (!diffEntries.containsKey(newFilePath)) {
@@ -233,14 +243,25 @@
                   newFilePath));
           continue;
         }
-        DiffEntry diffEntry = diffEntries.get(newFilePath);
-        GitFileDiff gitFileDiff = createGitFileDiff(diffEntry, formatter, key);
-        result.put(key, gitFileDiff);
+        List<DiffEntry> entries = diffEntries.get(newFilePath);
+        if (entries.size() == 1) {
+          result.put(key, createGitFileDiff(entries.get(0), formatter, key));
+        } else {
+          // Handle when JGit returns two {Added, Deleted} entries for the same file. This happens,
+          // for example, when a file's mode is changed between patchsets (e.g. converting a
+          // symlink to a regular file). We combine both diff entries into a single entry with
+          // {changeType = Rewrite}.
+          List<GitFileDiff> gitDiffs = new ArrayList<>();
+          for (DiffEntry entry : diffEntries.get(newFilePath)) {
+            gitDiffs.add(createGitFileDiff(entry, formatter, key));
+          }
+          result.put(key, createRewriteEntry(gitDiffs));
+        }
       }
       return result.build();
     }
 
-    private static Map<String, DiffEntry> loadDiffEntries(
+    private static ListMultimap<String, DiffEntry> loadDiffEntries(
         DiffFormatter diffFormatter, DiffOptions diffOptions, Collection<String> filePaths)
         throws IOException {
       Set<String> filePathsSet = ImmutableSet.copyOf(filePaths);
@@ -251,7 +272,11 @@
 
       return diffEntries.stream()
           .filter(d -> filePathsSet.contains(pathExtractor.apply(d)))
-          .collect(Collectors.toMap(d -> pathExtractor.apply(d), identity()));
+          .collect(
+              Multimaps.toMultimap(
+                  d -> pathExtractor.apply(d),
+                  identity(),
+                  MultimapBuilder.treeKeys().arrayListValues()::build));
     }
 
     private static DiffFormatter createDiffFormatter(
@@ -334,6 +359,39 @@
     }
   }
 
+  /**
+   * Create a single {@link GitFileDiff} with {@link Patch.ChangeType} equals {@link
+   * Patch.ChangeType#REWRITE}, assuming the input list contains two entries with types {@link
+   * Patch.ChangeType#ADDED} and {@link Patch.ChangeType#DELETED}.
+   *
+   * @param gitDiffs input list of exactly two {@link GitFileDiff} for same file path.
+   * @return a single {@link GitFileDiff} with change type equals {@link Patch.ChangeType#REWRITE}.
+   * @throws DiffNotAvailableException if input list contains git diffs with change types other than
+   *     {ADDED, DELETED}. This is a JGit error.
+   */
+  private static GitFileDiff createRewriteEntry(List<GitFileDiff> gitDiffs)
+      throws DiffNotAvailableException {
+    if (gitDiffs.size() != 2) {
+      throw new DiffNotAvailableException(
+          String.format(
+              "JGit error: found %d dff entries for same file path %s",
+              gitDiffs.size(), gitDiffs.get(0).getDefaultPath()));
+    }
+    if (!ImmutableSet.of(gitDiffs.get(0).changeType(), gitDiffs.get(1).changeType())
+        .equals(ADDED_AND_DELETED)) {
+      // This is an illegal state. JGit is not supposed to return this, so we throw an exception.
+      throw new DiffNotAvailableException(
+          String.format(
+              "JGit error: unexpected change types %s and %s for same file path %s",
+              gitDiffs.get(0).changeType(),
+              gitDiffs.get(1).changeType(),
+              gitDiffs.get(0).getDefaultPath()));
+    }
+    GitFileDiff addedEntry =
+        gitDiffs.get(0).changeType() == Patch.ChangeType.ADDED ? gitDiffs.get(0) : gitDiffs.get(1);
+    return addedEntry.toBuilder().changeType(Patch.ChangeType.REWRITE).build();
+  }
+
   /** An entity representing the options affecting the diff computation. */
   @AutoValue
   abstract static class DiffOptions {
diff --git a/javatests/com/google/gerrit/acceptance/api/revision/RevisionDiffIT.java b/javatests/com/google/gerrit/acceptance/api/revision/RevisionDiffIT.java
index f1a8f18..083cbde 100644
--- a/javatests/com/google/gerrit/acceptance/api/revision/RevisionDiffIT.java
+++ b/javatests/com/google/gerrit/acceptance/api/revision/RevisionDiffIT.java
@@ -2837,11 +2837,7 @@
   }
 
   @Test
-  public void symlinkConvertedToRegularFileIsIdentifiedAsAdded() throws Exception {
-    // TODO(ghareeb): fix this test for the new diff cache implementation
-    assume().that(useNewDiffCacheListFiles).isFalse();
-    assume().that(useNewDiffCacheGetDiff).isFalse();
-
+  public void symlinkConvertedToRegularFileIsIdentifiedAsRewritten() throws Exception {
     String target = "file.txt";
     String symlink = "link.lnk";
 
@@ -2869,23 +2865,39 @@
         gApi.changes().id(result.getChangeId()).current().files(initialRev);
 
     assertThat(changedFiles.keySet()).containsExactly("/COMMIT_MSG", symlink);
-    assertThat(changedFiles.get(symlink).status).isEqualTo('W'); // Rewrite
+
+    // Both old and new diff caches agree that the state is rewritten
+    assertThat(changedFiles.get(symlink).status).isEqualTo('W'); // Rewritten
 
     DiffInfo diffInfo =
         gApi.changes().id(result.getChangeId()).current().file(symlink).diff(initialRev);
 
-    // The diff logic identifies two entries for the file:
-    // 1. One entry as 'DELETED' for the symlink.
-    // 2. Another entry as 'ADDED' for the new regular file.
-    // Since the diff logic returns a single entry, we prioritize returning the 'ADDED' entry in
-    // this case so that the client is able to see the new content that was added to the file.
-    assertThat(diffInfo.changeType).isEqualTo(ChangeType.ADDED);
-    assertThat(diffInfo.content).hasSize(1);
-    assertThat(diffInfo)
-        .content()
-        .element(0)
-        .linesOfB()
-        .containsExactly("Content of the new file named 'symlink'");
+    // TODO(ghareeb): Remove the else branch when the new diff cache is rolled out as default.
+    if (useNewDiffCacheGetDiff) {
+      // File diff in New diff cache: change type is correctly identified as REWRITTEN
+      assertThat(diffInfo.changeType).isEqualTo(ChangeType.REWRITE);
+      assertThat(diffInfo.content).hasSize(2);
+      assertThat(diffInfo)
+          .content()
+          .element(0)
+          .linesOfB()
+          .containsExactly("Content of the new file named 'symlink'");
+      assertThat(diffInfo).content().element(1).linesOfA().containsExactly("file.txt");
+    } else {
+      // File diff in old diff cache: The diff logic identifies two entries for the file:
+      // 1. One entry as 'DELETED' for the symlink.
+      // 2. Another entry as 'ADDED' for the new regular file.
+      // Since the diff logic returns a single entry, the implementation prioritizes  the 'ADDED'
+      // entry in this case so that the user is able to see the new content that was added to the
+      // file.
+      assertThat(diffInfo.changeType).isEqualTo(ChangeType.ADDED);
+      assertThat(diffInfo.content).hasSize(1);
+      assertThat(diffInfo)
+          .content()
+          .element(0)
+          .linesOfB()
+          .containsExactly("Content of the new file named 'symlink'");
+    }
   }
 
   @Test