Merge branch 'stable-3.4' * stable-3.4: Register ProjectDeletion events for gson serialization Change-Id: I1a9f991eae5b7ac1c3189306c3cda6435ed79f5f

diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationQueue.java b/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationQueue.java
index 4abb295..5310c14 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationQueue.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationQueue.java

@@ -229,7 +229,7 @@
             @Override
             public void onDone() {
               if (Prune.TRUE.equals(prune)) {
-                pruneNoLongerPending(taskNamesByReplicateRefUpdate.values());
+                pruneNoLongerPending(new HashSet<>(taskNamesByReplicateRefUpdate.values()));
               }
               replaying.set(false);
             }
@@ -242,7 +242,7 @@
     }
   }
 
-  private void pruneNoLongerPending(Collection<String> prunableTaskNames) {
+  private void pruneNoLongerPending(Set<String> prunableTaskNames) {
     // Queue tasks have wrappers around them so workQueue.getTasks() does not return the PushOnes.
     // We also cannot access them by taskId since PushOnes don't have a taskId, they do have
     // an Id, but it is not the id assigned to the task in the queues. The tasks in the queue

diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationState.java b/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationState.java
index 871ed52..fa65803 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationState.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationState.java

@@ -140,12 +140,12 @@
   }
 
   private RefReplicationStatus getRefStatus(String project, String ref) {
-    if (!statusByProjectRef.contains(project, ref)) {
-      RefReplicationStatus refStatus = new RefReplicationStatus(project, ref);
+    RefReplicationStatus refStatus = statusByProjectRef.get(project, ref);
+    if (refStatus == null) {
+      refStatus = new RefReplicationStatus(project, ref);
       statusByProjectRef.put(project, ref, refStatus);
-      return refStatus;
     }
-    return statusByProjectRef.get(project, ref);
+    return refStatus;
   }
 
   public void waitForReplication() throws InterruptedException {

diff --git a/src/main/resources/Documentation/about.md b/src/main/resources/Documentation/about.md
index ded9d84..d216366 100644
--- a/src/main/resources/Documentation/about.md
+++ b/src/main/resources/Documentation/about.md

@@ -16,11 +16,9 @@
 local path as replication target. This makes e.g. sense if a network
 share is mounted to which the repositories should be replicated.
 
-In multi-primary scenario, any replication work which is already
-in-flight or completed by the other nodes is not performed to
-avoid extra work. This is because, the storage for replication
-events is shared between multiple primaries.(The storage location
-is specified in the config using: `replication.eventsDirectory`).
+It is possible to
+[configure](config.html#configuring-cluster-replication) the plugin so
+that multiple primaries share the replication work approximately evenly.
 
 Replication of account data (NoteDb)
 ------------------------------------

diff --git a/src/main/resources/Documentation/config.md b/src/main/resources/Documentation/config.md
index f4ea9d6..7fe9e15 100644
--- a/src/main/resources/Documentation/config.md
+++ b/src/main/resources/Documentation/config.md

@@ -46,6 +46,59 @@
 To manually trigger replication at runtime, see
 SSH command [start](cmd-start.md).
 
+<a name="configuring-cluster-replication"></a>
+Configuring Cluster Replication
+-------------------------------
+
+The replication plugin is designed to allow multiple primaries in a
+cluster to cooperate efficiently via the replication event
+persistence subsystem. To enable this cooperation, the directory
+pointed to by the replication.eventsDirectory config key must reside on
+a shared filesystem, such as NFS. By default, simply pointing multiple
+primaries to the same eventsDirectory will enable some cooperation by
+preventing the same replication push from being duplicated by more
+than one primary.
+
+To further improve cooperation across the cluster, the
+replication.distributionInterval config value can be set. With
+distribution enabled, the replication queues for all the nodes sharing
+the same eventsDirectory will reflect approximately the same outstanding
+replication work (i.e. tasks waiting in the queue). Replication pushes
+which are running will continue to only be visible in the queue of the
+node on which the push is actually happening. This feature helps
+administrators get a cluster-wide view of outstanding replication
+tasks, while allowing replication tasks triggered by one primary to be
+fulfilled by another node which is less busy.
+
+This enhanced replication work distribution allows the amount of
+replication work a cluster can handle to scale more evenly and linearly
+with the number of primaries in the cluster. Adding more nodes to a
+cluster without distribution enabled will generally not allow the thread
+count per remote to be reduced without impacting service levels to those
+remotes. This is because without distribution, all events triggered by a
+node will only be fulfilled by the node which triggered the event, even
+if all the other nodes in the cluster are idle. This behavior implies
+that each node should be configured in a way that allows it alone to
+provide the level of service which each remote requires. However, with
+distribution enabled, it becomes possible to reduce the number of
+replication threads configured per remote proportionally to the number
+of nodes in the cluster, while maintaining the same approximate service
+level as before adding new nodes.
+
+Reducing the threads per remote without service impact is possible with
+distribution, because when configuring a node it can be expected that
+other nodes will pick up some of the work it triggers. Then the node no
+longer needs to be configured as if it were the only node in the
+cluster. For example, if a remote requires 6 threads with one node to
+achieve acceptable service, it should only take 2 threads on 3
+equivalently powered nodes to provide the same service level with
+distribution enabled. Scaling down such thread requirements per remote
+results in a reduced memory footprint per remote on each node in the
+cluster. This enables the nodes in the cluster to now scale to handle
+more remotes at approximately the same service level than without
+distribution. The number of extra supported remotes then also scales
+approximately linearly with the extra nodes in a cluster.
+
 File `replication.config`
 -------------------------
 

diff --git a/src/test/java/com/googlesource/gerrit/plugins/replication/ReplicationDistributorIT.java b/src/test/java/com/googlesource/gerrit/plugins/replication/ReplicationDistributorIT.java
index 5ade68d..3d0dfc1 100644
--- a/src/test/java/com/googlesource/gerrit/plugins/replication/ReplicationDistributorIT.java
+++ b/src/test/java/com/googlesource/gerrit/plugins/replication/ReplicationDistributorIT.java

@@ -19,6 +19,7 @@
 import com.google.gerrit.acceptance.TestPlugin;
 import com.google.gerrit.acceptance.UseLocalDisk;
 import com.google.gerrit.acceptance.WaitUtil;
+import com.google.gerrit.entities.BranchNameKey;
 import com.google.gerrit.entities.Project;
 import com.google.gerrit.server.git.WorkQueue;
 import java.time.Duration;
@@ -89,8 +90,9 @@
     reloadConfig();
 
     String newBranch = "refs/heads/foo_branch";
-    createBranch(project, "refs/heads/master", newBranch);
+    createBranch(BranchNameKey.create(project, newBranch));
 
+    assertThat(listWaitingReplicationTasks(newBranch)).hasSize(1);
     deleteWaitingReplicationTasks(newBranch); // This simulates the work being started by other node
 
     assertThat(waitForProjectTaskCount(0, Duration.ofSeconds(TEST_DISTRIBUTION_CYCLE_SECONDS)))