Merge branch 'stable-3.4' * stable-3.4: Register ProjectDeletion events for gson serialization Change-Id: I1a9f991eae5b7ac1c3189306c3cda6435ed79f5f
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationQueue.java b/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationQueue.java index 4abb295..5310c14 100644 --- a/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationQueue.java +++ b/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationQueue.java
@@ -229,7 +229,7 @@ @Override public void onDone() { if (Prune.TRUE.equals(prune)) { - pruneNoLongerPending(taskNamesByReplicateRefUpdate.values()); + pruneNoLongerPending(new HashSet<>(taskNamesByReplicateRefUpdate.values())); } replaying.set(false); } @@ -242,7 +242,7 @@ } } - private void pruneNoLongerPending(Collection<String> prunableTaskNames) { + private void pruneNoLongerPending(Set<String> prunableTaskNames) { // Queue tasks have wrappers around them so workQueue.getTasks() does not return the PushOnes. // We also cannot access them by taskId since PushOnes don't have a taskId, they do have // an Id, but it is not the id assigned to the task in the queues. The tasks in the queue
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationState.java b/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationState.java index 871ed52..fa65803 100644 --- a/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationState.java +++ b/src/main/java/com/googlesource/gerrit/plugins/replication/ReplicationState.java
@@ -140,12 +140,12 @@ } private RefReplicationStatus getRefStatus(String project, String ref) { - if (!statusByProjectRef.contains(project, ref)) { - RefReplicationStatus refStatus = new RefReplicationStatus(project, ref); + RefReplicationStatus refStatus = statusByProjectRef.get(project, ref); + if (refStatus == null) { + refStatus = new RefReplicationStatus(project, ref); statusByProjectRef.put(project, ref, refStatus); - return refStatus; } - return statusByProjectRef.get(project, ref); + return refStatus; } public void waitForReplication() throws InterruptedException {
diff --git a/src/main/resources/Documentation/about.md b/src/main/resources/Documentation/about.md index ded9d84..d216366 100644 --- a/src/main/resources/Documentation/about.md +++ b/src/main/resources/Documentation/about.md
@@ -16,11 +16,9 @@ local path as replication target. This makes e.g. sense if a network share is mounted to which the repositories should be replicated. -In multi-primary scenario, any replication work which is already -in-flight or completed by the other nodes is not performed to -avoid extra work. This is because, the storage for replication -events is shared between multiple primaries.(The storage location -is specified in the config using: `replication.eventsDirectory`). +It is possible to +[configure](config.html#configuring-cluster-replication) the plugin so +that multiple primaries share the replication work approximately evenly. Replication of account data (NoteDb) ------------------------------------
diff --git a/src/main/resources/Documentation/config.md b/src/main/resources/Documentation/config.md index f4ea9d6..7fe9e15 100644 --- a/src/main/resources/Documentation/config.md +++ b/src/main/resources/Documentation/config.md
@@ -46,6 +46,59 @@ To manually trigger replication at runtime, see SSH command [start](cmd-start.md). +<a name="configuring-cluster-replication"></a> +Configuring Cluster Replication +------------------------------- + +The replication plugin is designed to allow multiple primaries in a +cluster to efficiently cooperate via the replication event +persistence subsystem. To enable this cooperation, the directory +pointed to by the replication.eventsDirectory config key must reside on +a shared filesystem, such as NFS. By default, simply pointing multiple +primaries to the same eventsDirectory will enable some cooperation by +preventing the same replication push from being duplicated by more +than one primary. + +To further improve cooperation across the cluster, the +replication.distributionInterval config value can be set. With +distribution enabled, the replication queues for all the nodes sharing +the same eventsDirectory will reflect approximately the same outstanding +replication work (i.e. tasks waiting in the queue). Replication pushes +which are running will continue to only be visible in the queue of the +node on which the push is actually happening. This feature helps +administrators get a cluster-wide view of outstanding replication +tasks, while allowing replication tasks triggered by one primary to be +fulfilled by another node which is less busy. + +This enhanced replication work distribution allows the amount of +replication work a cluster can handle to scale more evenly and linearly +with the number of primaries in the cluster. Adding more nodes to a +cluster without distribution enabled will generally not allow the thread +count per remote to be reduced without impacting service levels to those +remotes. This is because without distribution, all events triggered by a +node will only be fulfilled by the node which triggered the event, even +if all the other nodes in the cluster are idle. 
This behavior implies +that each node should be configured in a way that allows it alone to +provide the level of service which each remote requires. However, with +distribution enabled, it becomes possible to reduce the number of +replication threads configured per remote proportionally to the number +of nodes in the cluster, while maintaining the same approximate service +level as before adding new nodes. + +Reducing the threads per remote without service impacts is possible with +distribution, because when configuring a node it can be expected that +other nodes will pick up some of the work it triggers. Then the node no +longer needs to be configured as if it were the only node in the +cluster. For example, if a remote requires 6 threads with one node to +achieve acceptable service, it should only take 2 threads on 3 +equivalently powered nodes to provide the same service level with +distribution enabled. Scaling down such thread requirements per remote +results in a reduced memory footprint per remote on each node in the +cluster. This enables the nodes in the cluster to now scale to handle +more remotes with approximately the same service level as without +distribution. The number of extra supported remotes then also scales +approximately linearly with the extra nodes in a cluster. + File `replication.config` -------------------------
diff --git a/src/test/java/com/googlesource/gerrit/plugins/replication/ReplicationDistributorIT.java b/src/test/java/com/googlesource/gerrit/plugins/replication/ReplicationDistributorIT.java index 5ade68d..3d0dfc1 100644 --- a/src/test/java/com/googlesource/gerrit/plugins/replication/ReplicationDistributorIT.java +++ b/src/test/java/com/googlesource/gerrit/plugins/replication/ReplicationDistributorIT.java
@@ -19,6 +19,7 @@ import com.google.gerrit.acceptance.TestPlugin; import com.google.gerrit.acceptance.UseLocalDisk; import com.google.gerrit.acceptance.WaitUtil; +import com.google.gerrit.entities.BranchNameKey; import com.google.gerrit.entities.Project; import com.google.gerrit.server.git.WorkQueue; import java.time.Duration; @@ -89,8 +90,9 @@ reloadConfig(); String newBranch = "refs/heads/foo_branch"; - createBranch(project, "refs/heads/master", newBranch); + createBranch(BranchNameKey.create(project, newBranch)); + assertThat(listWaitingReplicationTasks(newBranch)).hasSize(1); deleteWaitingReplicationTasks(newBranch); // This simulates the work being started by other node assertThat(waitForProjectTaskCount(0, Duration.ofSeconds(TEST_DISTRIBUTION_CYCLE_SECONDS)))