Set max retries to avoid queue congestion
When servers have *a lot* of remote slaves, some of them
unstable and potentially offline, a maximum retry policy is
needed to prevent push events from staying in the replication queue
and being rescheduled forever.
Keep the configuration backward compatible by defaulting maxRetries
to zero, which disables the limit (pushes are retried indefinitely).
Change-Id: I060cc7bc3a4d1089b0815db02d2e1430f83a2015
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java b/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java
index 06cbe33..ed361a9 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java
@@ -383,8 +383,9 @@
case TRANSPORT_ERROR:
case REPOSITORY_MISSING:
default:
- pushOp.setToRetry();
- pool.schedule(pushOp, config.getRetryDelay(), TimeUnit.MINUTES);
+ if (pushOp.setToRetry()) {
+ pool.schedule(pushOp, config.getRetryDelay(), TimeUnit.MINUTES);
+ }
break;
}
}
@@ -549,6 +550,10 @@
return config.getRemoteConfig().getName();
}
+ public int getMaxRetries() {
+ return config.getMaxRetries();
+ }
+
private static boolean matches(URIish uri, String urlMatch) {
if (urlMatch == null || urlMatch.equals("") || urlMatch.equals("*")) {
return true;
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java b/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java
index 0d7d3ce..f79f616 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java
@@ -35,6 +35,7 @@
private final ImmutableList<String> projects;
private final ImmutableList<String> authGroupNames;
private final RemoteConfig remoteConfig;
+ private final int maxRetries;
DestinationConfiguration(RemoteConfig remoteConfig, Config cfg) {
this.remoteConfig = remoteConfig;
@@ -62,6 +63,9 @@
cfg.getBoolean("remote", name, "replicateHiddenProjects", false);
remoteNameStyle = MoreObjects.firstNonNull(
cfg.getString("remote", name, "remoteNameStyle"), "slash");
+ maxRetries =
+ getInt(
+ remoteConfig, cfg, "replicationMaxRetries", cfg.getInt("replication", "maxRetries", 0));
}
public int getDelay() {
@@ -120,6 +124,10 @@
return remoteConfig;
}
+ public int getMaxRetries() {
+ return maxRetries;
+ }
+
private static int getInt(
RemoteConfig rc, Config cfg, String name, int defValue) {
return cfg.getInt("remote", rc.getName(), name, defValue);
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java b/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java
index a251832..16c8577 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java
@@ -110,6 +110,7 @@
private Repository git;
private boolean retrying;
private int retryCount;
+ private final int maxRetries;
private boolean canceled;
private final Multimap<String,ReplicationState> stateMap =
LinkedListMultimap.create();
@@ -155,6 +156,7 @@
createdAt = System.nanoTime();
metrics = m;
canceledWhileRunning = new AtomicBoolean(false);
+ maxRetries = p.getMaxRetries();
}
@Override
@@ -199,9 +201,10 @@
return retrying;
}
- void setToRetry() {
+ boolean setToRetry() {
retrying = true;
retryCount++;
+ return maxRetries == 0 || retryCount <= maxRetries; // 0 = no limit: always retry (backward compatible)
}
void canceledByReplication() {
diff --git a/src/main/resources/Documentation/config.md b/src/main/resources/Documentation/config.md
index 709d61f..9fae6a3 100644
--- a/src/main/resources/Documentation/config.md
+++ b/src/main/resources/Documentation/config.md
@@ -104,6 +104,18 @@
Default: 0 (disabled, i.e. never retry)
+replication.maxRetries
+: Maximum number of times to retry a push operation that previously
+ failed.
+
+ When a push operation reaches its maximum number of retries
+ the replication event is discarded from the queue and the remote
+ destinations could be out of sync.
+
+ Can be overridden per remote by setting remote.NAME.replicationMaxRetries.
+
+ By default, pushes are retried indefinitely.
+
remote.NAME.url
: Address of the remote server to push to. Multiple URLs may be
specified within a single remote block, listing different
@@ -210,12 +222,25 @@
If a remote push operation fails because a remote server was
offline, all push operations to the same destination URL are
- blocked, and the remote push is continuously retried.
+ blocked, and the remote push is continuously retried unless
+ the replicationMaxRetries value is set.
This is a Gerrit specific extension to the Git remote block.
By default, 1 minute.
+remote.NAME.replicationMaxRetries
+: Maximum number of times to retry a push operation that previously
+ failed.
+
+ When a push operation reaches its maximum number of retries
+ the replication event is discarded from the queue and the remote
+ destinations could be out of sync.
+
+ This is a Gerrit specific extension to the Git remote block.
+
+ By default, the value of replication.maxRetries is used.
+
remote.NAME.threads
: Number of worker threads to dedicate to pushing to the
repositories described by this remote. Each thread can push