Set max retries to avoid queue congestion

When servers have *a lot* of remote slaves, some of them
unstable and potentially offline, a maximum retry policy is
needed to prevent push events from staying in the replication
queue and being rescheduled forever.

Keep the configuration backward compatible by defaulting maxRetries
to zero, which disables the limit (pushes are retried indefinitely).

Change-Id: I060cc7bc3a4d1089b0815db02d2e1430f83a2015
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java b/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java
index 06cbe33..ed361a9 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java
@@ -383,8 +383,9 @@
           case TRANSPORT_ERROR:
           case REPOSITORY_MISSING:
           default:
-            pushOp.setToRetry();
-            pool.schedule(pushOp, config.getRetryDelay(), TimeUnit.MINUTES);
+            if (pushOp.setToRetry()) {
+              pool.schedule(pushOp, config.getRetryDelay(), TimeUnit.MINUTES);
+            }
             break;
         }
       }
@@ -549,6 +550,10 @@
     return config.getRemoteConfig().getName();
   }
 
+  public int getMaxRetries() {
+    return config.getMaxRetries();
+  }
+
   private static boolean matches(URIish uri, String urlMatch) {
     if (urlMatch == null || urlMatch.equals("") || urlMatch.equals("*")) {
       return true;
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java b/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java
index 0d7d3ce..f79f616 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/DestinationConfiguration.java
@@ -35,6 +35,7 @@
   private final ImmutableList<String> projects;
   private final ImmutableList<String> authGroupNames;
   private final RemoteConfig remoteConfig;
+  private final int maxRetries;
 
   DestinationConfiguration(RemoteConfig remoteConfig, Config cfg) {
     this.remoteConfig = remoteConfig;
@@ -62,6 +63,9 @@
         cfg.getBoolean("remote", name, "replicateHiddenProjects", false);
     remoteNameStyle = MoreObjects.firstNonNull(
         cfg.getString("remote", name, "remoteNameStyle"), "slash");
+    maxRetries =
+        getInt(
+            remoteConfig, cfg, "replicationMaxRetries", cfg.getInt("replication", "maxRetries", 0));
   }
 
   public int getDelay() {
@@ -120,6 +124,10 @@
     return remoteConfig;
   }
 
+  public int getMaxRetries() {
+    return maxRetries;
+  }
+
   private static int getInt(
       RemoteConfig rc, Config cfg, String name, int defValue) {
     return cfg.getInt("remote", rc.getName(), name, defValue);
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java b/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java
index a251832..16c8577 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java
@@ -110,6 +110,7 @@
   private Repository git;
   private boolean retrying;
   private int retryCount;
+  private final int maxRetries;
   private boolean canceled;
   private final Multimap<String,ReplicationState> stateMap =
       LinkedListMultimap.create();
@@ -155,6 +156,7 @@
     createdAt = System.nanoTime();
     metrics = m;
     canceledWhileRunning = new AtomicBoolean(false);
+    maxRetries = p.getMaxRetries();
   }
 
   @Override
@@ -199,9 +201,19 @@
     return retrying;
   }
 
-  void setToRetry() {
+  /**
+   * Marks this push as being retried and counts the attempt.
+   *
+   * @return {@code true} if the push may be rescheduled; {@code false} once the
+   *     retry count exceeds the configured maximum. A maximum of zero disables
+   *     the limit, so the push is retried indefinitely.
+   */
+  boolean setToRetry() {
     retrying = true;
     retryCount++;
+    // maxRetries == 0 means "no limit"; without this check the default
+    // configuration would never retry a failed push.
+    return maxRetries == 0 || retryCount <= maxRetries;
   }
 
   void canceledByReplication() {
diff --git a/src/main/resources/Documentation/config.md b/src/main/resources/Documentation/config.md
index 709d61f..9fae6a3 100644
--- a/src/main/resources/Documentation/config.md
+++ b/src/main/resources/Documentation/config.md
@@ -104,6 +104,18 @@
 
 	Default: 0 (disabled, i.e. never retry)
 
+replication.maxRetries
+:	Maximum number of times to retry a push operation that previously
+	failed.
+
+	When a push operation reaches its maximum number of retries
+	the replication event is discarded from the queue and the remote
+	destinations could be out of sync.
+
+	Can be overridden at remote-level by setting replicationMaxRetries.
+
+	By default, pushes are retried indefinitely (0 means no limit).
+
 remote.NAME.url
 :	Address of the remote server to push to.  Multiple URLs may be
 	specified within a single remote block, listing different
@@ -210,12 +222,25 @@
 
 	If a remote push operation fails because a remote server was
 	offline, all push operations to the same destination URL are
-	blocked, and the remote push is continuously retried.
+	blocked, and the remote push is continuously retried unless a
+	retry limit is set (replicationMaxRetries or replication.maxRetries).
 
 	This is a Gerrit specific extension to the Git remote block.
 
 	By default, 1 minute.
 
+remote.NAME.replicationMaxRetries
+:	Maximum number of times to retry a push operation that previously
+	failed.
+
+	When a push operation reaches its maximum number of retries
+	the replication event is discarded from the queue and the remote
+	destinations could be out of sync.
+
+	This is a Gerrit specific extension to the Git remote block.
+
+	By default, use replication.maxRetries.
+
 remote.NAME.threads
 :	Number of worker threads to dedicate to pushing to the
 	repositories described by this remote.  Each thread can push