Retry replications failed due to "failed to lock" errors

If two or more replication PushOps (to the same GIT and Ref) are
scheduled at approximately same time (and end up on different
replication threads), there is a large probability that the last
push to complete will fail with a remote "failure to lock" error.

This is due to the first replication operation updating the very
same remote ref during the execution of the second replication push
operation.

Since this is not recognized as a transient issue, the last ref
update will never happen and the commit will not be reachable from
the Gerrit slave.

In a very active system as ours, this happens often enough to be a
huge issue for both users and CI-systems.

This fix will acknowledge these "failed to lock" issues as a
transient error and schedule the failed replication operation
for a retry.

Change-Id: Ic51d8407e38eea257b0a2e5fb529cf23b5875ca8
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java b/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java
index 0da6cb2..63fc350 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/Destination.java
@@ -77,6 +77,7 @@
   private final int delay;
   private final int retryDelay;
   private final Object stateLock = new Object();
+  private final int lockErrorMaxRetries;
   private final Map<URIish, PushOne> pending = new HashMap<URIish, PushOne>();
   private final Map<URIish, PushOne> inFlight = new HashMap<URIish, PushOne>();
   private final PushOne.Factory opFactory;
@@ -105,6 +106,7 @@
     gitManager = gitRepositoryManager;
     delay = Math.max(0, getInt(rc, cfg, "replicationdelay", 15));
     retryDelay = Math.max(0, getInt(rc, cfg, "replicationretry", 1));
+    lockErrorMaxRetries = cfg.getInt("replication", "lockErrorMaxRetries", 0);
     adminUrls = cfg.getStringList("remote", rc.getName(), "adminUrl");
 
     poolThreads = Math.max(0, getInt(rc, cfg, "threads", 1));
@@ -511,6 +513,10 @@
     return adminUrls;
   }
 
+  int getLockErrorMaxRetries() {
+    return lockErrorMaxRetries;
+  }
+
   private static boolean matches(URIish uri, String urlMatch) {
     if (urlMatch == null || urlMatch.equals("") || urlMatch.equals("*")) {
       return true;
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java b/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java
index 16d4266..39e18f4 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/PushOne.java
@@ -104,6 +104,8 @@
   private boolean canceled;
   private final Multimap<String,ReplicationState> stateMap =
       LinkedListMultimap.create();
+  private final int maxLockRetries;
+  private int lockRetryCount;
 
   @Inject
   PushOne(final GitRepositoryManager grm,
@@ -128,6 +130,8 @@
     replicationQueue = rq;
     projectName = d;
     uri = u;
+    lockRetryCount = 0;
+    maxLockRetries = pool.getLockErrorMaxRetries();
   }
 
   @Override
@@ -293,6 +297,19 @@
       if (cause instanceof JSchException
           && cause.getMessage().startsWith("UnknownHostKey:")) {
         log.error("Cannot replicate to " + uri + ": " + cause.getMessage());
+      } else if (e instanceof LockFailureException) {
+        lockRetryCount++;
+        // The LockFailureException message contains both URI and reason
+        // for this failure.
+        log.error("Cannot replicate to " + e.getMessage());
+
+        // The remote push operation should be retried.
+        if (lockRetryCount <= maxLockRetries) {
+          pool.reschedule(this, Destination.RetryReason.TRANSPORT_ERROR);
+        } else {
+          log.error("Giving up after " + lockRetryCount
+              + " of this error during replication to " + e.getMessage());
+        }
       } else {
         log.error("Cannot replicate to " + uri, e);
         // The remote push operation should be retried.
@@ -300,7 +317,6 @@
       }
     } catch (IOException e) {
       wrappedLog.error("Cannot replicate to " + uri, e, getStatesAsArray());
-
     } catch (RuntimeException e) {
       wrappedLog.error("Unexpected error during replication to " + uri, e, getStatesAsArray());
 
@@ -521,7 +537,8 @@
     cmds.add(new RemoteRefUpdate(git, (Ref) null, dst, force, null, null));
   }
 
-  private void updateStates(Collection<RemoteRefUpdate> refUpdates) {
+  private void updateStates(Collection<RemoteRefUpdate> refUpdates)
+      throws LockFailureException {
     Set<String> doneRefs = new HashSet<String>();
     boolean anyRefFailed = false;
 
@@ -557,6 +574,8 @@
                 + ", remote rejected non-fast-forward push."
                 + "  Check receive.denyNonFastForwards variable in config file"
                 + " of destination repository.", u.getRemoteName(), uri), logStatesArray);
+          } else if ("failed to lock".equals(u.getMessage())) {
+            throw new LockFailureException(uri, u.getMessage());
           } else {
             wrappedLog.error(String.format(
                 "Failed replicate of %s to %s, reason: %s",
@@ -586,4 +605,12 @@
     }
     stateMap.clear();
   }
+
+  public class LockFailureException extends TransportException {
+    private static final long serialVersionUID = 1L;
+
+    public LockFailureException(URIish uri, String message) {
+      super(uri, message);
+    }
+  };
 }
diff --git a/src/main/resources/Documentation/config.md b/src/main/resources/Documentation/config.md
index e8f9473..94b8473 100644
--- a/src/main/resources/Documentation/config.md
+++ b/src/main/resources/Documentation/config.md
@@ -77,6 +77,33 @@
 :	If true, the default push refspec will be set to use forced
 	update to the remote when no refspec is given.  By default, false.
 
+replication.lockErrorMaxRetries
+:	Number of times to retry a replication operation if a lock
+	error is detected.
+
+	If two or more replication operations (to the same GIT and Ref)
+	are scheduled at approximately the same time (and end up on different
+	replication threads), there is a large probability that the last
+	push to complete will fail with a remote "failure to lock" error.
+	This option allows Gerrit to retry the replication push when the
+	"failure to lock" error is detected.
+
+	A good value would be 3 retries or less, depending on how often
+	you see lockError collisions in your server logs. A too highly set
+	value risks keeping around the replication operations in the queue
+	for a long time, and the number of items in the queue will increase
+	with time.
+
+	Normally Gerrit will succeed with the replication during its first
+	retry, but in certain edge cases (e.g. a mirror introduces a ref
+	namespace with the same name as a branch on the master) the retry
+	will never succeed.
+
+	The issue can also be mitigated somewhat by increasing the
+	replicationDelay.
+
+	Default: 0 (disabled, i.e. never retry)
+
 remote.NAME.url
 :	Address of the remote server to push to.  Multiple URLs may be
 	specified within a single remote block, listing different