Automatically retry REST operations on failure with tracing enabled

On REST endpoint level we already retry the REST operation if there was
a potentially temporary failure such as a LockFailure. Extend this to
also retry once on non-recoverable failures, but with request tracing
switched on. This is the same as if the user would retry the REST call
with request tracing after it has failed. Capturing traces for failures
automatically can help with debugging issues and safe roundtrips between
user and supporting engineer.

This only works for REST endpoints which support retrying (REST
endpoints that extend RetryingRestModifyView or
RetryingRestCollectionModifyView). Enabling more REST endpoints to retry
is outside the scope of this change, but surely feasible.

Submit has custom retry login in MergeOp. Adapt the retry in there too.

Whether retry on failure should be done can be configured by setting
retry.retryWithTraceOnFailure in gerrit.config (default = false).

Since not all REST endpoints are atomic it can happen that a REST
endpoint did some partial updates before it failed. If the operation is
retried, in this case it can happen that this partial update is done
again or that the failure is different now. This is OK since the chances
for this are low and the same would happen if the user retries the
operation.

Change-Id: I2b78cbef50e5c3ce4fafce0d7cdcb3b1835abc86
Signed-off-by: Edwin Kempin <ekempin@google.com>
diff --git a/Documentation/config-gerrit.txt b/Documentation/config-gerrit.txt
index 1601d7a..91cb87f 100644
--- a/Documentation/config-gerrit.txt
+++ b/Documentation/config-gerrit.txt
@@ -3790,6 +3790,14 @@
 Defaults to link:#retry.timeout[`retry.timeout`]; unit suffixes are supported,
 and assumes milliseconds if not specified.
 
+[[retry.retryWithTraceOnFailure]]retry.retryWithTraceOnFailure::
++
+Whether Gerrit should automatically retry operations on failure with tracing
+enabled. The automatically generated traces can help with debugging. Please
+note that only some of the REST endpoints support automatic retry.
++
+By default this is set to false.
+
 [[rules]]
 === Section rules
 
diff --git a/java/com/google/gerrit/server/submit/MergeOp.java b/java/com/google/gerrit/server/submit/MergeOp.java
index 06c52c7..6c3d48b 100644
--- a/java/com/google/gerrit/server/submit/MergeOp.java
+++ b/java/com/google/gerrit/server/submit/MergeOp.java
@@ -511,6 +511,8 @@
                     retryHelper
                         .getDefaultTimeout(ActionType.CHANGE_UPDATE)
                         .multipliedBy(cs.projects().size()))
+                .caller(getClass())
+                .retryWithTrace(t -> !(t instanceof RestApiException))
                 .build());
 
         if (projects > 1) {
diff --git a/java/com/google/gerrit/server/update/RetryHelper.java b/java/com/google/gerrit/server/update/RetryHelper.java
index 14d04cb..695feba 100644
--- a/java/com/google/gerrit/server/update/RetryHelper.java
+++ b/java/com/google/gerrit/server/update/RetryHelper.java
@@ -40,11 +40,14 @@
 import com.google.gerrit.metrics.MetricMaker;
 import com.google.gerrit.server.config.GerritServerConfig;
 import com.google.gerrit.server.logging.Metadata;
+import com.google.gerrit.server.logging.RequestId;
+import com.google.gerrit.server.logging.TraceContext;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import java.time.Duration;
 import java.util.Arrays;
 import java.util.Map;
+import java.util.Optional;
 import java.util.concurrent.ExecutionException;
 import java.util.function.Consumer;
 import java.util.function.Predicate;
@@ -94,12 +97,20 @@
     @Nullable
     abstract Duration timeout();
 
+    abstract Optional<Class<?>> caller();
+
+    abstract Optional<Predicate<Throwable>> retryWithTrace();
+
     @AutoValue.Builder
     public abstract static class Builder {
       public abstract Builder listener(RetryListener listener);
 
       public abstract Builder timeout(Duration timeout);
 
+      public abstract Builder caller(Class<?> caller);
+
+      public abstract Builder retryWithTrace(Predicate<Throwable> exceptionPredicate);
+
       public abstract Options build();
     }
   }
@@ -147,6 +158,7 @@
   private final Map<ActionType, Duration> defaultTimeouts;
   private final WaitStrategy waitStrategy;
   @Nullable private final Consumer<RetryerBuilder<?>> overwriteDefaultRetryerStrategySetup;
+  private final boolean retryWithTraceOnFailure;
 
   @Inject
   RetryHelper(@GerritServerConfig Config cfg, Metrics metrics, BatchUpdate.Factory updateFactory) {
@@ -186,6 +198,7 @@
                 MILLISECONDS),
             WaitStrategies.randomWait(50, MILLISECONDS));
     this.overwriteDefaultRetryerStrategySetup = overwriteDefaultRetryerStrategySetup;
+    this.retryWithTraceOnFailure = cfg.getBoolean("retry", "retryWithTraceOnFailure", false);
   }
 
   public Duration getDefaultTimeout(ActionType actionType) {
@@ -256,8 +269,35 @@
       Predicate<Throwable> exceptionPredicate)
       throws Throwable {
     MetricListener listener = new MetricListener();
-    try {
-      RetryerBuilder<T> retryerBuilder = createRetryerBuilder(actionType, opts, exceptionPredicate);
+    try (TraceContext traceContext = TraceContext.open()) {
+      RetryerBuilder<T> retryerBuilder =
+          createRetryerBuilder(
+              actionType,
+              opts,
+              t -> {
+                // exceptionPredicate checks for temporary errors for which the operation should be
+                // retried (e.g. LockFailure). The retry has good chances to succeed.
+                if (exceptionPredicate.test(t)) {
+                  return true;
+                }
+
+                // A non-recoverable failure occurred. Check if we should retry to capture a trace
+                // of the failure. If a trace was already done there is no need to retry.
+                if (retryWithTraceOnFailure
+                    && opts.retryWithTrace().isPresent()
+                    && opts.retryWithTrace().get().test(t)
+                    && !traceContext.isTracing()) {
+                  traceContext
+                      .addTag(RequestId.Type.TRACE_ID, "retry-on-failure-" + new RequestId())
+                      .forceLogging();
+                  logger.atFine().withCause(t).log(
+                      "%s failed, retry with tracing eanbled",
+                      opts.caller().map(Class::getSimpleName).orElse("N/A"));
+                  return true;
+                }
+
+                return false;
+              });
       retryerBuilder.withRetryListener(listener);
       return executeWithTimeoutCount(actionType, action, retryerBuilder.build());
     } finally {
diff --git a/java/com/google/gerrit/server/update/RetryingRestCollectionModifyView.java b/java/com/google/gerrit/server/update/RetryingRestCollectionModifyView.java
index 7620386..9204565 100644
--- a/java/com/google/gerrit/server/update/RetryingRestCollectionModifyView.java
+++ b/java/com/google/gerrit/server/update/RetryingRestCollectionModifyView.java
@@ -17,6 +17,7 @@
 import com.google.gerrit.extensions.restapi.AuthException;
 import com.google.gerrit.extensions.restapi.BadRequestException;
 import com.google.gerrit.extensions.restapi.ResourceConflictException;
+import com.google.gerrit.extensions.restapi.RestApiException;
 import com.google.gerrit.extensions.restapi.RestCollectionModifyView;
 import com.google.gerrit.extensions.restapi.RestResource;
 
@@ -32,7 +33,13 @@
   @Override
   public final O apply(P parentResource, I input)
       throws AuthException, BadRequestException, ResourceConflictException, Exception {
-    return retryHelper.execute((updateFactory) -> applyImpl(updateFactory, parentResource, input));
+    RetryHelper.Options retryOptions =
+        RetryHelper.options()
+            .caller(getClass())
+            .retryWithTrace(t -> !(t instanceof RestApiException))
+            .build();
+    return retryHelper.execute(
+        (updateFactory) -> applyImpl(updateFactory, parentResource, input), retryOptions);
   }
 
   protected abstract O applyImpl(BatchUpdate.Factory updateFactory, P parentResource, I input)
diff --git a/java/com/google/gerrit/server/update/RetryingRestModifyView.java b/java/com/google/gerrit/server/update/RetryingRestModifyView.java
index e2f4a02..b471d70 100644
--- a/java/com/google/gerrit/server/update/RetryingRestModifyView.java
+++ b/java/com/google/gerrit/server/update/RetryingRestModifyView.java
@@ -14,6 +14,7 @@
 
 package com.google.gerrit.server.update;
 
+import com.google.gerrit.extensions.restapi.RestApiException;
 import com.google.gerrit.extensions.restapi.RestModifyView;
 import com.google.gerrit.extensions.restapi.RestResource;
 
@@ -27,7 +28,13 @@
 
   @Override
   public final O apply(R resource, I input) throws Exception {
-    return retryHelper.execute((updateFactory) -> applyImpl(updateFactory, resource, input));
+    RetryHelper.Options retryOptions =
+        RetryHelper.options()
+            .caller(getClass())
+            .retryWithTrace(t -> !(t instanceof RestApiException))
+            .build();
+    return retryHelper.execute(
+        (updateFactory) -> applyImpl(updateFactory, resource, input), retryOptions);
   }
 
   protected abstract O applyImpl(BatchUpdate.Factory updateFactory, R resource, I input)
diff --git a/javatests/com/google/gerrit/acceptance/rest/TraceIT.java b/javatests/com/google/gerrit/acceptance/rest/TraceIT.java
index 3a01517..56a9b69 100644
--- a/javatests/com/google/gerrit/acceptance/rest/TraceIT.java
+++ b/javatests/com/google/gerrit/acceptance/rest/TraceIT.java
@@ -16,6 +16,7 @@
 
 import static com.google.common.truth.Truth.assertThat;
 import static org.apache.http.HttpStatus.SC_CREATED;
+import static org.apache.http.HttpStatus.SC_INTERNAL_SERVER_ERROR;
 import static org.apache.http.HttpStatus.SC_OK;
 
 import com.google.auto.value.AutoValue;
@@ -28,6 +29,7 @@
 import com.google.gerrit.acceptance.GerritConfig;
 import com.google.gerrit.acceptance.PushOneCommit;
 import com.google.gerrit.acceptance.RestResponse;
+import com.google.gerrit.common.data.SubmitRecord;
 import com.google.gerrit.extensions.api.changes.ReviewInput;
 import com.google.gerrit.extensions.events.ChangeIndexedListener;
 import com.google.gerrit.extensions.registration.DynamicSet;
@@ -44,10 +46,14 @@
 import com.google.gerrit.server.logging.PerformanceLogger;
 import com.google.gerrit.server.logging.TraceContext;
 import com.google.gerrit.server.project.CreateProjectArgs;
+import com.google.gerrit.server.project.SubmitRuleOptions;
+import com.google.gerrit.server.query.change.ChangeData;
+import com.google.gerrit.server.rules.SubmitRule;
 import com.google.gerrit.server.validators.ProjectCreationValidationListener;
 import com.google.gerrit.server.validators.ValidationException;
 import com.google.inject.Inject;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 import java.util.SortedMap;
 import java.util.SortedSet;
@@ -79,6 +85,7 @@
   @Inject private DynamicSet<CommitValidationListener> commitValidationListeners;
   @Inject private DynamicSet<ChangeIndexedListener> changeIndexedListeners;
   @Inject private DynamicSet<PerformanceLogger> performanceLoggers;
+  @Inject private DynamicSet<SubmitRule> submitRules;
   @Inject private WorkQueue workQueue;
 
   private TraceValidatingProjectCreationValidationListener projectCreationListener;
@@ -579,6 +586,44 @@
     assertThat(projectCreationListener.tags.get("project")).containsExactly("new24");
   }
 
+  @GerritConfig(name = "retry.retryWithTraceOnFailure", value = "true")
+  public void autoRetryWithTrace() throws Exception {
+    String changeId = createChange().getChangeId();
+    approve(changeId);
+
+    TraceSubmitRule traceSubmitRule = new TraceSubmitRule();
+    traceSubmitRule.failOnce = true;
+    RegistrationHandle submitRuleRegistrationHandle = submitRules.add("gerrit", traceSubmitRule);
+    try {
+      RestResponse response = adminRestSession.post("/changes/" + changeId + "/submit");
+      assertThat(response.getStatusCode()).isEqualTo(SC_OK);
+      assertThat(response.getHeader(RestApiServlet.X_GERRIT_TRACE)).isNull();
+      assertThat(traceSubmitRule.traceId).startsWith("retry-on-failure-");
+      assertThat(traceSubmitRule.isLoggingForced).isTrue();
+    } finally {
+      submitRuleRegistrationHandle.remove();
+    }
+  }
+
+  @Test
+  public void noAutoRetryWithTraceIfDisabled() throws Exception {
+    String changeId = createChange().getChangeId();
+    approve(changeId);
+
+    TraceSubmitRule traceSubmitRule = new TraceSubmitRule();
+    traceSubmitRule.failOnce = true;
+    RegistrationHandle submitRuleRegistrationHandle = submitRules.add("gerrit", traceSubmitRule);
+    try {
+      RestResponse response = adminRestSession.post("/changes/" + changeId + "/submit");
+      assertThat(response.getStatusCode()).isEqualTo(SC_INTERNAL_SERVER_ERROR);
+      assertThat(response.getHeader(RestApiServlet.X_GERRIT_TRACE)).isNull();
+      assertThat(traceSubmitRule.traceId).isNull();
+      assertThat(traceSubmitRule.isLoggingForced).isNull();
+    } finally {
+      submitRuleRegistrationHandle.remove();
+    }
+  }
+
   private void assertForceLogging(boolean expected) {
     assertThat(LoggingContext.getInstance().shouldForceLogging(null, null, false))
         .isEqualTo(expected);
@@ -629,6 +674,28 @@
     public void onChangeDeleted(int id) {}
   }
 
+  private static class TraceSubmitRule implements SubmitRule {
+    String traceId;
+    Boolean isLoggingForced;
+    boolean failOnce;
+
+    @Override
+    public Collection<SubmitRecord> evaluate(ChangeData changeData, SubmitRuleOptions options) {
+      if (failOnce) {
+        failOnce = false;
+        throw new IllegalStateException("forced failure from test");
+      }
+
+      this.traceId =
+          Iterables.getFirst(LoggingContext.getInstance().getTagsAsMap().get("TRACE_ID"), null);
+      this.isLoggingForced = LoggingContext.getInstance().shouldForceLogging(null, null, false);
+
+      SubmitRecord submitRecord = new SubmitRecord();
+      submitRecord.status = SubmitRecord.Status.OK;
+      return ImmutableList.of(submitRecord);
+    }
+  }
+
   private static class TestPerformanceLogger implements PerformanceLogger {
     private List<PerformanceLogEntry> logEntries = new ArrayList<>();