Merge changes Idd0e11ba,Iaf00cea9

* changes:
  Add metric to count how often requests are automatically retried
  Log failure if a request was automatically retried and failed again
diff --git a/Documentation/metrics.txt b/Documentation/metrics.txt
index 63cbd7c..367b59d 100644
--- a/Documentation/metrics.txt
+++ b/Documentation/metrics.txt
@@ -19,6 +19,7 @@
 by RetryHelper to execute an action (0 == single attempt, no retry)
 * `action/retry_timeout_count`: Number of action executions of RetryHelper
 that ultimately timed out
+* `action/auto_retry_count`: Number of automatic retries with tracing
 
 === Pushes
 
diff --git a/java/com/google/gerrit/server/logging/Metadata.java b/java/com/google/gerrit/server/logging/Metadata.java
index 7eba4de..7184a92 100644
--- a/java/com/google/gerrit/server/logging/Metadata.java
+++ b/java/com/google/gerrit/server/logging/Metadata.java
@@ -89,6 +89,9 @@
   // One or more resources
   public abstract Optional<Boolean> multiple();
 
+  // The name of an operation that is performed.
+  public abstract Optional<String> operationName();
+
   // Partial or full computation
   public abstract Optional<Boolean> partial();
 
@@ -185,6 +188,8 @@
 
     public abstract Builder multiple(boolean multiple);
 
+    public abstract Builder operationName(String operationName);
+
     public abstract Builder partial(boolean partial);
 
     public abstract Builder noteDbFilePath(@Nullable String noteDbFilePath);
diff --git a/java/com/google/gerrit/server/update/RetryHelper.java b/java/com/google/gerrit/server/update/RetryHelper.java
index f50f857..871dfe0 100644
--- a/java/com/google/gerrit/server/update/RetryHelper.java
+++ b/java/com/google/gerrit/server/update/RetryHelper.java
@@ -35,6 +35,7 @@
 import com.google.gerrit.extensions.restapi.RestApiException;
 import com.google.gerrit.git.LockFailureException;
 import com.google.gerrit.metrics.Counter1;
+import com.google.gerrit.metrics.Counter2;
 import com.google.gerrit.metrics.Description;
 import com.google.gerrit.metrics.Field;
 import com.google.gerrit.metrics.MetricMaker;
@@ -120,6 +121,7 @@
   public static class Metrics {
     final Counter1<ActionType> attemptCounts;
     final Counter1<ActionType> timeoutCount;
+    final Counter2<ActionType, String> autoRetryCount;
 
     @Inject
     Metrics(MetricMaker metricMaker) {
@@ -142,6 +144,16 @@
                   .setCumulative()
                   .setUnit("timeouts"),
               actionTypeField);
+      autoRetryCount =
+          metricMaker.newCounter(
+              "action/auto_retry_count",
+              new Description("Number of automatic retries with tracing")
+                  .setCumulative()
+                  .setUnit("retries"),
+              actionTypeField,
+              Field.ofString("operation_name", Metadata.Builder::operationName)
+                  .description("The name of the operation that was retried.")
+                  .build());
     }
   }
 
@@ -285,15 +297,23 @@
                 // of the failure. If a trace was already done there is no need to retry.
                 if (retryWithTraceOnFailure
                     && opts.retryWithTrace().isPresent()
-                    && opts.retryWithTrace().get().test(t)
-                    && !traceContext.isTracing()) {
-                  traceContext
-                      .addTag(RequestId.Type.TRACE_ID, "retry-on-failure-" + new RequestId())
-                      .forceLogging();
-                  logger.atFine().withCause(t).log(
-                      "%s failed, retry with tracing enabled",
-                      opts.caller().map(Class::getSimpleName).orElse("N/A"));
-                  return true;
+                    && opts.retryWithTrace().get().test(t)) {
+                  String caller = opts.caller().map(Class::getSimpleName).orElse("N/A");
+                  if (!traceContext.isTracing()) {
+                    traceContext
+                        .addTag(RequestId.Type.TRACE_ID, "retry-on-failure-" + new RequestId())
+                        .forceLogging();
+                    logger.atFine().withCause(t).log(
+                        "%s failed, retry with tracing enabled", caller);
+                    metrics.autoRetryCount.increment(actionType, caller);
+                    return true;
+                  }
+
+                  // A non-recoverable failure occurred. We retried the operation with tracing
+                  // enabled and it failed again. Log the failure so that admin can see if it
+                  // differs from the failure that triggered the retry.
+                  logger.atFine().withCause(t).log("auto-retry of %s has failed", caller);
+                  return false;
                 }
 
                 return false;