Merge changes Idd0e11ba,Iaf00cea9
* changes:
Add metric to count how often requests are automatically retried
Log failure if a request was automatically retried and failed again
diff --git a/Documentation/metrics.txt b/Documentation/metrics.txt
index 63cbd7c..367b59d 100644
--- a/Documentation/metrics.txt
+++ b/Documentation/metrics.txt
@@ -19,6 +19,7 @@
by RetryHelper to execute an action (0 == single attempt, no retry)
* `action/retry_timeout_count`: Number of action executions of RetryHelper
that ultimately timed out
+* `action/auto_retry_count`: Number of automatic retries with tracing
=== Pushes
diff --git a/java/com/google/gerrit/server/logging/Metadata.java b/java/com/google/gerrit/server/logging/Metadata.java
index 7eba4de..7184a92 100644
--- a/java/com/google/gerrit/server/logging/Metadata.java
+++ b/java/com/google/gerrit/server/logging/Metadata.java
@@ -89,6 +89,9 @@
// One or more resources
public abstract Optional<Boolean> multiple();
+ // The name of an operation that is performed.
+ public abstract Optional<String> operationName();
+
// Partial or full computation
public abstract Optional<Boolean> partial();
@@ -185,6 +188,8 @@
public abstract Builder multiple(boolean multiple);
+ public abstract Builder operationName(String operationName);
+
public abstract Builder partial(boolean partial);
public abstract Builder noteDbFilePath(@Nullable String noteDbFilePath);
diff --git a/java/com/google/gerrit/server/update/RetryHelper.java b/java/com/google/gerrit/server/update/RetryHelper.java
index f50f857..871dfe0 100644
--- a/java/com/google/gerrit/server/update/RetryHelper.java
+++ b/java/com/google/gerrit/server/update/RetryHelper.java
@@ -35,6 +35,7 @@
import com.google.gerrit.extensions.restapi.RestApiException;
import com.google.gerrit.git.LockFailureException;
import com.google.gerrit.metrics.Counter1;
+import com.google.gerrit.metrics.Counter2;
import com.google.gerrit.metrics.Description;
import com.google.gerrit.metrics.Field;
import com.google.gerrit.metrics.MetricMaker;
@@ -120,6 +121,7 @@
public static class Metrics {
final Counter1<ActionType> attemptCounts;
final Counter1<ActionType> timeoutCount;
+ final Counter2<ActionType, String> autoRetryCount;
@Inject
Metrics(MetricMaker metricMaker) {
@@ -142,6 +144,16 @@
.setCumulative()
.setUnit("timeouts"),
actionTypeField);
+ autoRetryCount =
+ metricMaker.newCounter(
+ "action/auto_retry_count",
+ new Description("Number of automatic retries with tracing")
+ .setCumulative()
+ .setUnit("retries"),
+ actionTypeField,
+ Field.ofString("operation_name", Metadata.Builder::operationName)
+ .description("The name of the operation that was retried.")
+ .build());
}
}
@@ -285,15 +297,23 @@
// of the failure. If a trace was already done there is no need to retry.
if (retryWithTraceOnFailure
&& opts.retryWithTrace().isPresent()
- && opts.retryWithTrace().get().test(t)
- && !traceContext.isTracing()) {
- traceContext
- .addTag(RequestId.Type.TRACE_ID, "retry-on-failure-" + new RequestId())
- .forceLogging();
- logger.atFine().withCause(t).log(
- "%s failed, retry with tracing enabled",
- opts.caller().map(Class::getSimpleName).orElse("N/A"));
- return true;
+ && opts.retryWithTrace().get().test(t)) {
+ String caller = opts.caller().map(Class::getSimpleName).orElse("N/A");
+ if (!traceContext.isTracing()) {
+ traceContext
+ .addTag(RequestId.Type.TRACE_ID, "retry-on-failure-" + new RequestId())
+ .forceLogging();
+ logger.atFine().withCause(t).log(
+ "%s failed, retry with tracing enabled", caller);
+ metrics.autoRetryCount.increment(actionType, caller);
+ return true;
+ }
+
+ // A non-recoverable failure occurred. We retried the operation with tracing
+ // enabled and it failed again. Log the failure so that admin can see if it
+ // differs from the failure that triggered the retry.
+ logger.atFine().withCause(t).log("auto-retry of %s has failed", caller);
+ return false;
}
return false;