Always report healthy when instance has caught up with outstanding tasks Once the instance has caught up with outstanding pull-replication tasks on startup, we consider the node healthy and ready to receive write traffic. As such, any subsequent invocations of the check should return a healthy status, regardless of pending or inflight tasks being present. Bug: Issue 312895374 Change-Id: I831dbb723a8a3af15eb12331e6073396534cdde8
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheck.java b/src/main/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheck.java index a621778..778a6cc 100644 --- a/src/main/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheck.java +++ b/src/main/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheck.java
@@ -46,6 +46,7 @@ private final SourcesCollection sourcesCollection; private final Ticker ticker; private Optional<Long> successfulSince = Optional.empty(); + private boolean isCaughtUp; @Inject public PullReplicationTasksHealthCheck( @@ -86,6 +87,9 @@ @Override protected Result doCheck() throws Exception { + if (isCaughtUp) { + return Result.PASSED; + } long checkTime = ticker.read(); List<Source> sources = sourcesCollection.getAll(); boolean hasNoOutstandingTasks = @@ -108,9 +112,9 @@ } private HealthCheck.Result reportResult(long checkTime) { - return successfulSince - .filter(ss -> checkTime >= ss + periodOfTimeNanos) - .map(ss -> Result.PASSED) - .orElse(Result.FAILED); + Optional<Result> maybePassed = + successfulSince.filter(ss -> checkTime >= ss + periodOfTimeNanos).map(ss -> Result.PASSED); + isCaughtUp = maybePassed.isPresent(); + return maybePassed.orElse(Result.FAILED); } }
diff --git a/src/main/resources/Documentation/healthcheck.md b/src/main/resources/Documentation/healthcheck.md index 3b84642..afe773b 100644 --- a/src/main/resources/Documentation/healthcheck.md +++ b/src/main/resources/Documentation/healthcheck.md
@@ -2,8 +2,8 @@ ============== The @PLUGIN@ plugin registers the `pull-replication-outstanding-tasks` -healthcheck. This check will mark a gerrit instance as healthy -only when the node has caught up with all the outstanding +healthcheck. This check will mark a gerrit instance as healthy upon +startup only when the node has caught up with all the outstanding pull-replication tasks. The goal is to mark the node as healthy when it is ready to receive write traffic. "Caught up" means: @@ -14,6 +14,10 @@ See [Healthcheck based on replication tasks](https://issues.gerritcodereview.com/issues/312895374) for more details. +**It is worth noting that once the healthcheck eventually succeeds and +the instance is marked healthy, the check is then skipped (ie any +subsequent invocations will always mark the instance as healthy +irrespective of any pending or inflight tasks being present).** Health check configuration --------------------------
diff --git a/src/test/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheckTest.java b/src/test/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheckTest.java index 6dfbc03..06fa996 100644 --- a/src/test/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheckTest.java +++ b/src/test/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheckTest.java
@@ -179,6 +179,26 @@ assertThat(check.run().result).isEqualTo(HealthCheck.Result.PASSED); } + @Test + public void shouldAlwaysReportHealthyAfterItHasCaughtUpWithOutstandingTasks() { + int numCheckIterations = 2; + when(source.pendingTasksCount()).thenReturn(0L, 0L, 1L); + when(source.inflightTasksCount()).thenReturn(0L); + + Injector injector = testInjector(new TestModule(List.of(), periodOfTimeMillisStr)); + PullReplicationTasksHealthCheck check = + injector.getInstance(PullReplicationTasksHealthCheck.class); + + assertThat( + runNTimes( + numCheckIterations, + check, + () -> fakeTicker.advance(Duration.ofMillis(periodOfTimeMillis)))) + .containsExactly(HealthCheck.Result.FAILED, HealthCheck.Result.PASSED); + + assertThat(check.run().result).isEqualTo(HealthCheck.Result.PASSED); + } + private Injector testInjector(AbstractModule testModule) { return Guice.createInjector(new HealthCheckExtensionApiModule(), testModule); }