Assess healthiness by factoring in periodOfTime
Modify the health check so that it reports healthy only when there are
no outstanding tasks for a duration of at least N, where N corresponds to the
value set in the `periodOfTime` setting.
Bug: Issue 312895374
Change-Id: Id3fe47605e6a82ca09e79f52b224af7beb77e2f4
diff --git a/src/main/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheck.java b/src/main/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheck.java
index 3d3be10..a621778 100644
--- a/src/main/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheck.java
+++ b/src/main/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheck.java
@@ -14,6 +14,7 @@
package com.googlesource.gerrit.plugins.replication.pull.health;
+import com.google.common.base.Ticker;
import com.google.common.collect.ImmutableSet;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.gerrit.entities.Project;
@@ -24,10 +25,12 @@
import com.google.inject.Singleton;
import com.googlesource.gerrit.plugins.healthcheck.HealthCheckConfig;
import com.googlesource.gerrit.plugins.healthcheck.check.AbstractHealthCheck;
+import com.googlesource.gerrit.plugins.healthcheck.check.HealthCheck;
import com.googlesource.gerrit.plugins.replication.MergedConfigResource;
import com.googlesource.gerrit.plugins.replication.pull.Source;
import com.googlesource.gerrit.plugins.replication.pull.SourcesCollection;
import java.util.List;
+import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.eclipse.jgit.lib.Config;
@@ -39,8 +42,10 @@
public static final String PROJECTS_FILTER_FIELD = "projects";
public static final String PERIOD_OF_TIME_FIELD = "periodOfTime";
private final Set<String> projects;
- private final long periodOfTimeSec;
+ private final long periodOfTimeNanos;
private final SourcesCollection sourcesCollection;
+ private final Ticker ticker;
+ private Optional<Long> successfulSince = Optional.empty();
@Inject
public PullReplicationTasksHealthCheck(
@@ -49,7 +54,8 @@
MergedConfigResource configResource,
@PluginName String name,
MetricMaker metricMaker,
- SourcesCollection sourcesCollection) {
+ SourcesCollection sourcesCollection,
+ Ticker ticker) {
super(executor, healthCheckConfig, name + HEALTHCHECK_NAME_SUFFIX, metricMaker);
String healthCheckName = name + HEALTHCHECK_NAME_SUFFIX;
@@ -58,19 +64,20 @@
Set.of(
replicationConfig.getStringList(
HealthCheckConfig.HEALTHCHECK, healthCheckName, PROJECTS_FILTER_FIELD));
- this.periodOfTimeSec =
+ this.periodOfTimeNanos =
ConfigUtil.getTimeUnit(
replicationConfig,
HealthCheckConfig.HEALTHCHECK,
healthCheckName,
PERIOD_OF_TIME_FIELD,
DEFAULT_PERIOD_OF_TIME_SECS,
- TimeUnit.SECONDS);
+ TimeUnit.NANOSECONDS);
this.sourcesCollection = sourcesCollection;
+ this.ticker = ticker;
}
- public long getPeriodOfTimeSec() {
- return periodOfTimeSec;
+ public long getPeriodOfTimeNanos() {
+ return periodOfTimeNanos;
}
public Set<String> getProjects() {
@@ -79,6 +86,7 @@
@Override
protected Result doCheck() throws Exception {
+ long checkTime = ticker.read();
List<Source> sources = sourcesCollection.getAll();
boolean hasNoOutstandingTasks =
sources.stream()
@@ -94,6 +102,15 @@
&& source.zeroInflightTasksForRepo(Project.nameKey(project)));
}
});
- return hasNoOutstandingTasks ? Result.PASSED : Result.FAILED;
+ successfulSince =
+ hasNoOutstandingTasks ? successfulSince.or(() -> Optional.of(checkTime)) : Optional.empty();
+ return reportResult(checkTime);
+ }
+
+ private HealthCheck.Result reportResult(long checkTime) {
+ return successfulSince
+ .filter(ss -> checkTime >= ss + periodOfTimeNanos)
+ .map(ss -> Result.PASSED)
+ .orElse(Result.FAILED);
}
}
diff --git a/src/main/resources/Documentation/healthcheck.md b/src/main/resources/Documentation/healthcheck.md
index f3001c9..3b84642 100644
--- a/src/main/resources/Documentation/healthcheck.md
+++ b/src/main/resources/Documentation/healthcheck.md
@@ -9,6 +9,8 @@
- All pending & in-flight replication tasks across all sources (or
across a configurable set of repos) have completed
+- There are no queued replication tasks pending and the above condition
+lasts for at least N seconds (configurable)
See [Healthcheck based on replication tasks](https://issues.gerritcodereview.com/issues/312895374) for more details.
@@ -26,10 +28,11 @@
The health check can be configured as follows:
- `healthcheck.@PLUGIN@-outstanding-tasks.projects`: The repo(s) that
the health check will track outstanding replication tasks against.
-Multiple entries are supported.
+Multiple entries are supported. If not specified, all the outstanding
+replication tasks are tracked.
- `healthcheck.@PLUGIN@-outstanding-tasks.periodOfTime`: The time for
which the check needs to be successful, in order for the instance to be
-marked healthy. If the time unit is omitted it defaults to seconds.
+marked healthy. If the time unit is omitted it defaults to milliseconds.
Values should use common unit suffixes to express their setting:
* ms, milliseconds
@@ -37,6 +40,17 @@
* m, min, minute, minutes
* h, hr, hour, hours
+Default is 10s.
+
+This example config will report the node healthy when there are no
+pending tasks for the `foo` and `bar/baz` repos continuously for a
+period of 5 seconds after the plugin startup.
+```
+[healthcheck "pull-replication-tasks"]
+ projects = foo
+ projects = bar/baz
+ periodOfTime = 5 sec
+```
Useful information
------------------
diff --git a/src/test/java/com/googlesource/gerrit/plugins/replication/pull/PullReplicationHealthCheckIT.java b/src/test/java/com/googlesource/gerrit/plugins/replication/pull/PullReplicationHealthCheckIT.java
index 226e509..346b7fa 100644
--- a/src/test/java/com/googlesource/gerrit/plugins/replication/pull/PullReplicationHealthCheckIT.java
+++ b/src/test/java/com/googlesource/gerrit/plugins/replication/pull/PullReplicationHealthCheckIT.java
@@ -19,6 +19,7 @@
import static com.googlesource.gerrit.plugins.replication.pull.health.PullReplicationTasksHealthCheck.HEALTHCHECK_NAME_SUFFIX;
import static com.googlesource.gerrit.plugins.replication.pull.health.PullReplicationTasksHealthCheck.PERIOD_OF_TIME_FIELD;
+import com.google.common.base.Stopwatch;
import com.google.gerrit.acceptance.SkipProjectClone;
import com.google.gerrit.acceptance.TestPlugin;
import com.google.gerrit.acceptance.UseLocalDisk;
@@ -41,6 +42,7 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
+import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.lib.Ref;
@@ -56,6 +58,8 @@
"com.googlesource.gerrit.plugins.replication.pull.PullReplicationHealthCheckIT$PullReplicationHealthCheckTestModule",
httpModule = "com.googlesource.gerrit.plugins.replication.pull.api.HttpModule")
public class PullReplicationHealthCheckIT extends PullReplicationSetupBase {
+ private static final int TEST_HEALTHCHECK_PERIOD_OF_TIME_SEC = 5;
+ private static final String TEST_PLUGIN_NAME = "pull-replication";
public static class PullReplicationHealthCheckTestModule extends AbstractModule {
private final PullReplicationModule pullReplicationModule;
@@ -151,6 +155,11 @@
project.ifPresent(prj -> config.setString("remote", remoteName, "projects", prj));
config.setBoolean("gerrit", null, "autoReload", true);
config.setInt("replication", null, "maxApiPayloadSize", 1);
+ config.setString(
+ HealthCheckConfig.HEALTHCHECK,
+ TEST_PLUGIN_NAME + HEALTHCHECK_NAME_SUFFIX,
+ PERIOD_OF_TIME_FIELD,
+ TEST_HEALTHCHECK_PERIOD_OF_TIME_SEC + " sec");
config.save();
}
@@ -158,6 +167,7 @@
@GerritConfig(name = "gerrit.instanceId", value = TEST_REPLICATION_REMOTE)
public void shouldReportInstanceHealthyWhenThereAreNoOutstandingReplicationTasks()
throws Exception {
+ Stopwatch testStartStopwatch = Stopwatch.createUnstarted();
testRepo = cloneProject(createTestProject(project + TEST_REPLICATION_SUFFIX));
PullReplicationTasksHealthCheck healthcheck =
getInstance(PullReplicationTasksHealthCheck.class);
@@ -182,20 +192,29 @@
waitUntil(
() -> {
+ boolean healthCheckPassed = healthcheck.run().result == HealthCheck.Result.PASSED;
boolean replicationFinished = hasReplicationFinished();
if (!replicationFinished) {
- boolean healthCheckPassed = healthcheck.run().result == HealthCheck.Result.PASSED;
-
assertWithMessage("Instance reported healthy while waiting for replication to finish")
// Racy condition: we need to make sure that this isn't a false alarm
// and accept the case when the replication finished between the
// if(!replicationFinished) check and the assertion here
.that(!healthCheckPassed || hasReplicationFinished())
.isTrue();
+ } else {
+ if (!testStartStopwatch.isRunning()) {
+ testStartStopwatch.start();
+ }
}
return replicationFinished;
});
+ if (testStartStopwatch.elapsed(TimeUnit.SECONDS) < TEST_HEALTHCHECK_PERIOD_OF_TIME_SEC) {
+ assertThat(healthcheck.run().result).isEqualTo(HealthCheck.Result.FAILED);
+ }
+
+ waitUntil(
+ () -> testStartStopwatch.elapsed(TimeUnit.SECONDS) > TEST_HEALTHCHECK_PERIOD_OF_TIME_SEC);
assertThat(healthcheck.run().result).isEqualTo(HealthCheck.Result.PASSED);
}
diff --git a/src/test/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheckTest.java b/src/test/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheckTest.java
index 6c5b537..6dfbc03 100644
--- a/src/test/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheckTest.java
+++ b/src/test/java/com/googlesource/gerrit/plugins/replication/pull/health/PullReplicationTasksHealthCheckTest.java
@@ -21,6 +21,9 @@
import static org.mockito.Mockito.lenient;
import static org.mockito.Mockito.when;
+import com.google.common.base.Ticker;
+import com.google.common.testing.FakeTicker;
+import com.google.gerrit.common.Nullable;
import com.google.gerrit.entities.Project;
import com.google.gerrit.extensions.annotations.PluginName;
import com.google.gerrit.metrics.DisabledMetricMaker;
@@ -35,9 +38,11 @@
import com.googlesource.gerrit.plugins.replication.MergedConfigResource;
import com.googlesource.gerrit.plugins.replication.pull.Source;
import com.googlesource.gerrit.plugins.replication.pull.SourcesCollection;
+import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
-import java.util.stream.IntStream;
+import java.util.Optional;
+import java.util.concurrent.TimeUnit;
import org.eclipse.jgit.lib.Config;
import org.junit.Before;
import org.junit.Test;
@@ -49,8 +54,12 @@
public class PullReplicationTasksHealthCheckTest {
private static final String PLUGIN_NAME = "pull-replication";
private static final String SECTION_NAME = PLUGIN_NAME + HEALTHCHECK_NAME_SUFFIX;
+ private static final String ZERO_PERIOD_OF_TIME = "0 sec";
+ private static final Optional<String> NO_PROJECT_FILTER = Optional.empty();
- private final int periodOfCheckSec = 10;
+ private final int periodOfTimeMillis = 10;
+ private final String periodOfTimeMillisStr = periodOfTimeMillis + " ms";
+ private final FakeTicker fakeTicker = new FakeTicker();
@Mock private SourcesCollection sourcesCollection;
@Mock private Source source;
@@ -64,29 +73,28 @@
@Test
public void shouldReadConfig() {
List<String> projectsToCheck = List.of("foo", "bar/baz");
- Injector injector = testInjector(new TestModule(projectsToCheck, periodOfCheckSec + " sec"));
+ Injector injector = testInjector(new TestModule(projectsToCheck, periodOfTimeMillisStr));
PullReplicationTasksHealthCheck check =
injector.getInstance(PullReplicationTasksHealthCheck.class);
assertThat(check.getProjects()).containsExactlyElementsIn(projectsToCheck);
- assertThat(check.getPeriodOfTimeSec()).isEqualTo(periodOfCheckSec);
+ assertThat(check.getPeriodOfTimeNanos())
+ .isEqualTo(TimeUnit.MILLISECONDS.toNanos(periodOfTimeMillis));
}
@Test
public void shouldOnlyCheckTasksForReposThatMatchTheRepoFilter() {
+ int numIterations = 3;
String repo = "foo";
when(source.zeroPendingTasksForRepo(Project.nameKey(repo))).thenReturn(false).thenReturn(true);
when(source.zeroInflightTasksForRepo(Project.nameKey(repo))).thenReturn(false).thenReturn(true);
lenient().when(source.zeroInflightTasksForRepo(Project.nameKey("ignored"))).thenReturn(false);
lenient().when(source.pendingTasksCount()).thenReturn(10L);
- Injector injector = testInjector(new TestModule(List.of(repo), periodOfCheckSec + " sec"));
+ PullReplicationTasksHealthCheck check = newPullReplicationTasksHealthCheck(Optional.of(repo));
- PullReplicationTasksHealthCheck check =
- injector.getInstance(PullReplicationTasksHealthCheck.class);
-
- List<HealthCheck.Result> checkResults = runNTimes(3, check);
+ List<HealthCheck.Result> checkResults = runNTimes(numIterations, check);
assertThat(checkResults)
.containsExactly(
HealthCheck.Result.FAILED, HealthCheck.Result.FAILED, HealthCheck.Result.PASSED);
@@ -94,13 +102,10 @@
@Test
public void shouldCheckAllOutstandingTasksWhenRepoFilterIsNotConfigured() {
- List<String> noRepoFilter = List.of();
when(source.pendingTasksCount()).thenReturn(1L).thenReturn(0L);
when(source.inflightTasksCount()).thenReturn(1L).thenReturn(0L);
- Injector injector = testInjector(new TestModule(noRepoFilter, periodOfCheckSec + " sec"));
- PullReplicationTasksHealthCheck check =
- injector.getInstance(PullReplicationTasksHealthCheck.class);
+ PullReplicationTasksHealthCheck check = newPullReplicationTasksHealthCheck(NO_PROJECT_FILTER);
List<HealthCheck.Result> checkResults = runNTimes(3, check);
assertThat(checkResults)
@@ -116,9 +121,7 @@
when(anotherSource.pendingTasksCount()).thenReturn(1L).thenReturn(0L);
when(anotherSource.inflightTasksCount()).thenReturn(1L).thenReturn(0L);
- Injector injector = testInjector(new TestModule(List.of(), periodOfCheckSec + " sec"));
- PullReplicationTasksHealthCheck check =
- injector.getInstance(PullReplicationTasksHealthCheck.class);
+ PullReplicationTasksHealthCheck check = newPullReplicationTasksHealthCheck(NO_PROJECT_FILTER);
List<HealthCheck.Result> checkResults = runNTimes(5, check);
assertThat(checkResults)
@@ -130,17 +133,88 @@
HealthCheck.Result.PASSED);
}
- private List<HealthCheck.Result> runNTimes(int nTimes, PullReplicationTasksHealthCheck check) {
- List<HealthCheck.Result> results = new ArrayList<>();
- IntStream.rangeClosed(1, nTimes).mapToObj(i -> check.run().result).forEach(results::add);
+ @Test
+ public void shouldFailIfCheckDoesNotReportHealthyConsistentlyOverPeriodOfTime() {
+ long healthCheckPeriodOfTimeMsec = 50L;
+ String healthCheckPeriodOfTime = healthCheckPeriodOfTimeMsec + " ms";
+ int checkInterations = 3;
+ Duration checkInterval =
+ Duration.ofMillis(healthCheckPeriodOfTimeMsec).dividedBy(checkInterations);
+ long fakeTimerStartNanos = fakeTicker.read();
+ when(source.pendingTasksCount()).thenReturn(0L).thenReturn(1L).thenReturn(0L);
+ when(source.inflightTasksCount()).thenReturn(0L);
- return results;
+ Injector injector = testInjector(new TestModule(List.of(), healthCheckPeriodOfTime));
+ PullReplicationTasksHealthCheck check =
+ injector.getInstance(PullReplicationTasksHealthCheck.class);
+
+ List<HealthCheck.Result> checkResults =
+ runNTimes(checkInterations, check, () -> fakeTicker.advance(checkInterval));
+
+ assertThat(checkResults).doesNotContain(HealthCheck.Result.PASSED);
+ assertThat(fakeTicker.read())
+ .isAtLeast(Duration.ofMillis(periodOfTimeMillis).plusNanos(fakeTimerStartNanos).toNanos());
+ }
+
+ @Test
+ public void shouldFailOnFirstInvocationEvenIfThereAreNoOutstandingTasksAndANonZeroPeriodOfTime() {
+ mockSourceWithNoOutstandingTasks();
+
+ Injector injector = testInjector(new TestModule(List.of(), periodOfTimeMillisStr));
+ PullReplicationTasksHealthCheck check =
+ injector.getInstance(PullReplicationTasksHealthCheck.class);
+
+ assertThat(check.run().result).isEqualTo(HealthCheck.Result.FAILED);
+ }
+
+ @Test
+ public void
+ shouldReportHealthyOnFirstInvocationIfThereAreNoOutstandingTasksAndAZeroPeriodOfTime() {
+ mockSourceWithNoOutstandingTasks();
+
+ Injector injector = testInjector(new TestModule(List.of(), ZERO_PERIOD_OF_TIME));
+ PullReplicationTasksHealthCheck check =
+ injector.getInstance(PullReplicationTasksHealthCheck.class);
+
+ assertThat(check.run().result).isEqualTo(HealthCheck.Result.PASSED);
}
private Injector testInjector(AbstractModule testModule) {
return Guice.createInjector(new HealthCheckExtensionApiModule(), testModule);
}
+ private List<HealthCheck.Result> runNTimes(int nTimes, PullReplicationTasksHealthCheck check) {
+ return runNTimes(nTimes, check, null);
+ }
+
+ private PullReplicationTasksHealthCheck newPullReplicationTasksHealthCheck(
+ Optional<String> projectNameToCheck) {
+ Injector injector =
+ testInjector(new TestModule(projectNameToCheck.stream().toList(), ZERO_PERIOD_OF_TIME));
+
+ PullReplicationTasksHealthCheck check =
+ injector.getInstance(PullReplicationTasksHealthCheck.class);
+ return check;
+ }
+
+ private List<HealthCheck.Result> runNTimes(
+ int nTimes, PullReplicationTasksHealthCheck check, @Nullable Runnable postRunFunc) {
+ List<HealthCheck.Result> results = new ArrayList<>();
+ for (int i = 0; i < nTimes; i++) {
+ results.add(check.run().result);
+ if (postRunFunc != null) {
+ postRunFunc.run();
+ }
+ }
+
+ return results;
+ }
+
+ private void mockSourceWithNoOutstandingTasks() {
+ when(source.pendingTasksCount()).thenReturn(0L);
+ when(source.inflightTasksCount()).thenReturn(0L);
+ }
+
private class TestModule extends AbstractModule {
Config config;
MergedConfigResource configResource;
@@ -175,6 +249,7 @@
bind(HealthCheckConfig.class).toInstance(healthCheckConfig);
bind(String.class).annotatedWith(PluginName.class).toInstance(PLUGIN_NAME);
bind(SourcesCollection.class).toInstance(sourcesCollection);
+ bind(Ticker.class).toInstance(fakeTicker);
}
}
}