Add healthcheck ping and dashboard for Gerrit

The healthcheck plugin for Gerrit provides a convenient way to determine
the health of different functionalities and components of Gerrit. If
the endpoint provided by the plugin is pinged, it will execute a set
of checks and return either 200 if all checks passed or 500 if at least
one failed. It will also provide metrics that can be scraped by
Prometheus.

This change adds the option for Gerrit installations outside of Kubernetes
to install a sidecar container in the Prometheus deployment that pings
the healthcheck plugin's endpoint every 30 s, thereby triggering the
checks. This is not provided for Kubernetes, since there the ping should
be the task of the Kubernetes liveness probes.

The change additionally adds a dashboard displaying the status of the
healthcheck for each Gerrit instance over time.

Change-Id: Ieeedc4406b642e542c89679a8314d771ca0928af
diff --git a/README.md b/README.md
index 465fff0..bc99c97 100644
--- a/README.md
+++ b/README.md
@@ -167,13 +167,14 @@
   Gerrit installations with just one replica that can run anywhere, where they
   are reachable via HTTP.
 
-| option                                         | description                                                                        |
-|------------------------------------------------|------------------------------------------------------------------------------------|
-| `gerritServers.other.[*].host`                 | Hostname (incl. port, if required) of the Gerrit server to monitor                 |
-| `gerritServers.other.[*].username`             | Username of Gerrit user with 'View Metrics' capabilities                           |
-| `gerritServers.other.[*].password`             | Password of Gerrit user with 'View Metrics' capabilities                           |
-| `gerritServers.other.[*].promtail.storagePath` | Path to directory, where Promtail is allowed to save files (e.g. `positions.yaml`) |
-| `gerritServers.other.[*].promtail.logPath`     | Path to directory containing the Gerrit logs (e.g. `/var/gerrit/logs`)             |
+| option                                         | description                                                                                  |
+|------------------------------------------------|----------------------------------------------------------------------------------------------|
+| `gerritServers.other.[*].host`                 | Hostname (incl. port, if required) of the Gerrit server to monitor                           |
+| `gerritServers.other.[*].username`             | Username of Gerrit user with 'View Metrics' capabilities                                     |
+| `gerritServers.other.[*].password`             | Password of Gerrit user with 'View Metrics' capabilities                                     |
+| `gerritServers.other.[*].healthcheck`          | Whether to deploy a container that regularly pings the healthcheck plugin endpoint in Gerrit |
+| `gerritServers.other.[*].promtail.storagePath` | Path to directory, where Promtail is allowed to save files (e.g. `positions.yaml`)           |
+| `gerritServers.other.[*].promtail.logPath`     | Path to directory containing the Gerrit logs (e.g. `/var/gerrit/logs`)                       |
 
 
 ### Encryption
diff --git a/charts/prometheus/prometheus.yaml b/charts/prometheus/prometheus.yaml
index 7cda71c..e7f4045 100644
--- a/charts/prometheus/prometheus.yaml
+++ b/charts/prometheus/prometheus.yaml
@@ -561,6 +561,25 @@
 
   name: server
   sidecarContainers:
+  #@ for instance in data.values.gerritServers.other:
+  #@ if instance.healthcheck:
+  - name: #@ "{}-health-ping".format(instance.host).replace('.', '-').replace(':', '-')
+    image: curlimages/curl:7.73.0
+    command:
+    - "watch"
+    - "-n"
+    - "30" #! seconds between two pings of the healthcheck endpoint
+    args:
+    - #@ "curl -Lk https://{}/config/server/healthcheck~status || echo 'Healthcheck failed'".format(instance.host)
+    resources: #! the container only runs curl periodically, so the budget is kept small
+      limits:
+        cpu: 50m
+        memory: 128Mi
+      requests:
+        cpu: 10m
+        memory: 32Mi
+  #@ end
+  #@ end
 
   ## Prometheus server container image
   ##
diff --git a/config.yaml b/config.yaml
index 41f2057..a59baa5 100644
--- a/config.yaml
+++ b/config.yaml
@@ -12,6 +12,7 @@
   - host: gerrit.example.com
     username: admin
     password: secret
+    healthcheck: false
     promtail:
       storagePath: /var/promtail
       logPath: /var/gerrit/logs
diff --git a/dashboards/gerrit/healthcheck/gerrit-healthcheck.jsonnet b/dashboards/gerrit/healthcheck/gerrit-healthcheck.jsonnet
new file mode 100644
index 0000000..54678e2
--- /dev/null
+++ b/dashboards/gerrit/healthcheck/gerrit-healthcheck.jsonnet
@@ -0,0 +1,71 @@
+local grafana = import '../../../vendor/grafonnet/grafana.libsonnet';
+local dashboard = grafana.dashboard;
+local row = grafana.row;
+local template = grafana.template;
+
+local defaults = import '../../globals/defaults.libsonnet';
+local gridPos = import '../../globals/grid_pos.libsonnet';
+local publishVariables = import '../../globals/publish.libsonnet';
+local variables = import '../globals/variables.libsonnet';
+
+local current_healthcheck_panel = import './panels/current-healthcheck.libsonnet';
+local timeseries_healthcheck_panel = import './panels/timeseries-healthcheck.libsonnet';
+
+local HEALTHCHECKS = [  // NOTE(review): not referenced anywhere in this file - confirm whether it is still needed
+  'activeworkers',
+  'auth',
+  'deadlock',
+  'httpactiveworkers',
+  'jgit',
+  'projectslist',
+  'querychanges',
+  'reviewdb'
+];
+
+dashboard.new(  // the value of this file: the 'Gerrit - Healthcheck' dashboard
+  'Gerrit - Healthcheck',
+  tags=['gerrit'],
+  schemaVersion=defaults.dashboards.schemaVersion,  // common dashboard settings are read from globals/defaults.libsonnet
+  editable=defaults.dashboards.editable,
+  time_from=defaults.dashboards.timeFrom,
+  time_to=defaults.dashboards.timeTo,
+  refresh=defaults.dashboards.refresh,
+  graphTooltip='shared_tooltip',
+)
+.addTemplate(variables.instance)  // supplies $instance used by the panel queries
+.addTemplate(variables.replica)  // supplies $replica used by the panel queries
+.addTemplate(
+  template.new(
+    name='check',  // referenced as ${check} by both panels and by the repeat settings below
+    datasource='Prometheus',
+    query='metrics(^plugins_healthcheck_.+_failure_total$)',  // metric names of the healthcheck failure counters
+    regex='plugins_healthcheck_(.+)_failure_total',  // the capture group isolates the check name within the metric name
+    label='Check',
+    multi=true,
+    includeAll=true,
+    refresh='time',
+  )
+)
+.addPanel(
+  row.new(title='CURRENT'),  // heading row for the stat panels
+  gridPos={x: 0, y: 0},
+)
+.addPanel(
+  current_healthcheck_panel.new() + {  // stat panel overlaid with repeat settings
+    repeat: 'check',  // repeated over the values of the 'check' variable
+    repeatDirection: 'h',
+    maxPerRow: 8,
+  },
+  gridPos={w: 3, h: 6})
+.addPanel(
+  row.new(title='OVER TIME'),  // heading row for the graph panels
+  gridPos={x: 0, y: 6},
+)
+.addPanel(
+  timeseries_healthcheck_panel.new() + {  // graph panel overlaid with repeat settings
+    repeat: 'check',  // repeated over the values of the 'check' variable
+    repeatDirection: 'h',
+    maxPerRow: 3,
+  },
+  gridPos={x: 0, y: 6, w: 8, h: 6})
++ if std.extVar('publish') then publishVariables else {}  // merge publishVariables only when the 'publish' external variable is true
diff --git a/dashboards/gerrit/healthcheck/panels/current-healthcheck.libsonnet b/dashboards/gerrit/healthcheck/panels/current-healthcheck.libsonnet
new file mode 100644
index 0000000..a46ca23
--- /dev/null
+++ b/dashboards/gerrit/healthcheck/panels/current-healthcheck.libsonnet
@@ -0,0 +1,50 @@
+local grafana = import '../../../../vendor/grafonnet/grafana.libsonnet';
+local statPanel = grafana.statPanel;
+local prometheus = grafana.prometheus;
+
+local defaults = import '../../../globals/defaults.libsonnet';
+
+{
+  new():: statPanel.new(  // hidden method without parameters that builds the stat panel for one healthcheck
+    colorMode='background',
+    datasource=defaults.datasource,
+    decimals=2,
+    displayName='${check}',  // ${check} is a dashboard template variable, not a Jsonnet value
+    graphMode='none',
+    title='',
+  )
+  .addTarget(
+    prometheus.target(
+      '1-clamp_max(increase(plugins_healthcheck_${check}_failure_total{instance="$instance",replica="$replica"}[2m]), 1)',  // 1 minus the 2m increase of the failure counter, with the increase capped at 1
+      instant=true,
+    )
+  )
+  .addThresholds([  // red is the base color, green applies from a value of 1
+    {
+      "color": "dark-red",
+      "value": null
+    },
+    {
+      "color": "semi-dark-green",
+      "value": 1
+    }
+  ])
+  .addMappings([  // display the value 1 as 'ok' and the value 0 as 'failed'
+    {
+      "from": "",
+      "id": 1,
+      "text": "ok",
+      "to": "",
+      "type": 1,
+      "value": "1"
+    },
+    {
+      "from": "",
+      "id": 2,
+      "text": "failed",
+      "to": "",
+      "type": 1,
+      "value": "0"
+    }
+  ])
+}
diff --git a/dashboards/gerrit/healthcheck/panels/timeseries-healthcheck.libsonnet b/dashboards/gerrit/healthcheck/panels/timeseries-healthcheck.libsonnet
new file mode 100644
index 0000000..dbb939a
--- /dev/null
+++ b/dashboards/gerrit/healthcheck/panels/timeseries-healthcheck.libsonnet
@@ -0,0 +1,22 @@
+local grafana = import '../../../../vendor/grafonnet/grafana.libsonnet';
+local graphPanel = grafana.graphPanel;
+local prometheus = grafana.prometheus;
+
+local defaults = import '../../../globals/defaults.libsonnet';
+
+{
+  new():: graphPanel.new(  // hidden method without parameters that builds the graph panel for one healthcheck
+    datasource=defaults.datasource,
+    decimals=0,
+    fill=5,
+    min=0,  // the query result lies between 0 and 1, so the axis is fixed to that range
+    max=1,
+    title='${check}',  // ${check} is a dashboard template variable, not a Jsonnet value
+  )
+  .addTarget(
+    prometheus.target(
+      '1-clamp_max(increase(plugins_healthcheck_${check}_failure_total{instance="$instance",replica="$replica"}[2m]), 1)',  // 1 minus the 2m increase of the failure counter, with the increase capped at 1
+      legendFormat='${check}'
+    )
+  )
+}