Add basic CloudWatch dashboard for master-slave recipe

Feature: Issue 13218
Change-Id: Ie4dbd25de9f279180b570b0da99c96602d123f6e
diff --git a/Configuration.md b/Configuration.md
index bfd73e1..3e5f89b 100644
--- a/Configuration.md
+++ b/Configuration.md
@@ -59,7 +59,7 @@
 plugin.
 
 * `METRICS_CLOUDWATCH_ENABLED`: Optional - Boolean.
-Whether to publish metrics to CloudWatch. Default: false
+Whether to publish metrics to CloudWatch and create a CloudWatch dashboard. Default: false
 * `METRICS_CLOUDWATCH_NAMESPACE`: Optional - String.
 The CloudWatch namespace for Gerrit metrics. Default: _gerrit_
 * `METRICS_CLOUDWATCH_RATE`: Optional - String.
diff --git a/master-slave/Makefile b/master-slave/Makefile
index 03e06c7..a32fd37 100644
--- a/master-slave/Makefile
+++ b/master-slave/Makefile
@@ -6,20 +6,26 @@
 SERVICE_MASTER_TEMPLATE:=cf-service-master.yml
 SERVICE_SLAVE_TEMPLATE:=cf-service-slave.yml
 DNS_ROUTING_TEMPLATE:=cf-dns-route.yml
+CLOUDWATCH_DASHBOARD_TEMPLATE:=cf-dashboard.yml
 AWS_FC_COMMAND=export AWS_PAGER=;aws cloudformation
 
 .PHONY: create-all delete-all \
-				cluster cluster-keys service-master service-slave dns-routing \
-				delete-cluster delete-service-master delete-service-slave delete-dns-routing \
-				wait-for-cluster-creation wait-for-service-master-creation wait-for-dns-routing-creation \
-				wait-for-cluster-deletion wait-for-service-master-deletion wait-for-dns-routing-deletion \
+				cluster cluster-keys service-master service-slave dns-routing dashboard \
+				delete-cluster delete-service-master delete-service-slave delete-dns-routing delete-dashboard \
+				wait-for-cluster-creation wait-for-service-master-creation wait-for-dns-routing-creation wait-for-dashboard-creation \
+				wait-for-cluster-deletion wait-for-service-master-deletion wait-for-dns-routing-deletion wait-for-dashboard-deletion \
 				gerrit-build gerrit-publish
 
+ifeq ($(METRICS_CLOUDWATCH_ENABLED),true)
+optional_dashboard_targets=dashboard wait-for-dashboard-creation
+endif
+
 create-all: upload-common-templates \
 						gerrit-publish git-daemon-publish git-ssh-publish \
 						cluster wait-for-cluster-creation \
 						service-slave service-master \
 						wait-for-service-master-creation wait-for-service-slave-creation \
+						$(optional_dashboard_targets) \
 						dns-routing wait-for-dns-routing-creation
 
 cluster: cluster-keys
@@ -129,6 +135,31 @@
 		ParameterKey=MasterServiceStackName,ParameterValue=$(SERVICE_MASTER_STACK_NAME) \
 		ParameterKey=SlaveServiceStackName,ParameterValue=$(SERVICE_SLAVE_STACK_NAME)
 
+dashboard:
+ifeq ($(METRICS_CLOUDWATCH_ENABLED),true)
+ifdef GERRIT_MASTER_INSTANCE_ID
+	$(eval DASHBOARD_OPTIONAL_PARAMS := $(DASHBOARD_OPTIONAL_PARAMS) ParameterKey=MasterInstanceId,ParameterValue=$(GERRIT_MASTER_INSTANCE_ID))
+endif
+ifdef GERRIT_SLAVE_INSTANCE_ID
+	$(eval DASHBOARD_OPTIONAL_PARAMS := $(DASHBOARD_OPTIONAL_PARAMS) ParameterKey=SlaveInstanceId,ParameterValue=$(GERRIT_SLAVE_INSTANCE_ID))
+endif
+ifdef METRICS_CLOUDWATCH_NAMESPACE
+	$(eval DASHBOARD_OPTIONAL_PARAMS := $(DASHBOARD_OPTIONAL_PARAMS) ParameterKey=MetricsCloudwatchNamespace,ParameterValue=$(METRICS_CLOUDWATCH_NAMESPACE))
+endif
+# Emit --parameters only when an optional parameter is set: the AWS CLI rejects the flag without a value.
+	$(AWS_FC_COMMAND) create-stack \
+		--stack-name $(DASHBOARD_STACK_NAME) \
+		--capabilities CAPABILITY_IAM  \
+		--template-body file://`pwd`/$(CLOUDWATCH_DASHBOARD_TEMPLATE) \
+		--region $(AWS_REGION) \
+		$(if $(strip $(DASHBOARD_OPTIONAL_PARAMS)),--parameters) \
+		$(DASHBOARD_OPTIONAL_PARAMS)
+
+else
+		@echo "METRICS_CLOUDWATCH_ENABLED is set to false. Dashboard creation skipped".
+endif
+
+
 wait-for-cluster-creation:
 	@echo "*** Wait for cluster stack '$(CLUSTER_STACK_NAME)' creation"
 	$(AWS_FC_COMMAND) wait stack-create-complete \
@@ -157,6 +188,13 @@
 	--region $(AWS_REGION)
 	@echo "*** DNS Routing stack '$(DNS_ROUTING_STACK_NAME)' created"
 
+wait-for-dashboard-creation:
+	@echo "*** Wait for dashboard stack '$(DASHBOARD_STACK_NAME)' creation"
+	$(AWS_FC_COMMAND) wait stack-create-complete \
+	--stack-name $(DASHBOARD_STACK_NAME) \
+	--region $(AWS_REGION)
+	@echo "*** Dashboard stack '$(DASHBOARD_STACK_NAME)' created"
+
 wait-for-cluster-deletion:
 	@echo "*** Wait for cluster stack '$(CLUSTER_STACK_NAME)' deletion"
 	$(AWS_FC_COMMAND) wait stack-delete-complete \
@@ -185,6 +223,13 @@
 	--region $(AWS_REGION)
 	@echo "*** DNS routing stack '$(DNS_ROUTING_STACK_NAME)' deleted"
 
+wait-for-dashboard-deletion:
+	@echo "*** Wait for dashboard stack '$(DASHBOARD_STACK_NAME)' deletion"
+	$(AWS_FC_COMMAND) wait stack-delete-complete \
+	--stack-name $(DASHBOARD_STACK_NAME) \
+	--region $(AWS_REGION)
+	@echo "*** Dashboard stack '$(DASHBOARD_STACK_NAME)' deleted"
+
 delete-cluster:
 	$(AWS_FC_COMMAND) delete-stack \
 	--stack-name $(CLUSTER_STACK_NAME) \
@@ -205,10 +250,16 @@
 	--stack-name $(DNS_ROUTING_STACK_NAME) \
 	--region $(AWS_REGION)
 
+delete-dashboard:
+	$(AWS_FC_COMMAND) delete-stack \
+	--stack-name $(DASHBOARD_STACK_NAME) \
+	--region $(AWS_REGION)
+
 delete-all: delete-dns-routing wait-for-dns-routing-deletion \
 						delete-service-slave wait-for-service-slave-deletion \
 						delete-service-master wait-for-service-master-deletion \
-						delete-cluster wait-for-cluster-deletion
+						delete-cluster wait-for-cluster-deletion \
+						delete-dashboard wait-for-dashboard-deletion
 
 gerrit-publish:
 	$(MAKE) -C ../gerrit gerrit-publish RECIPE=master-slave
diff --git a/master-slave/README.md b/master-slave/README.md
index 4338ac8..2581e05 100644
--- a/master-slave/README.md
+++ b/master-slave/README.md
@@ -5,11 +5,12 @@
 
 ## Architecture
 
-Four templates are provided in this example:
+Five templates are provided in this example:
 * `cf-cluster`: define the ECS cluster and the networking stack
 * `cf-service-master`: define the service stack running Gerrit master
 * `cf-service-slave`: define the service stack running Gerrit slave
 * `cf-dns-route`: define the DNS routing for the service
+* `cf-dashboard`: define the CloudWatch dashboard for the services
 
 ### Networking
 
@@ -72,6 +73,7 @@
 
 * `SERVICE_MASTER_STACK_NAME`: Optional. Name of the master service stack. `gerrit-service-master` by default.
 * `SERVICE_SLAVE_STACK_NAME`: Optional. Name of the slave service stack. `gerrit-service-slave` by default.
+* `DASHBOARD_STACK_NAME`: Optional. Name of the dashboard stack. `gerrit-dashboard` by default.
 * `MASTER_SUBDOMAIN`: Optional. Name of the master sub domain. `gerrit-master-demo` by default.
 * `SLAVE_SUBDOMAIN`: Optional. Name of the slave sub domain. `gerrit-slave-demo` by default.
 * `CLUSTER_DESIRED_CAPACITY`: Optional. Number of EC2 instances composing the cluster. `1` by default.
diff --git a/master-slave/cf-dashboard.yml b/master-slave/cf-dashboard.yml
new file mode 100644
index 0000000..66120ea
--- /dev/null
+++ b/master-slave/cf-dashboard.yml
@@ -0,0 +1,392 @@
+AWSTemplateFormatVersion: '2010-09-09'
+Description: Deploy a Master-Slave CloudWatch dashboard.
+Parameters:
+  MasterInstanceId:
+      Description: Optional identifier for the Gerrit master instance
+      Default: gerrit-master-slave-MASTER
+      Type: String
+  SlaveInstanceId:
+      Description: Optional identifier for the Gerrit slave instance
+      Default: gerrit-master-slave-SLAVE
+      Type: String
+  MetricsCloudwatchNamespace:
+      Description: The CloudWatch namespace for Gerrit metrics
+      Type: String
+      Default: gerrit
+Resources:
+  CloudWatchDashboard:
+      Type: AWS::CloudWatch::Dashboard
+      Properties:
+          DashboardName: !Ref AWS::StackName
+          DashboardBody: !Sub |
+                {
+                    "start": "-PT6H",
+                    "periodOverride": "auto",
+                    "widgets": [
+                        {
+                            "type": "metric",
+                            "x": 0,
+                            "y": 0,
+                            "width": 24,
+                            "height": 3,
+                            "properties": {
+                                "metrics": [
+                                    [ "${MetricsCloudwatchNamespace}", "sshd/sessions/connected", "InstanceId", "${MasterInstanceId}", "Type", "gauge" ],
+                                    [ "${MetricsCloudwatchNamespace}", "sshd/sessions/connected", "InstanceId", "${SlaveInstanceId}", "Type", "gauge" ]
+                                ],
+                                "view": "singleValue",
+                                "region": "${AWS::Region}",
+                                "stat": "Maximum",
+                                "period": 60,
+                                "title": "SSH Connection",
+                                "stacked": false
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 6,
+                            "y": 3,
+                            "width": 6,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ { "expression": "RATE(METRICS())", "label": "Expression1", "id": "e1", "region": "${AWS::Region}" } ],
+                                    [ "${MetricsCloudwatchNamespace}", "proc/cpu/usage", "InstanceId", "${MasterInstanceId}", "Type", "gauge", { "id": "m1", "visible": false } ],
+                                    [ "${MetricsCloudwatchNamespace}", "proc/cpu/usage", "InstanceId", "${SlaveInstanceId}", "Type", "gauge", { "id": "m2", "visible": false } ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": false,
+                                "region": "${AWS::Region}",
+                                "stat": "Average",
+                                "period": 60,
+                                "title": "Gerrit Cpu Load",
+                                "yAxis": {
+                                    "right": {
+                                        "showUnits": true
+                                    },
+                                    "left": {
+                                        "showUnits": false,
+                                        "min": 0
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 12,
+                            "y": 33,
+                            "width": 12,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ "${MetricsCloudwatchNamespace}", "jgit/block_cache/cache_used", "InstanceId", "${MasterInstanceId}", "Type", "gauge", { "id": "m1" } ],
+                                    [ "${MetricsCloudwatchNamespace}", "jgit/block_cache/cache_used", "InstanceId", "${SlaveInstanceId}", "Type", "gauge", { "id": "m2" } ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": false,
+                                "region": "${AWS::Region}",
+                                "yAxis": {
+                                    "left": {
+                                        "showUnits": false,
+                                        "label": ""
+                                    }
+                                },
+                                "title": "JGit Cache",
+                                "period": 60,
+                                "stat": "Average"
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 18,
+                            "y": 3,
+                            "width": 6,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ "${MetricsCloudwatchNamespace}", "jvm.memory.total.used", "InstanceId", "${MasterInstanceId}", "Type", "gauge" ],
+                                    [ "${MetricsCloudwatchNamespace}", "jvm.memory.total.used", "InstanceId", "${SlaveInstanceId}", "Type", "gauge" ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": false,
+                                "region": "${AWS::Region}",
+                                "yAxis": {
+                                    "left": {
+                                        "showUnits": false
+                                    }
+                                },
+                                "stat": "Average",
+                                "period": 60,
+                                "title": "Gerrit Used memory"
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 12,
+                            "y": 3,
+                            "width": 6,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ "${MetricsCloudwatchNamespace}", "proc/cpu/system_load", "InstanceId", "${MasterInstanceId}", "Type", "gauge" ],
+                                    [ "${MetricsCloudwatchNamespace}", "proc/cpu/system_load", "InstanceId", "${SlaveInstanceId}", "Type", "gauge" ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": false,
+                                "title": "Gerrit System Load",
+                                "region": "${AWS::Region}",
+                                "yAxis": {
+                                    "left": {
+                                        "showUnits": false
+                                    }
+                                },
+                                "stat": "Average",
+                                "period": 60
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 0,
+                            "y": 9,
+                            "width": 6,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ "${MetricsCloudwatchNamespace}", "jvm.gc.G1-Old-Generation.time",  "InstanceId", "${MasterInstanceId}", "Type", "gauge" ],
+                                    [ "${MetricsCloudwatchNamespace}", "jvm.gc.G1-Old-Generation.time",  "InstanceId", "${SlaveInstanceId}", "Type", "gauge" ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": false,
+                                "title": "GC Time",
+                                "region": "${AWS::Region}",
+                                "stat": "Average",
+                                "period": 60,
+                                "yAxis": {
+                                    "left": {
+                                        "showUnits": false
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 6,
+                            "y": 9,
+                            "width": 6,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ "${MetricsCloudwatchNamespace}", "proc/jvm/thread/num_live",  "InstanceId", "${MasterInstanceId}", "Type", "gauge" ],
+                                    [ "${MetricsCloudwatchNamespace}", "proc/jvm/thread/num_live",  "InstanceId", "${SlaveInstanceId}", "Type", "gauge" ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": false,
+                                "title": "Active Threads",
+                                "region": "${AWS::Region}",
+                                "period": 60,
+                                "stat": "Average",
+                                "yAxis": {
+                                    "left": {
+                                        "showUnits": false
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 0,
+                            "y": 15,
+                            "width": 6,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ "${MetricsCloudwatchNamespace}", "http/server/rest_api/server_latency_total", "InstanceId", "${MasterInstanceId}", "Type", "99.9%", { "id": "m2" } ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": true,
+                                "region": "${AWS::Region}",
+                                "stat": "Average",
+                                "period": 60,
+                                "title": "HTTP requests latency",
+                                "yAxis": {
+                                    "left": {
+                                        "showUnits": false,
+                                        "label": "ms"
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 6,
+                            "y": 15,
+                            "width": 6,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ { "expression": "m1+m2", "label": "HTTP hits per second", "id": "e2", "region": "${AWS::Region}" } ],
+                                    [ "${MetricsCloudwatchNamespace}", "http/server/success_count_total", "InstanceId", "${MasterInstanceId}", "Type", "count", { "id": "m2", "visible": false } ],
+                                    [ ".", "http/server/error_count_total", ".", ".", ".", ".", { "id": "m1", "visible": false } ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": true,
+                                "title": "HTTP hits per second",
+                                "region": "${AWS::Region}",
+                                "stat": "Sum",
+                                "period": 60,
+                                "yAxis": {
+                                    "left": {
+                                        "showUnits": false,
+                                        "label": "ops",
+                                        "min": 0
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 12,
+                            "y": 15,
+                            "width": 6,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ { "expression": "(m1/(m1+m2))*100", "label": "% of HTTP Errors", "id": "e1", "region": "${AWS::Region}" } ],
+                                    [ "${MetricsCloudwatchNamespace}", "http/server/error_count_total", "InstanceId", "${MasterInstanceId}", "Type", "count", { "id": "m1", "visible": false } ],
+                                    [ ".", "http/server/success_count_total", ".", ".", ".", ".", { "id": "m2", "visible": false } ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": true,
+                                "region": "${AWS::Region}",
+                                "stat": "Sum",
+                                "period": 60,
+                                "title": "% of HTTP Errors",
+                                "yAxis": {
+                                    "left": {
+                                        "showUnits": false,
+                                        "max": 100,
+                                        "min": 0
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 12,
+                            "y": 9,
+                            "width": 6,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ "${MetricsCloudwatchNamespace}", "git/upload-pack/request_count_total", "InstanceId", "${MasterInstanceId}", "Type", "count", { "id": "m1" } ],
+                                    [ "${MetricsCloudwatchNamespace}", "git/upload-pack/request_count_total", "InstanceId", "${SlaveInstanceId}", "Type", "count", { "id": "m2" } ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": false,
+                                "region": "${AWS::Region}",
+                                "title": "Git upload pack - count",
+                                "stat": "Sum",
+                                "period": 60,
+                                "yAxis": {
+                                    "left": {
+                                        "showUnits": false,
+                                        "min": 0
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 0,
+                            "y": 21,
+                            "width": 24,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ { "expression": "RATE(METRICS())*PERIOD(m1)", "label": "Expression1", "id": "e1" } ],
+                                    [ "${MetricsCloudwatchNamespace}", "queue/index_batch/total_scheduled_tasks_count", "InstanceId", "${MasterInstanceId}", "Type", "gauge", { "id": "m1", "visible": false } ],
+                                    [ ".", "queue/receive_commits/total_scheduled_tasks_count", ".", ".", ".", ".", { "id": "m2", "visible": false } ],
+                                    [ ".", "queue/work_queue/total_scheduled_tasks_count", ".", ".", ".", ".", { "id": "m3", "visible": false } ],
+                                    [ ".", "queue/ssh_command_start/total_scheduled_tasks_count", ".", ".", ".", ".", { "id": "m4", "visible": false } ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": true,
+                                "title": "Scheduled Tasks Queues",
+                                "region": "${AWS::Region}",
+                                "stat": "Sum",
+                                "period": 60,
+                                "yAxis": {
+                                    "left": {
+                                        "showUnits": false,
+                                        "min": 0
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 0,
+                            "y": 27,
+                            "width": 24,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ "${MetricsCloudwatchNamespace}", "queue/send_email/scheduled_tasks", "InstanceId", "${MasterInstanceId}", "Type", "gauge" ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": true,
+                                "region": "${AWS::Region}",
+                                "title": "Scheduled email tasks in the queue",
+                                "stat": "Sum",
+                                "period": 300,
+                                "yAxis": {
+                                    "left": {
+                                        "showUnits": false
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 0,
+                            "y": 33,
+                            "width": 12,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ "${MetricsCloudwatchNamespace}", "jgit/block_cache/open_files", "InstanceId", "${MasterInstanceId}", "Type", "gauge" ],
+                                    [ "${MetricsCloudwatchNamespace}", "jgit/block_cache/open_files", "InstanceId", "${SlaveInstanceId}", "Type", "gauge" ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": false,
+                                "title": "Pack files cached",
+                                "region": "${AWS::Region}",
+                                "stat": "Sum",
+                                "period": 60,
+                                "yAxis": {
+                                    "left": {
+                                        "showUnits": false
+                                    }
+                                }
+                            }
+                        },
+                        {
+                            "type": "metric",
+                            "x": 0,
+                            "y": 3,
+                            "width": 6,
+                            "height": 6,
+                            "properties": {
+                                "metrics": [
+                                    [ "AWS/EC2", "CPUUtilization" ]
+                                ],
+                                "view": "timeSeries",
+                                "stacked": false,
+                                "region": "${AWS::Region}",
+                                "title": "Cluster CPU Load",
+                                "period": 60,
+                                "stat": "Average"
+                            }
+                        }
+                    ]
+                }
diff --git a/master-slave/setup.env.template b/master-slave/setup.env.template
index 4a295ff..98389ae 100644
--- a/master-slave/setup.env.template
+++ b/master-slave/setup.env.template
@@ -6,6 +6,7 @@
 SERVICE_GRAFANA_STACK_NAME:=$(AWS_PREFIX)-grafana
 DNS_ROUTING_STACK_NAME:=$(AWS_PREFIX)-dns-routing
 DNS_ROUTING_MONITORING_STACK_NAME:=$(AWS_PREFIX)-monitoring-dns-routing
+DASHBOARD_STACK_NAME:=$(AWS_PREFIX)-dashboard
 HOSTED_ZONE_NAME:=yourcompany.com
 MASTER_SUBDOMAIN:=$(AWS_PREFIX)-master.gerrit-demo
 SLAVE_SUBDOMAIN:=$(AWS_PREFIX)-slave.gerrit-demo