# blob: faba38ffa6662d185c602f3bfae0c9cbfb1879af [file] [log] [blame]
# CloudFormation template that creates a single CloudWatch dashboard charting
# Gerrit metrics for two primary instances and one replica instance.
# The format version is quoted so YAML loaders keep it a string, not a date.
AWSTemplateFormatVersion: "2010-09-09"
Description: Deploy a dual-primary CloudWatch dashboard.
# Stack inputs. Every parameter has a default, so the stack can be created
# without supplying any value. Each one is interpolated into the dashboard
# JSON body through Fn::Sub.
Parameters:
  # Value matched against the "InstanceId" metric dimension for primary 1.
  Primary1InstanceId:
    Type: String
    Default: gerrit-dual-primary-PRIMARY1
    Description: Optional identifier for the Gerrit primary1 instance

  # Value matched against the "InstanceId" metric dimension for primary 2.
  Primary2InstanceId:
    Type: String
    Default: gerrit-dual-primary-PRIMARY2
    Description: Optional identifier for the Gerrit primary2 instance

  # Value matched against the "InstanceId" metric dimension for the replica.
  ReplicaInstanceId:
    Type: String
    Default: gerrit-dual-primary-REPLICA
    Description: Optional identifier for the Gerrit replica instance

  # Namespace used as the first element of every Gerrit metric in the
  # dashboard widgets.
  MetricsCloudwatchNamespace:
    Type: String
    Default: gerrit
    Description: The CloudWatch namespace for Gerrit metrics
Resources:
  # One dashboard, named after the stack that creates it.
  #
  # DashboardBody is a JSON document held in a literal block scalar and passed
  # through Fn::Sub, which replaces ${MetricsCloudwatchNamespace},
  # ${Primary1InstanceId}, ${Primary2InstanceId}, ${ReplicaInstanceId} and
  # ${AWS::Region} before the body is handed to CloudWatch. The body is JSON,
  # so no comments can be placed inside it.
  #
  # Default time range is the last six hours ("-PT6H").
  #
  # Widget layout, by "y" row (titles as they appear in the body):
  #   y=0   SSH Connection (single value, full width)
  #   y=3   Cluster CPU Load | Gerrit Cpu Load | Gerrit System Load |
  #         Gerrit Used memory
  #   y=9   GC Time | Active Threads | Git upload pack - count
  #   y=15  HTTP requests latency | HTTP hits per second | % of HTTP Errors
  #   y=21  Scheduled Tasks Queues (full width)
  #   y=27  Scheduled email tasks in the queue (full width)
  #   y=33  Pack files cached | JGit Cache
  #
  # The HTTP and queue widgets chart only the two primaries; the remaining
  # Gerrit widgets also include the replica. "Cluster CPU Load" reads
  # AWS/EC2 CPUUtilization with no dimensions.
  CloudWatchDashboard:
    Type: AWS::CloudWatch::Dashboard
    Properties:
      DashboardName: !Ref AWS::StackName
      DashboardBody: !Sub |
        {
          "start": "-PT6H",
          "periodOverride": "auto",
          "widgets": [
            {
              "type": "metric",
              "x": 0,
              "y": 0,
              "width": 24,
              "height": 3,
              "properties": {
                "metrics": [
                  [ "${MetricsCloudwatchNamespace}", "sshd/sessions/connected", "InstanceId", "${Primary1InstanceId}", "Type", "gauge" ],
                  [ "${MetricsCloudwatchNamespace}", "sshd/sessions/connected", "InstanceId", "${Primary2InstanceId}", "Type", "gauge" ],
                  [ "${MetricsCloudwatchNamespace}", "sshd/sessions/connected", "InstanceId", "${ReplicaInstanceId}", "Type", "gauge" ]
                ],
                "view": "singleValue",
                "region": "${AWS::Region}",
                "stat": "Maximum",
                "period": 60,
                "title": "SSH Connection",
                "stacked": false
              }
            },
            {
              "type": "metric",
              "x": 6,
              "y": 3,
              "width": 6,
              "height": 6,
              "properties": {
                "metrics": [
                  [ { "expression": "RATE(METRICS())", "label": "Expression1", "id": "e1", "region": "${AWS::Region}" } ],
                  [ "${MetricsCloudwatchNamespace}", "proc/cpu/usage", "InstanceId", "${Primary1InstanceId}", "Type", "gauge", { "id": "m1", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "proc/cpu/usage", "InstanceId", "${Primary2InstanceId}", "Type", "gauge", { "id": "m3", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "proc/cpu/usage", "InstanceId", "${ReplicaInstanceId}", "Type", "gauge", { "id": "m2", "visible": false } ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "region": "${AWS::Region}",
                "stat": "Average",
                "period": 60,
                "title": "Gerrit Cpu Load",
                "yAxis": {
                  "right": {
                    "showUnits": true
                  },
                  "left": {
                    "showUnits": false,
                    "min": 0
                  }
                }
              }
            },
            {
              "type": "metric",
              "x": 12,
              "y": 33,
              "width": 12,
              "height": 6,
              "properties": {
                "metrics": [
                  [ "${MetricsCloudwatchNamespace}", "jgit/block_cache/cache_used", "InstanceId", "${Primary1InstanceId}", "Type", "gauge", { "id": "m1" } ],
                  [ "${MetricsCloudwatchNamespace}", "jgit/block_cache/cache_used", "InstanceId", "${Primary2InstanceId}", "Type", "gauge", { "id": "m3" } ],
                  [ "${MetricsCloudwatchNamespace}", "jgit/block_cache/cache_used", "InstanceId", "${ReplicaInstanceId}", "Type", "gauge", { "id": "m2" } ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "region": "${AWS::Region}",
                "yAxis": {
                  "left": {
                    "showUnits": false,
                    "label": ""
                  }
                },
                "title": "JGit Cache",
                "period": 60,
                "stat": "Average"
              }
            },
            {
              "type": "metric",
              "x": 18,
              "y": 3,
              "width": 6,
              "height": 6,
              "properties": {
                "metrics": [
                  [ "${MetricsCloudwatchNamespace}", "jvm.memory.total.used", "InstanceId", "${Primary1InstanceId}", "Type", "gauge" ],
                  [ "${MetricsCloudwatchNamespace}", "jvm.memory.total.used", "InstanceId", "${Primary2InstanceId}", "Type", "gauge" ],
                  [ "${MetricsCloudwatchNamespace}", "jvm.memory.total.used", "InstanceId", "${ReplicaInstanceId}", "Type", "gauge" ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "region": "${AWS::Region}",
                "yAxis": {
                  "left": {
                    "showUnits": false
                  }
                },
                "stat": "Average",
                "period": 60,
                "title": "Gerrit Used memory"
              }
            },
            {
              "type": "metric",
              "x": 12,
              "y": 3,
              "width": 6,
              "height": 6,
              "properties": {
                "metrics": [
                  [ "${MetricsCloudwatchNamespace}", "proc/cpu/system_load", "InstanceId", "${Primary1InstanceId}", "Type", "gauge" ],
                  [ "${MetricsCloudwatchNamespace}", "proc/cpu/system_load", "InstanceId", "${Primary2InstanceId}", "Type", "gauge" ],
                  [ "${MetricsCloudwatchNamespace}", "proc/cpu/system_load", "InstanceId", "${ReplicaInstanceId}", "Type", "gauge" ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "title": "Gerrit System Load",
                "region": "${AWS::Region}",
                "yAxis": {
                  "left": {
                    "showUnits": false
                  }
                },
                "stat": "Average",
                "period": 60
              }
            },
            {
              "type": "metric",
              "x": 0,
              "y": 9,
              "width": 6,
              "height": 6,
              "properties": {
                "metrics": [
                  [ "${MetricsCloudwatchNamespace}", "jvm.gc.G1-Old-Generation.time", "InstanceId", "${Primary1InstanceId}", "Type", "gauge" ],
                  [ "${MetricsCloudwatchNamespace}", "jvm.gc.G1-Old-Generation.time", "InstanceId", "${Primary2InstanceId}", "Type", "gauge" ],
                  [ "${MetricsCloudwatchNamespace}", "jvm.gc.G1-Old-Generation.time", "InstanceId", "${ReplicaInstanceId}", "Type", "gauge" ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "title": "GC Time",
                "region": "${AWS::Region}",
                "stat": "Average",
                "period": 60,
                "yAxis": {
                  "left": {
                    "showUnits": false
                  }
                }
              }
            },
            {
              "type": "metric",
              "x": 6,
              "y": 9,
              "width": 6,
              "height": 6,
              "properties": {
                "metrics": [
                  [ "${MetricsCloudwatchNamespace}", "proc/jvm/thread/num_live", "InstanceId", "${Primary1InstanceId}", "Type", "gauge" ],
                  [ "${MetricsCloudwatchNamespace}", "proc/jvm/thread/num_live", "InstanceId", "${Primary2InstanceId}", "Type", "gauge" ],
                  [ "${MetricsCloudwatchNamespace}", "proc/jvm/thread/num_live", "InstanceId", "${ReplicaInstanceId}", "Type", "gauge" ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "title": "Active Threads",
                "region": "${AWS::Region}",
                "period": 60,
                "stat": "Average",
                "yAxis": {
                  "left": {
                    "showUnits": false
                  }
                }
              }
            },
            {
              "type": "metric",
              "x": 0,
              "y": 15,
              "width": 6,
              "height": 6,
              "properties": {
                "metrics": [
                  [ "${MetricsCloudwatchNamespace}", "http/server/rest_api/server_latency_total", "InstanceId", "${Primary1InstanceId}", "Type", "99.9%", { "id": "m2" } ],
                  [ "${MetricsCloudwatchNamespace}", "http/server/rest_api/server_latency_total", "InstanceId", "${Primary2InstanceId}", "Type", "99.9%", { "id": "m3" } ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "region": "${AWS::Region}",
                "stat": "Average",
                "period": 60,
                "title": "HTTP requests latency",
                "yAxis": {
                  "left": {
                    "showUnits": false,
                    "label": "ms"
                  }
                }
              }
            },
            {
              "type": "metric",
              "x": 6,
              "y": 15,
              "width": 6,
              "height": 6,
              "properties": {
                "metrics": [
                  [ { "expression": "RATE(m1+m2)", "label": "Primary 1 HTTP hits per second", "id": "e2", "region": "${AWS::Region}" } ],
                  [ { "expression": "RATE(m3+m4)", "label": "Primary 2 HTTP hits per second", "id": "e1", "region": "${AWS::Region}" } ],
                  [ "${MetricsCloudwatchNamespace}", "http/server/success_count_total", "InstanceId", "${Primary1InstanceId}", "Type", "count", { "id": "m2", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "http/server/error_count_total", "InstanceId", "${Primary1InstanceId}", "Type", "count", { "id": "m1", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "http/server/success_count_total", "InstanceId", "${Primary2InstanceId}", "Type", "count", { "id": "m3", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "http/server/error_count_total", "InstanceId", "${Primary2InstanceId}", "Type", "count", { "id": "m4", "visible": false } ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "title": "HTTP hits per second",
                "region": "${AWS::Region}",
                "stat": "Sum",
                "period": 60,
                "yAxis": {
                  "left": {
                    "showUnits": false,
                    "label": "ops",
                    "min": 0
                  }
                }
              }
            },
            {
              "type": "metric",
              "x": 12,
              "y": 15,
              "width": 6,
              "height": 6,
              "properties": {
                "metrics": [
                  [ { "expression": "(m1/(m1+m2))*100", "label": "Primary 1 % of HTTP Errors", "id": "e1", "region": "${AWS::Region}" } ],
                  [ { "expression": "(m3/(m3+m4))*100", "label": "Primary 2 % of HTTP Errors", "id": "e2", "region": "${AWS::Region}" } ],
                  [ "${MetricsCloudwatchNamespace}", "http/server/error_count_total", "InstanceId", "${Primary1InstanceId}", "Type", "count", { "id": "m1", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "http/server/success_count_total", "InstanceId", "${Primary1InstanceId}", "Type", "count", { "id": "m2", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "http/server/error_count_total", "InstanceId", "${Primary2InstanceId}", "Type", "count", { "id": "m3", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "http/server/success_count_total", "InstanceId", "${Primary2InstanceId}", "Type", "count", { "id": "m4", "visible": false } ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "region": "${AWS::Region}",
                "stat": "Sum",
                "period": 60,
                "title": "% of HTTP Errors",
                "yAxis": {
                  "left": {
                    "showUnits": false,
                    "max": 100,
                    "min": 0
                  }
                }
              }
            },
            {
              "type": "metric",
              "x": 12,
              "y": 9,
              "width": 6,
              "height": 6,
              "properties": {
                "metrics": [
                  [ "${MetricsCloudwatchNamespace}", "git/upload-pack/request_count_total", "InstanceId", "${Primary1InstanceId}", "Type", "count", { "id": "m1" } ],
                  [ "${MetricsCloudwatchNamespace}", "git/upload-pack/request_count_total", "InstanceId", "${Primary2InstanceId}", "Type", "count", { "id": "m2" } ],
                  [ "${MetricsCloudwatchNamespace}", "git/upload-pack/request_count_total", "InstanceId", "${ReplicaInstanceId}", "Type", "count", { "id": "m3" } ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "region": "${AWS::Region}",
                "title": "Git upload pack - count",
                "stat": "Sum",
                "period": 60,
                "yAxis": {
                  "left": {
                    "showUnits": false,
                    "min": 0
                  }
                }
              }
            },
            {
              "type": "metric",
              "x": 0,
              "y": 21,
              "width": 24,
              "height": 6,
              "properties": {
                "metrics": [
                  [ { "expression": "RATE(METRICS())*PERIOD(m1)", "label": "Expression1", "id": "e1" } ],
                  [ "${MetricsCloudwatchNamespace}", "queue/index_batch/total_scheduled_tasks_count", "InstanceId", "${Primary1InstanceId}", "Type", "gauge", { "id": "m1", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "queue/receive_commits/total_scheduled_tasks_count", "InstanceId", "${Primary1InstanceId}", "Type", "gauge", { "id": "m2", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "queue/work_queue/total_scheduled_tasks_count", "InstanceId", "${Primary1InstanceId}", "Type", "gauge", { "id": "m3", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "queue/ssh_command_start/total_scheduled_tasks_count", "InstanceId", "${Primary1InstanceId}", "Type", "gauge", { "id": "m4", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "queue/index_batch/total_scheduled_tasks_count", "InstanceId", "${Primary2InstanceId}", "Type", "gauge", { "id": "m5", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "queue/receive_commits/total_scheduled_tasks_count", "InstanceId", "${Primary2InstanceId}", "Type", "gauge", { "id": "m6", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "queue/work_queue/total_scheduled_tasks_count", "InstanceId", "${Primary2InstanceId}", "Type", "gauge", { "id": "m7", "visible": false } ],
                  [ "${MetricsCloudwatchNamespace}", "queue/ssh_command_start/total_scheduled_tasks_count", "InstanceId", "${Primary2InstanceId}", "Type", "gauge", { "id": "m8", "visible": false } ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "title": "Scheduled Tasks Queues",
                "region": "${AWS::Region}",
                "stat": "Sum",
                "period": 60,
                "yAxis": {
                  "left": {
                    "showUnits": false,
                    "min": 0
                  }
                }
              }
            },
            {
              "type": "metric",
              "x": 0,
              "y": 27,
              "width": 24,
              "height": 6,
              "properties": {
                "metrics": [
                  [ "${MetricsCloudwatchNamespace}", "queue/send_email/scheduled_tasks", "InstanceId", "${Primary1InstanceId}", "Type", "gauge" ],
                  [ "${MetricsCloudwatchNamespace}", "queue/send_email/scheduled_tasks", "InstanceId", "${Primary2InstanceId}", "Type", "gauge" ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "region": "${AWS::Region}",
                "title": "Scheduled email tasks in the queue",
                "stat": "Sum",
                "period": 300,
                "yAxis": {
                  "left": {
                    "showUnits": false
                  }
                }
              }
            },
            {
              "type": "metric",
              "x": 0,
              "y": 33,
              "width": 12,
              "height": 6,
              "properties": {
                "metrics": [
                  [ "${MetricsCloudwatchNamespace}", "jgit/block_cache/open_files", "InstanceId", "${Primary1InstanceId}", "Type", "gauge" ],
                  [ "${MetricsCloudwatchNamespace}", "jgit/block_cache/open_files", "InstanceId", "${Primary2InstanceId}", "Type", "gauge" ],
                  [ "${MetricsCloudwatchNamespace}", "jgit/block_cache/open_files", "InstanceId", "${ReplicaInstanceId}", "Type", "gauge" ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "title": "Pack files cached",
                "region": "${AWS::Region}",
                "stat": "Sum",
                "period": 60,
                "yAxis": {
                  "left": {
                    "showUnits": false
                  }
                }
              }
            },
            {
              "type": "metric",
              "x": 0,
              "y": 3,
              "width": 6,
              "height": 6,
              "properties": {
                "metrics": [
                  [ "AWS/EC2", "CPUUtilization" ]
                ],
                "view": "timeSeries",
                "stacked": false,
                "region": "${AWS::Region}",
                "title": "Cluster CPU Load",
                "period": 60,
                "stat": "Average"
              }
            }
          ]
        }