dual-primary: auto-scaling policy for replicas
Allow replicas to scale in and out based on average CPU value.
Bug: Issue 14211
Change-Id: I3a5cc90139cd0f9ecbdea6786bfdb65491e92c8f
diff --git a/dual-primary/Makefile b/dual-primary/Makefile
index 933ce14..88ac236 100644
--- a/dual-primary/Makefile
+++ b/dual-primary/Makefile
@@ -50,7 +50,9 @@
$(optional_git_gc_targets_creation) \
dns-routing wait-for-dns-routing-creation
-cluster: cluster-keys set-optional-gerrit-primary-volume set-optional-params-for-replica-filesystem
+cluster: cluster-keys set-optional-gerrit-primary-volume \
+ set-optional-params-for-replica-filesystem \
+ set-optional-params-for-replica-auto-scaling-capacity
ifdef CLUSTER_INSTANCE_TYPE
$(eval CLUSTER_OPTIONAL_PARAMS := $(CLUSTER_OPTIONAL_PARAMS) ParameterKey=InstanceType,ParameterValue=$(CLUSTER_INSTANCE_TYPE))
endif
@@ -92,7 +94,8 @@
ParameterKey=SubnetIdProp,ParameterValue=$(SUBNET_ID) \
$(CLUSTER_OPTIONAL_PARAMS) \
$(GERRIT_OPTIONAL_PRIMARY_VOLUME) \
- $(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM)
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM) \
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY)
service-primary-1: set-optional-params-metrics-cloudwatch set-optional-params-smtp \
set-optional-params-multisite set-ldap-account-pattern \
@@ -227,7 +230,13 @@
ParameterKey=HostedZoneName,ParameterValue=$(HOSTED_ZONE_NAME) \
ParameterKey=GitReplicationSubdomain,ParameterValue=$(GIT_REPLICATION_SUBDOMAIN)
-service-replica: set-optional-params-metrics-cloudwatch set-ldap-account-pattern set-optional-gerrit-ulimits set-optional-jgit-conf
+service-replica: set-optional-params-metrics-cloudwatch \
+ set-ldap-account-pattern \
+ set-optional-gerrit-ulimits \
+ set-optional-jgit-conf \
+ set-optional-params-for-replica-auto-scaling-capacity \
+ set-optional-params-for-replica-auto-scaling-policy
+
ifdef GERRIT_REPLICA_INSTANCE_ID
$(eval REPLICA_SERVICE_OPTIONAL_PARAMS := $(REPLICA_SERVICE_OPTIONAL_PARAMS) ParameterKey=InstanceId,ParameterValue=$(GERRIT_REPLICA_INSTANCE_ID))
endif
@@ -261,7 +270,9 @@
$(LDAP_ACCOUNT_PATTERN_PARAM) \
$(REPLICA_SERVICE_OPTIONAL_PARAMS) \
$(METRICS_CW_OPTIONAL_PARAMS) \
- $(GERRIT_ULIMITS)
+ $(GERRIT_ULIMITS) \
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY)
service-lb:
ifdef LOAD_BALANCER_SCHEME
diff --git a/dual-primary/README.md b/dual-primary/README.md
index baa0874..7552d79 100644
--- a/dual-primary/README.md
+++ b/dual-primary/README.md
@@ -242,6 +242,55 @@
* `REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS`: Optional. Only used when `REPLICA_FILESYSTEM_THROUGHPUT_MODE` is set to `provisioned`.
default: `256`.
+##### Auto Scaling of replica instances
+
+Gerrit replicas can scale in or out automatically to accommodate increases or
+decreases in traffic. Such traffic typically comes from build or test jobs
+executed by an automated build pipeline.
+
+Since they all [share the same git data over EFS](#shared-filesystem-for-replicas),
+replicas are immediately ready to serve traffic as soon as they come up and
+register behind the load balancer.
+
+There is a 1-to-1 relationship between replica tasks and EC2 instances: each EC2
+instance in the 'replica' ASG runs one and only one replica task.
+Because of this, the capacity settings for replicas (minimum, desired and
+maximum) configure both the capacity of tasks and the capacity of the ASG,
+since the two always need to be in sync.
+
+The scaling policy adds or removes capacity as required to keep the average CPU
+usage (of the replica service) close to the specified target value.
+
+These are the available settings:
+
+* `REPLICA_AUTOSCALING_MIN_CAPACITY`: Optional. The minimum number of tasks that
+replicas should scale in to. This is also the minimum number of EC2 instances in
+the replica ASG.
+default: *1*
+
+* `REPLICA_AUTOSCALING_DESIRED_CAPACITY`: Optional. The desired number of
+replica tasks to run. This is also the desired number of EC2 instances in the
+replica ASG.
+default: *1*
+
+* `REPLICA_AUTOSCALING_MAX_CAPACITY`: Optional. The maximum number of tasks that
+replicas should scale out to. This is also the maximum number of EC2 instances
+in the replica ASG.
+default: *2*
+
+* `REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN`: Optional. The amount of time, in
+seconds, after a scale-in activity completes before another scale-in activity
+can start.
+default: *300* seconds
+
+* `REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN`: Optional. The amount of time, in
+seconds, to wait for a previous scale-out activity to take effect.
+default: *300* seconds
+
+* `REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE`: Optional. Aggregate CPU utilization target for auto-scaling.
+Auto-scaling will add or remove tasks in the replica service to be as close as possible to this value.
+default: *75*
+
#### REPLICATION SERVICE
* `REPLICATION_SERVICE_ENABLED`: Optional. Whether to expose a replication endpoint.
diff --git a/dual-primary/cf-cluster.yml b/dual-primary/cf-cluster.yml
index 17a9b9b..c13cd99 100644
--- a/dual-primary/cf-cluster.yml
+++ b/dual-primary/cf-cluster.yml
@@ -106,6 +106,18 @@
Description: Gerrit replicas shared filesystem throughput, measured in MiB/s. Valid values are 1-1024.
Type: Number
Default: 256
+ ReplicaAutoScalingMinCapacity:
+ Type: Number
+ Description: The minimum number of tasks that replicas should scale in to
+ Default: 1
+ ReplicaAutoScalingDesiredCapacity:
+ Description: The desired number of replica tasks to run
+ Type: Number
+ Default: 1
+ ReplicaAutoScalingMaxCapacity:
+ Type: Number
+ Description: The maximum number of tasks that replicas should scale out to
+ Default: 2
Conditions:
isProvisionedThroughput: !Equals [!Ref PrimaryFileSystemThroughputMode, "provisioned"]
@@ -144,9 +156,9 @@
VPCZoneIdentifier:
- !If [NetworkStackNeeded, !GetAtt ECSTaskNetworkStack.Outputs.PublicSubnetOneRef, !Ref SubnetIdProp]
LaunchConfigurationName: !Ref 'ReplicaLaunchConfiguration'
- MinSize: '1'
- MaxSize: '1'
- DesiredCapacity: '1'
+ MinSize: !Ref ReplicaAutoScalingMinCapacity
+ MaxSize: !Ref ReplicaAutoScalingMaxCapacity
+ DesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
CreationPolicy:
ResourceSignal:
Timeout: PT15M
diff --git a/dual-primary/cf-service-replica.yml b/dual-primary/cf-service-replica.yml
index ff59a9a..5cd9660 100644
--- a/dual-primary/cf-service-replica.yml
+++ b/dual-primary/cf-service-replica.yml
@@ -36,10 +36,6 @@
DockerRegistryUrl:
Description: Docker registry URL
Type: String
- DesiredCount:
- Description: How many instances of this task should we run across our cluster?
- Type: Number
- Default: 1
HTTPHostPort:
Description: Gerrit Host HTTP port
Type: Number
@@ -189,6 +185,30 @@
Description: Comma separated list of regex patterns to exclude metrics reported to CloudWatch
Type: CommaDelimitedList
Default: ''
+ ReplicaAutoScalingMinCapacity:
+ Type: Number
+ Description: The minimum number of tasks that replicas should scale in to
+ Default: 1
+ ReplicaAutoScalingDesiredCapacity:
+ Description: The desired number of replica tasks to run
+ Type: Number
+ Default: 1
+ ReplicaAutoScalingMaxCapacity:
+ Type: Number
+ Description: The maximum number of tasks that replicas should scale out to
+ Default: 2
+ ReplicaAutoScalingScaleInCooldown:
+ Type: Number
+ Description: The amount of time, in seconds, after a scale-in activity completes before another scale-in activity can start
+ Default: 300
+ ReplicaAutoScalingScaleOutCooldown:
+ Type: Number
+ Description: The amount of time, in seconds, to wait for a previous scale-out activity to take effect.
+ Default: 300
+ ReplicaAutoScalingTargetCPUPercentage:
+ Type: Number
+ Description: Aggregate CPU utilization target for auto-scaling
+ Default: 75.0
Resources:
GerritService:
@@ -202,7 +222,7 @@
Cluster:
Fn::ImportValue:
!Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
- DesiredCount: !Ref DesiredCount
+ DesiredCount: !Ref ReplicaAutoScalingDesiredCapacity
TaskDefinition: !Ref GerritTaskDefinition
LoadBalancers:
- ContainerName: !Ref GerritServiceName
@@ -385,6 +405,26 @@
Labels:
gerrit-logs: !Join ['-', [!Ref EnvironmentName, !Ref GerritLogsVolume]]
+ ReplicaCPUAutoScaling:
+ Type: AWS::CloudFormation::Stack
+ Properties:
+ TemplateURL: !Join [ '', ['https://', !Ref TemplateBucketName, '.s3.amazonaws.com/cf-ecs-service-cpu-autoscaling.yml'] ]
+ TimeoutInMinutes: '5'
+ Parameters:
+ AutoScalingMinCapacity: !Ref ReplicaAutoScalingMinCapacity
+ AutoScalingDesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+ AutoScalingMaxCapacity: !Ref ReplicaAutoScalingMaxCapacity
+ AutoScalingScaleInCooldown: !Ref ReplicaAutoScalingScaleInCooldown
+ AutoScalingScaleOutCooldown: !Ref ReplicaAutoScalingScaleOutCooldown
+ AutoScalingTargetCPUPercentage: !Ref ReplicaAutoScalingTargetCPUPercentage
+ ResourceId:
+ !Join
+ - ''
+ - - 'service/'
+ - Fn::ImportValue: !Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
+ - '/'
+ - !GetAtt GerritService.Name
+
LoadBalancer:
Type: AWS::ElasticLoadBalancingV2::LoadBalancer
Properties:
diff --git a/dual-primary/setup.env.template b/dual-primary/setup.env.template
index 72bfd37..2f911d2 100644
--- a/dual-primary/setup.env.template
+++ b/dual-primary/setup.env.template
@@ -73,4 +73,12 @@
REPLICA_FILESYSTEM_ID=""
REPLICA_FILESYSTEM_THROUGHPUT_MODE="provisioned"
-REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS="256"
\ No newline at end of file
+REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS="256"
+
+REPLICA_AUTOSCALING_MIN_CAPACITY=1
+REPLICA_AUTOSCALING_DESIRED_CAPACITY=1
+REPLICA_AUTOSCALING_MAX_CAPACITY=1
+
+REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN=300
+REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN=300
+REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
\ No newline at end of file