Elastically scale replicas ASG based on ECS task requirements
dual-primary and primary-replica recipes:
Use a capacity provider strategy to scale the replicas ASG based on ECS task
demand.
Bug: Issue 14196
Change-Id: I947f4d4912955e5b2ff9264e5296c28e07b0f70e
diff --git a/Makefile.common b/Makefile.common
index 2e01b03..47c08a5 100644
--- a/Makefile.common
+++ b/Makefile.common
@@ -133,6 +133,18 @@
$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY) ParameterKey=ReplicaAutoScalingTargetCPUPercentage,ParameterValue=$(REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE))
endif
+set-optional-params-for-replica-capacity-provider:
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER=)
+ifdef REPLICA_CAPACITY_PROVIDER_TARGET
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER := $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER) ParameterKey=ReplicaCapacityProviderTarget,ParameterValue=$(REPLICA_CAPACITY_PROVIDER_TARGET))
+endif
+ifdef REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER := $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER) ParameterKey=ReplicaCapacityProviderMinStepSize,ParameterValue=$(REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE))
+endif
+ifdef REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER := $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER) ParameterKey=ReplicaCapacityProviderMaxStepSize,ParameterValue=$(REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE))
+endif
+
confirm-persistent-stack-deletion:
@echo ""
@echo "* * * * WARNING * * * * this is going to completely destroy the stack, including git data."
diff --git a/dual-primary/Makefile b/dual-primary/Makefile
index 88ac236..8b44868 100644
--- a/dual-primary/Makefile
+++ b/dual-primary/Makefile
@@ -52,7 +52,8 @@
cluster: cluster-keys set-optional-gerrit-primary-volume \
set-optional-params-for-replica-filesystem \
- set-optional-params-for-replica-auto-scaling-capacity
+ set-optional-params-for-replica-auto-scaling-capacity \
+ set-optional-params-for-replica-capacity-provider
ifdef CLUSTER_INSTANCE_TYPE
$(eval CLUSTER_OPTIONAL_PARAMS := $(CLUSTER_OPTIONAL_PARAMS) ParameterKey=InstanceType,ParameterValue=$(CLUSTER_INSTANCE_TYPE))
endif
@@ -95,7 +96,8 @@
$(CLUSTER_OPTIONAL_PARAMS) \
$(GERRIT_OPTIONAL_PRIMARY_VOLUME) \
$(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM) \
- $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY)
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER)
service-primary-1: set-optional-params-metrics-cloudwatch set-optional-params-smtp \
set-optional-params-multisite set-ldap-account-pattern \
diff --git a/dual-primary/README.md b/dual-primary/README.md
index 7552d79..0c0bc8e 100644
--- a/dual-primary/README.md
+++ b/dual-primary/README.md
@@ -261,6 +261,18 @@
The scaling policy adds or removes capacity as required to keep the average CPU
Usage (of the replica service) close to the specified target value.
+Now, tasks in the provisioning state that cannot find sufficient resources on
+the existing instances will automatically trigger the capacity provider to scale
+out the replica ASG. As more EC2 instances become available, tasks in the
+provisioning state will get placed onto those instances, reducing the number of
+tasks in provisioning.
+
+Conversely, as the average CPU usage (of the replica service) drops under the
+specified target value, and replica tasks get removed, the capacity provider
+will reduce the number of EC2 instances too.
+
+Note that only EC2 instances that are not running any replica task will scale in.
+
These are the available settings:
* `REPLICA_AUTOSCALING_MIN_CAPACITY` Optional. The minimum number of tasks that
@@ -291,6 +303,32 @@
utilization target for auto-scaling. Auto-scaling will add or remove tasks in
the replica service to be as close as possible to this value
+* `REPLICA_CAPACITY_PROVIDER_TARGET` Optional. The target capacity value for the
+capacity provider of replicas (must be > 0 and <= 100).
+default: *100*
+
+ Setting this value to 100 means that there will be no _spare capacity_
+allocated on the replica ASG:
+
+ If 3 replica tasks are needed, then the ASG will adjust to have exactly 3 EC2 instances
+
+ Setting this value to less than 100 enables spare capacity in the ASG. For
+example, if you set this value to 50, the scaling policy will adjust the number
+of EC2 instances until it is exactly twice the number needed to run all of the
+tasks:
+
+ If 3 replica tasks are needed, then the ASG will adjust to 6 EC2 instances
+
+* `REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE` Optional. The minimum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
+* `REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE` Optional. The maximum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
#### REPLICATION SERVICE
* `REPLICATION_SERVICE_ENABLED`: Optional. Whether to expose a replication endpoint.
diff --git a/dual-primary/cf-cluster.yml b/dual-primary/cf-cluster.yml
index c13cd99..2797932 100644
--- a/dual-primary/cf-cluster.yml
+++ b/dual-primary/cf-cluster.yml
@@ -118,6 +118,27 @@
Type: Number
Description: The maximum number of tasks that replicas should scale out to
Default: 2
+ ReplicaCapacityProviderTarget:
+ Type: Number
+ Description: The target capacity value for the capacity provider of replicas
+ ConstraintDescription: The specified value must be > 0 and <= 100
+ Default: 100
+ MinValue: 1
+ MaxValue: 100
+ ReplicaCapacityProviderMinStepSize:
+ Type: Number
+ Description: The minimum number of EC2 instances for replicas that will scale in or scale out at one time
+ ConstraintDescription: The specified value must be >= 1 and <= 10
+ Default: 1
+ MinValue: 1
+ MaxValue: 10
+ ReplicaCapacityProviderMaxStepSize:
+ Type: Number
+ Description: The maximum number of EC2 instances for replicas that will scale in or scale out at one time
+ ConstraintDescription: The specified value must be >= 1 and <= 10
+ Default: 1
+ MinValue: 1
+ MaxValue: 10
Conditions:
isProvisionedThroughput: !Equals [!Ref PrimaryFileSystemThroughputMode, "provisioned"]
@@ -133,6 +154,22 @@
# ECS Resources
ECSCluster:
Type: AWS::ECS::Cluster
+ Properties:
+ ClusterName: !Sub '${AWS::StackName}-ECSCluster'
+ CapacityProviders: [ !Ref ReplicaCapacityProvider ]
+
+ ReplicaCapacityProvider:
+ Type: AWS::ECS::CapacityProvider
+ Properties:
+ Name: !Sub '${AWS::StackName}-ReplicaCapacityProvider'
+ AutoScalingGroupProvider:
+ AutoScalingGroupArn: !Ref ReplicaECSAutoScalingGroup
+ ManagedTerminationProtection: ENABLED
+ ManagedScaling:
+ MaximumScalingStepSize: !Ref ReplicaCapacityProviderMaxStepSize
+ MinimumScalingStepSize: !Ref ReplicaCapacityProviderMinStepSize
+ Status: ENABLED
+ TargetCapacity: !Ref ReplicaCapacityProviderTarget
EcsHostSecurityGroup:
Type: AWS::EC2::SecurityGroup
@@ -159,6 +196,7 @@
MinSize: !Ref ReplicaAutoScalingMinCapacity
MaxSize: !Ref ReplicaAutoScalingMaxCapacity
DesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+ NewInstancesProtectedFromScaleIn: true
CreationPolicy:
ResourceSignal:
Timeout: PT15M
@@ -177,7 +215,7 @@
UserData:
Fn::Base64: !Sub |
#!/bin/bash -xe
- echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+ echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"replica\"} >> /etc/ecs/ecs.config
# Make sure latest version of the helper scripts are installed as per recommendation:
@@ -218,7 +256,7 @@
UserData:
Fn::Base64: !Sub |
#!/bin/bash -xe
- echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+ echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"haproxy\"} >> /etc/ecs/ecs.config
# Make sure latest version of the helper scripts are installed as per recommendation:
diff --git a/dual-primary/cf-service-replica.yml b/dual-primary/cf-service-replica.yml
index 5cd9660..85e9604 100644
--- a/dual-primary/cf-service-replica.yml
+++ b/dual-primary/cf-service-replica.yml
@@ -222,6 +222,9 @@
Cluster:
Fn::ImportValue:
!Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
+ CapacityProviderStrategy:
+ - CapacityProvider: !Sub '${ClusterStackName}-ReplicaCapacityProvider'
+ Weight: 100
DesiredCount: !Ref ReplicaAutoScalingDesiredCapacity
TaskDefinition: !Ref GerritTaskDefinition
LoadBalancers:
diff --git a/dual-primary/setup.env.template b/dual-primary/setup.env.template
index 2f911d2..51b234f 100644
--- a/dual-primary/setup.env.template
+++ b/dual-primary/setup.env.template
@@ -81,4 +81,8 @@
REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN=300
REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN=300
-REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
\ No newline at end of file
+REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
+
+REPLICA_CAPACITY_PROVIDER_TARGET=50
+REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE=1
+REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE=2
\ No newline at end of file
diff --git a/primary-replica/Makefile b/primary-replica/Makefile
index 691f138..d7f7313 100644
--- a/primary-replica/Makefile
+++ b/primary-replica/Makefile
@@ -33,7 +33,8 @@
cluster: cluster-keys set-optional-gerrit-primary-volume \
set-optional-params-for-replica-filesystem \
- set-optional-params-for-replica-auto-scaling-capacity
+ set-optional-params-for-replica-auto-scaling-capacity \
+ set-optional-params-for-replica-capacity-provider
ifdef CLUSTER_INSTANCE_TYPE
$(eval CLUSTER_OPTIONAL_PARAMS := $(CLUSTER_OPTIONAL_PARAMS) ParameterKey=InstanceType,ParameterValue=$(CLUSTER_INSTANCE_TYPE))
endif
@@ -61,7 +62,8 @@
$(CLUSTER_OPTIONAL_PARAMS) \
$(GERRIT_OPTIONAL_PRIMARY_VOLUME) \
$(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM) \
- $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY)
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER)
service-primary: set-optional-params-metrics-cloudwatch set-optional-params-smtp set-ldap-account-pattern set-optional-gerrit-ulimits set-optional-jgit-conf
ifdef LOAD_BALANCER_SCHEME
diff --git a/primary-replica/README.md b/primary-replica/README.md
index 7d8da55..ccb37d7 100644
--- a/primary-replica/README.md
+++ b/primary-replica/README.md
@@ -133,6 +133,18 @@
The scaling policy adds or removes capacity as required to keep the average CPU
Usage (of the replica service) close to the specified target value.
+Now, tasks in the provisioning state that cannot find sufficient resources on
+the existing instances will automatically trigger the capacity provider to scale
+out the replica ASG. As more EC2 instances become available, tasks in the
+provisioning state will get placed onto those instances, reducing the number of
+tasks in provisioning.
+
+Conversely, as the average CPU usage (of the replica service) drops under the
+specified target value, and replica tasks get removed, the capacity provider
+will reduce the number of EC2 instances too.
+
+Note that only EC2 instances that are not running any replica task will scale in.
+
These are the available settings:
* `REPLICA_AUTOSCALING_MIN_CAPACITY` Optional. The minimum number of tasks that
@@ -163,6 +175,32 @@
utilization target for auto-scaling. Auto-scaling will add or remove tasks in
the replica service to be as close as possible to this value
+* `REPLICA_CAPACITY_PROVIDER_TARGET` Optional. The target capacity value for the
+capacity provider of replicas (must be > 0 and <= 100).
+default: *100*
+
+ Setting this value to 100 means that there will be no _spare capacity_
+allocated on the replica ASG:
+
+ If 3 replica tasks are needed, then the ASG will adjust to have exactly 3 EC2 instances
+
+ Setting this value to less than 100 enables spare capacity in the ASG. For
+example, if you set this value to 50, the scaling policy will adjust the number
+of EC2 instances until it is exactly twice the number needed to run all of the
+tasks:
+
+ If 3 replica tasks are needed, then the ASG will adjust to 6 EC2 instances
+
+* `REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE` Optional. The minimum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
+* `REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE` Optional. The maximum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
### 2 - Deploy
* Create the cluster, services and DNS routing stacks:
diff --git a/primary-replica/cf-cluster.yml b/primary-replica/cf-cluster.yml
index 8c0015a..3523810 100644
--- a/primary-replica/cf-cluster.yml
+++ b/primary-replica/cf-cluster.yml
@@ -96,6 +96,27 @@
Type: Number
Description: The maximum number of EC2 instances in the replica ASG
Default: 2
+ ReplicaCapacityProviderTarget:
+ Type: Number
+ Description: The target capacity value for the capacity provider of replicas
+ ConstraintDescription: The specified value must be > 0 and <= 100
+ Default: 100
+ MinValue: 1
+ MaxValue: 100
+ ReplicaCapacityProviderMinStepSize:
+ Type: Number
+ Description: The minimum number of EC2 instances for replicas that will scale in or scale out at one time
+ ConstraintDescription: The specified value must be >= 1 and <= 10
+ Default: 1
+ MinValue: 1
+ MaxValue: 10
+ ReplicaCapacityProviderMaxStepSize:
+ Type: Number
+ Description: The maximum number of EC2 instances for replicas that will scale in or scale out at one time
+ ConstraintDescription: The specified value must be >= 1 and <= 10
+ Default: 1
+ MinValue: 1
+ MaxValue: 10
Conditions:
CreateReplicaEFS: !Equals [!Ref ReplicaFileSystemID, ""]
@@ -109,6 +130,22 @@
# ECS Resources
ECSCluster:
Type: AWS::ECS::Cluster
+ Properties:
+ ClusterName: !Sub '${AWS::StackName}-ECSCluster'
+ CapacityProviders: [ !Ref ReplicaCapacityProvider ]
+
+ ReplicaCapacityProvider:
+ Type: AWS::ECS::CapacityProvider
+ Properties:
+ Name: !Sub '${AWS::StackName}-ReplicaCapacityProvider'
+ AutoScalingGroupProvider:
+ AutoScalingGroupArn: !Ref ReplicaASG
+ ManagedTerminationProtection: ENABLED
+ ManagedScaling:
+ MaximumScalingStepSize: !Ref ReplicaCapacityProviderMaxStepSize
+ MinimumScalingStepSize: !Ref ReplicaCapacityProviderMinStepSize
+ Status: ENABLED
+ TargetCapacity: !Ref ReplicaCapacityProviderTarget
EcsHostSecurityGroup:
Type: AWS::EC2::SecurityGroup
@@ -153,7 +190,7 @@
UserData:
Fn::Base64: !Sub |
#!/bin/bash -xe
- echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+ echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"primary\"} >> /etc/ecs/ecs.config
# Make sure latest version of the helper scripts are installed as per recommendation:
# https://github.com/awsdocs/aws-cloudformation-user-guide/blob/master/doc_source/cfn-helper-scripts-reference.md#using-the-latest-version
@@ -253,6 +290,7 @@
MinSize: !Ref ReplicaAutoScalingMinCapacity
MaxSize: !Ref ReplicaAutoScalingMaxCapacity
DesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+ NewInstancesProtectedFromScaleIn: true
CreationPolicy:
ResourceSignal:
Timeout: PT15M
@@ -271,7 +309,7 @@
UserData:
Fn::Base64: !Sub |
#!/bin/bash -xe
- echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+ echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"replica\"} >> /etc/ecs/ecs.config
# Make sure latest version of the helper scripts are installed as per recommendation:
diff --git a/primary-replica/cf-service-replica.yml b/primary-replica/cf-service-replica.yml
index 7ad5be5..2e3d1c6 100644
--- a/primary-replica/cf-service-replica.yml
+++ b/primary-replica/cf-service-replica.yml
@@ -222,6 +222,9 @@
Cluster:
Fn::ImportValue:
!Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
+ CapacityProviderStrategy:
+ - CapacityProvider: !Sub '${ClusterStackName}-ReplicaCapacityProvider'
+ Weight: 100
DesiredCount: !Ref ReplicaAutoScalingDesiredCapacity
TaskDefinition: !Ref GerritTaskDefinition
LoadBalancers:
diff --git a/primary-replica/setup.env.template b/primary-replica/setup.env.template
index cc3b284..7a3e5f7 100644
--- a/primary-replica/setup.env.template
+++ b/primary-replica/setup.env.template
@@ -59,4 +59,8 @@
REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN=300
REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN=300
-REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
\ No newline at end of file
+REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
+
+REPLICA_CAPACITY_PROVIDER_TARGET=50
+REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE=1
+REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE=2
\ No newline at end of file