Elastically scale replicas ASG based on ECS task requirements

dual-primary and primary-replica recipes:
Use a capacity provider strategy to scale the replicas ASG based on ECS task
demand.

Bug: Issue 14196
Change-Id: I947f4d4912955e5b2ff9264e5296c28e07b0f70e
diff --git a/Makefile.common b/Makefile.common
index 2e01b03..47c08a5 100644
--- a/Makefile.common
+++ b/Makefile.common
@@ -133,6 +133,18 @@
 		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY) ParameterKey=ReplicaAutoScalingTargetCPUPercentage,ParameterValue=$(REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE))
 endif
 
+set-optional-params-for-replica-capacity-provider:
+	$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER=)
+ifdef REPLICA_CAPACITY_PROVIDER_TARGET
+		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER := $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER) ParameterKey=ReplicaCapacityProviderTarget,ParameterValue=$(REPLICA_CAPACITY_PROVIDER_TARGET))
+endif
+ifdef REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE
+		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER := $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER) ParameterKey=ReplicaCapacityProviderMinStepSize,ParameterValue=$(REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE))
+endif
+ifdef REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE
+		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER := $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER) ParameterKey=ReplicaCapacityProviderMaxStepSize,ParameterValue=$(REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE))
+endif
+
 confirm-persistent-stack-deletion:
 	@echo ""
 	@echo "* * * * WARNING * * * * this is going to completely destroy the stack, including git data."
diff --git a/dual-primary/Makefile b/dual-primary/Makefile
index 88ac236..8b44868 100644
--- a/dual-primary/Makefile
+++ b/dual-primary/Makefile
@@ -52,7 +52,8 @@
 
 cluster: cluster-keys set-optional-gerrit-primary-volume \
 			set-optional-params-for-replica-filesystem \
-			set-optional-params-for-replica-auto-scaling-capacity
+			set-optional-params-for-replica-auto-scaling-capacity \
+			set-optional-params-for-replica-capacity-provider
 ifdef CLUSTER_INSTANCE_TYPE
 		$(eval CLUSTER_OPTIONAL_PARAMS := $(CLUSTER_OPTIONAL_PARAMS) ParameterKey=InstanceType,ParameterValue=$(CLUSTER_INSTANCE_TYPE))
 endif
@@ -95,7 +96,8 @@
 		$(CLUSTER_OPTIONAL_PARAMS) \
 		$(GERRIT_OPTIONAL_PRIMARY_VOLUME) \
 		$(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM) \
-		$(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY)
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER)
 
 service-primary-1: set-optional-params-metrics-cloudwatch set-optional-params-smtp \
 					set-optional-params-multisite set-ldap-account-pattern \
diff --git a/dual-primary/README.md b/dual-primary/README.md
index 7552d79..0c0bc8e 100644
--- a/dual-primary/README.md
+++ b/dual-primary/README.md
@@ -261,6 +261,18 @@
 The scaling policy adds or removes capacity as required to keep the average CPU
 Usage (of the replica service) close to the specified target value.
 
+Now, tasks in the provisioning state that cannot find sufficient resources on
+the existing instances will automatically trigger the capacity provider to scale
+out the replica ASG. As more EC2 instances become available, tasks in the
+provisioning state will get placed onto those instances, reducing the number of
+tasks in provisioning.
+
+Conversely, as the average CPU usage (of the replica service) drops under the
+specified target value, and replica tasks get removed, the capacity provider
+will reduce the number of EC2 instances too.
+
+Note that only EC2 instances that are not running any replica task will scale in.
+
 These are the available settings:
 
 * `REPLICA_AUTOSCALING_MIN_CAPACITY` Optional. The minimum number of tasks that
@@ -291,6 +303,32 @@
 utilization target for auto-scaling. Auto-scaling will add or remove tasks in
 the replica service to be as close as possible to this value
 
+* `REPLICA_CAPACITY_PROVIDER_TARGET` Optional. The target capacity value for the
+capacity provider of replicas (must be > 0 and <= 100).
+default: *100*
+
+   Setting this value to 100 means that there will be no _spare capacity_
+allocated on the replica ASG:
+
+   If 3 replica tasks are needed, then the ASG will adjust to have exactly 3 EC2 instances.
+
+   Setting this value to less than 100 enables spare capacity in the ASG. For
+example, if you set this value to 50, the scaling policy will adjust the number
+of EC2 instances until it is exactly twice the number needed to run all of the
+tasks:
+
+   If 3 replica tasks are needed, then the ASG will adjust to 6 EC2 instances.
+
+* `REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE` Optional. The minimum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
+* `REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE` Optional. The maximum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
 #### REPLICATION SERVICE
 
 * `REPLICATION_SERVICE_ENABLED`: Optional. Whether to expose a replication endpoint.
diff --git a/dual-primary/cf-cluster.yml b/dual-primary/cf-cluster.yml
index c13cd99..2797932 100644
--- a/dual-primary/cf-cluster.yml
+++ b/dual-primary/cf-cluster.yml
@@ -118,6 +118,27 @@
     Type: Number
     Description: The maximum number of tasks that replicas should scale out to
     Default: 2
+  ReplicaCapacityProviderTarget:
+    Type: Number
+    Description: The target capacity value for the capacity provider of replicas
+    ConstraintDescription: The specified value must be > 0 and <= 100
+    Default: 100
+    MinValue: 1
+    MaxValue: 100
+  ReplicaCapacityProviderMinStepSize:
+    Type: Number
+    Description: The minimum number of EC2 instances for replicas that will scale in or scale out at one time
+    ConstraintDescription: The specified value must be >= 1 and <= 10
+    Default: 1
+    MinValue: 1
+    MaxValue: 10
+  ReplicaCapacityProviderMaxStepSize:
+    Type: Number
+    Description: The maximum number of EC2 instances for replicas that will scale in or scale out at one time
+    ConstraintDescription: The specified value must be >= 1 and <= 10
+    Default: 1
+    MinValue: 1
+    MaxValue: 10
 
 Conditions:
   isProvisionedThroughput: !Equals [!Ref PrimaryFileSystemThroughputMode, "provisioned"]
@@ -133,6 +154,22 @@
   # ECS Resources
   ECSCluster:
     Type: AWS::ECS::Cluster
+    Properties:
+      ClusterName: !Sub '${AWS::StackName}-ECSCluster'
+      CapacityProviders: [ !Ref ReplicaCapacityProvider ]
+
+  ReplicaCapacityProvider:
+    Type: AWS::ECS::CapacityProvider
+    Properties:
+      Name: !Sub '${AWS::StackName}-ReplicaCapacityProvider'
+      AutoScalingGroupProvider:
+        AutoScalingGroupArn: !Ref ReplicaECSAutoScalingGroup
+        ManagedTerminationProtection: ENABLED
+        ManagedScaling:
+          MaximumScalingStepSize: !Ref ReplicaCapacityProviderMaxStepSize
+          MinimumScalingStepSize: !Ref ReplicaCapacityProviderMinStepSize
+          Status: ENABLED
+          TargetCapacity: !Ref ReplicaCapacityProviderTarget
 
   EcsHostSecurityGroup:
     Type: AWS::EC2::SecurityGroup
@@ -159,6 +196,7 @@
       MinSize: !Ref ReplicaAutoScalingMinCapacity
       MaxSize: !Ref ReplicaAutoScalingMaxCapacity
       DesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+      NewInstancesProtectedFromScaleIn: true
     CreationPolicy:
       ResourceSignal:
         Timeout: PT15M
@@ -177,7 +215,7 @@
       UserData:
         Fn::Base64: !Sub |
           #!/bin/bash -xe
-          echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+          echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
           echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"replica\"} >> /etc/ecs/ecs.config
 
           # Make sure latest version of the helper scripts are installed as per recommendation:
@@ -218,7 +256,7 @@
       UserData:
         Fn::Base64: !Sub |
           #!/bin/bash -xe
-          echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+          echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
           echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"haproxy\"} >> /etc/ecs/ecs.config
 
           # Make sure latest version of the helper scripts are installed as per recommendation:
diff --git a/dual-primary/cf-service-replica.yml b/dual-primary/cf-service-replica.yml
index 5cd9660..85e9604 100644
--- a/dual-primary/cf-service-replica.yml
+++ b/dual-primary/cf-service-replica.yml
@@ -222,6 +222,9 @@
             Cluster:
               Fn::ImportValue:
                   !Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
+            CapacityProviderStrategy:
+              - CapacityProvider: !Sub '${ClusterStackName}-ReplicaCapacityProvider'
+                Weight: 100
             DesiredCount: !Ref ReplicaAutoScalingDesiredCapacity
             TaskDefinition: !Ref GerritTaskDefinition
             LoadBalancers:
diff --git a/dual-primary/setup.env.template b/dual-primary/setup.env.template
index 2f911d2..51b234f 100644
--- a/dual-primary/setup.env.template
+++ b/dual-primary/setup.env.template
@@ -81,4 +81,8 @@
 
 REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN=300
 REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN=300
-REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
\ No newline at end of file
+REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
+
+REPLICA_CAPACITY_PROVIDER_TARGET=50
+REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE=1
+REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE=2
\ No newline at end of file
diff --git a/primary-replica/Makefile b/primary-replica/Makefile
index 691f138..d7f7313 100644
--- a/primary-replica/Makefile
+++ b/primary-replica/Makefile
@@ -33,7 +33,8 @@
 
 cluster: cluster-keys set-optional-gerrit-primary-volume \
 			set-optional-params-for-replica-filesystem \
-			set-optional-params-for-replica-auto-scaling-capacity
+			set-optional-params-for-replica-auto-scaling-capacity \
+			set-optional-params-for-replica-capacity-provider
 ifdef CLUSTER_INSTANCE_TYPE
 		$(eval CLUSTER_OPTIONAL_PARAMS := $(CLUSTER_OPTIONAL_PARAMS) ParameterKey=InstanceType,ParameterValue=$(CLUSTER_INSTANCE_TYPE))
 endif
@@ -61,7 +62,8 @@
 		$(CLUSTER_OPTIONAL_PARAMS) \
 		$(GERRIT_OPTIONAL_PRIMARY_VOLUME) \
 		$(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM) \
-		$(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY)
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER)
 
 service-primary: set-optional-params-metrics-cloudwatch set-optional-params-smtp set-ldap-account-pattern set-optional-gerrit-ulimits set-optional-jgit-conf
 ifdef LOAD_BALANCER_SCHEME
diff --git a/primary-replica/README.md b/primary-replica/README.md
index 7d8da55..ccb37d7 100644
--- a/primary-replica/README.md
+++ b/primary-replica/README.md
@@ -133,6 +133,18 @@
 The scaling policy adds or removes capacity as required to keep the average CPU
 Usage (of the replica service) close to the specified target value.
 
+Now, tasks in the provisioning state that cannot find sufficient resources on
+the existing instances will automatically trigger the capacity provider to scale
+out the replica ASG. As more EC2 instances become available, tasks in the
+provisioning state will get placed onto those instances, reducing the number of
+tasks in provisioning.
+
+Conversely, as the average CPU usage (of the replica service) drops under the
+specified target value, and replica tasks get removed, the capacity provider
+will reduce the number of EC2 instances too.
+
+Note that only EC2 instances that are not running any replica task will scale in.
+
 These are the available settings:
 
 * `REPLICA_AUTOSCALING_MIN_CAPACITY` Optional. The minimum number of tasks that
@@ -163,6 +175,32 @@
 utilization target for auto-scaling. Auto-scaling will add or remove tasks in
 the replica service to be as close as possible to this value
 
+* `REPLICA_CAPACITY_PROVIDER_TARGET` Optional. The target capacity value for the
+capacity provider of replicas (must be > 0 and <= 100).
+default: *100*
+
+   Setting this value to 100 means that there will be no _spare capacity_
+allocated on the replica ASG:
+
+   If 3 replica tasks are needed, then the ASG will adjust to have exactly 3 EC2 instances.
+
+   Setting this value to less than 100 enables spare capacity in the ASG. For
+example, if you set this value to 50, the scaling policy will adjust the number
+of EC2 instances until it is exactly twice the number needed to run all of the
+tasks:
+
+   If 3 replica tasks are needed, then the ASG will adjust to 6 EC2 instances.
+
+* `REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE` Optional. The minimum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
+* `REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE` Optional. The maximum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
 ### 2 - Deploy
 
 * Create the cluster, services and DNS routing stacks:
diff --git a/primary-replica/cf-cluster.yml b/primary-replica/cf-cluster.yml
index 8c0015a..3523810 100644
--- a/primary-replica/cf-cluster.yml
+++ b/primary-replica/cf-cluster.yml
@@ -96,6 +96,27 @@
     Type: Number
     Description: The maximum number of EC2 instances in the replica ASG
     Default: 2
+  ReplicaCapacityProviderTarget:
+    Type: Number
+    Description: The target capacity value for the capacity provider of replicas
+    ConstraintDescription: The specified value must be > 0 and <= 100
+    Default: 100
+    MinValue: 1
+    MaxValue: 100
+  ReplicaCapacityProviderMinStepSize:
+    Type: Number
+    Description: The minimum number of EC2 instances for replicas that will scale in or scale out at one time
+    ConstraintDescription: The specified value must be >= 1 and <= 10
+    Default: 1
+    MinValue: 1
+    MaxValue: 10
+  ReplicaCapacityProviderMaxStepSize:
+    Type: Number
+    Description: The maximum number of EC2 instances for replicas that will scale in or scale out at one time
+    ConstraintDescription: The specified value must be >= 1 and <= 10
+    Default: 1
+    MinValue: 1
+    MaxValue: 10
 
 Conditions:
   CreateReplicaEFS: !Equals [!Ref ReplicaFileSystemID, ""]
@@ -109,6 +130,22 @@
   # ECS Resources
   ECSCluster:
     Type: AWS::ECS::Cluster
+    Properties:
+      ClusterName: !Sub '${AWS::StackName}-ECSCluster'
+      CapacityProviders: [ !Ref ReplicaCapacityProvider ]
+
+  ReplicaCapacityProvider:
+    Type: AWS::ECS::CapacityProvider
+    Properties:
+      Name: !Sub '${AWS::StackName}-ReplicaCapacityProvider'
+      AutoScalingGroupProvider:
+        AutoScalingGroupArn: !Ref ReplicaASG
+        ManagedTerminationProtection: ENABLED
+        ManagedScaling:
+          MaximumScalingStepSize: !Ref ReplicaCapacityProviderMaxStepSize
+          MinimumScalingStepSize: !Ref ReplicaCapacityProviderMinStepSize
+          Status: ENABLED
+          TargetCapacity: !Ref ReplicaCapacityProviderTarget
 
   EcsHostSecurityGroup:
     Type: AWS::EC2::SecurityGroup
@@ -153,7 +190,7 @@
       UserData:
         Fn::Base64: !Sub |
           #!/bin/bash -xe
-          echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+          echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
           echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"primary\"} >> /etc/ecs/ecs.config
           # Make sure latest version of the helper scripts are installed as per recommendation:
           # https://github.com/awsdocs/aws-cloudformation-user-guide/blob/master/doc_source/cfn-helper-scripts-reference.md#using-the-latest-version
@@ -253,6 +290,7 @@
       MinSize: !Ref ReplicaAutoScalingMinCapacity
       MaxSize: !Ref ReplicaAutoScalingMaxCapacity
       DesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+      NewInstancesProtectedFromScaleIn: true
     CreationPolicy:
       ResourceSignal:
         Timeout: PT15M
@@ -271,7 +309,7 @@
       UserData:
         Fn::Base64: !Sub |
           #!/bin/bash -xe
-          echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+          echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
           echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"replica\"} >> /etc/ecs/ecs.config
 
           # Make sure latest version of the helper scripts are installed as per recommendation:
diff --git a/primary-replica/cf-service-replica.yml b/primary-replica/cf-service-replica.yml
index 7ad5be5..2e3d1c6 100644
--- a/primary-replica/cf-service-replica.yml
+++ b/primary-replica/cf-service-replica.yml
@@ -222,6 +222,9 @@
             Cluster:
               Fn::ImportValue:
                   !Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
+            CapacityProviderStrategy:
+              - CapacityProvider: !Sub '${ClusterStackName}-ReplicaCapacityProvider'
+                Weight: 100
             DesiredCount: !Ref ReplicaAutoScalingDesiredCapacity
             TaskDefinition: !Ref GerritTaskDefinition
             LoadBalancers:
diff --git a/primary-replica/setup.env.template b/primary-replica/setup.env.template
index cc3b284..7a3e5f7 100644
--- a/primary-replica/setup.env.template
+++ b/primary-replica/setup.env.template
@@ -59,4 +59,8 @@
 
 REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN=300
 REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN=300
-REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
\ No newline at end of file
+REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
+
+REPLICA_CAPACITY_PROVIDER_TARGET=50
+REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE=1
+REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE=2
\ No newline at end of file