dual-primary: auto-scaling policy for replicas

Allow replicas to scale in and out based on average CPU utilization.

Bug: Issue 14211
Change-Id: I3a5cc90139cd0f9ecbdea6786bfdb65491e92c8f
diff --git a/dual-primary/Makefile b/dual-primary/Makefile
index 933ce14..88ac236 100644
--- a/dual-primary/Makefile
+++ b/dual-primary/Makefile
@@ -50,7 +50,9 @@
 						$(optional_git_gc_targets_creation) \
 						dns-routing wait-for-dns-routing-creation
 
-cluster: cluster-keys set-optional-gerrit-primary-volume set-optional-params-for-replica-filesystem
+cluster: cluster-keys set-optional-gerrit-primary-volume \
+			set-optional-params-for-replica-filesystem \
+			set-optional-params-for-replica-auto-scaling-capacity
 ifdef CLUSTER_INSTANCE_TYPE
 		$(eval CLUSTER_OPTIONAL_PARAMS := $(CLUSTER_OPTIONAL_PARAMS) ParameterKey=InstanceType,ParameterValue=$(CLUSTER_INSTANCE_TYPE))
 endif
@@ -92,7 +94,8 @@
 		ParameterKey=SubnetIdProp,ParameterValue=$(SUBNET_ID) \
 		$(CLUSTER_OPTIONAL_PARAMS) \
 		$(GERRIT_OPTIONAL_PRIMARY_VOLUME) \
-		$(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM)
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM) \
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY)
 
 service-primary-1: set-optional-params-metrics-cloudwatch set-optional-params-smtp \
 					set-optional-params-multisite set-ldap-account-pattern \
@@ -227,7 +230,13 @@
 		ParameterKey=HostedZoneName,ParameterValue=$(HOSTED_ZONE_NAME) \
 		ParameterKey=GitReplicationSubdomain,ParameterValue=$(GIT_REPLICATION_SUBDOMAIN)
 
-service-replica: set-optional-params-metrics-cloudwatch set-ldap-account-pattern set-optional-gerrit-ulimits set-optional-jgit-conf
+service-replica: set-optional-params-metrics-cloudwatch \
+					set-ldap-account-pattern \
+					set-optional-gerrit-ulimits \
+					set-optional-jgit-conf \
+					set-optional-params-for-replica-auto-scaling-capacity \
+					set-optional-params-for-replica-auto-scaling-policy
+
 ifdef GERRIT_REPLICA_INSTANCE_ID
 		$(eval REPLICA_SERVICE_OPTIONAL_PARAMS := $(REPLICA_SERVICE_OPTIONAL_PARAMS) ParameterKey=InstanceId,ParameterValue=$(GERRIT_REPLICA_INSTANCE_ID))
 endif
@@ -261,7 +270,9 @@
 		$(LDAP_ACCOUNT_PATTERN_PARAM) \
 		$(REPLICA_SERVICE_OPTIONAL_PARAMS) \
 		$(METRICS_CW_OPTIONAL_PARAMS) \
-		$(GERRIT_ULIMITS)
+		$(GERRIT_ULIMITS) \
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY)
 
 service-lb:
 ifdef LOAD_BALANCER_SCHEME
diff --git a/dual-primary/README.md b/dual-primary/README.md
index baa0874..7552d79 100644
--- a/dual-primary/README.md
+++ b/dual-primary/README.md
@@ -242,6 +242,55 @@
 * `REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS`: Optional. Only used when `REPLICA_FILESYSTEM_THROUGHPUT_MODE` is set to `provisioned`.
 default: `256`.
 
+##### Auto Scaling of replica instances
+
+Gerrit replicas can scale in or out automatically to accommodate increases or
+decreases in traffic. This traffic typically comes from build or test jobs
+executed by an automated build pipeline.
+
+Since they all [share the same git data over EFS](#shared-filesystem-for-replicas),
+replicas are immediately ready to serve traffic as soon as they come up and
+register behind the loadbalancer.
+
+There is a 1-to-1 relationship between replica tasks and EC2 instances: each EC2
+instance in the 'replica' ASG runs one and only one replica task.
+Because of this, the capacity values specified for replicas (minimum, desired
+and maximum) configure both the number of tasks and the size of the ASG, since
+the two always need to be in sync.
+
+The scaling policy adds or removes capacity as required to keep the average CPU
+usage (of the replica service) close to the specified target value.
+
+These are the available settings:
+
+* `REPLICA_AUTOSCALING_MIN_CAPACITY`: Optional. The minimum number of tasks that
+replicas should scale in to. This is also the minimum number of EC2 instances in
+the replica ASG.
+default: *1*
+
+* `REPLICA_AUTOSCALING_DESIRED_CAPACITY`: Optional. The desired number of
+replica tasks to run. This is also the desired number of EC2 instances in the
+replica ASG.
+default: *1*
+
+* `REPLICA_AUTOSCALING_MAX_CAPACITY`: Optional. The maximum number of tasks that
+replicas should scale out to. This is also the maximum number of EC2 instances
+in the replica ASG.
+default: *2*
+
+* `REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN`: Optional. The amount of time, in
+seconds, after a scale-in activity completes before another scale-in activity
+can start.
+default: *300* seconds
+
+* `REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN`: Optional. The amount of time, in
+seconds, to wait for a previous scale-out activity to take effect.
+default: *300* seconds
+
+* `REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE`: Optional. Aggregate CPU utilization
+target for auto-scaling. Tasks are added to or removed from the replica service
+to keep utilization as close as possible to this value. default: *75*
+
 #### REPLICATION SERVICE
 
 * `REPLICATION_SERVICE_ENABLED`: Optional. Whether to expose a replication endpoint.
diff --git a/dual-primary/cf-cluster.yml b/dual-primary/cf-cluster.yml
index 17a9b9b..c13cd99 100644
--- a/dual-primary/cf-cluster.yml
+++ b/dual-primary/cf-cluster.yml
@@ -106,6 +106,18 @@
     Description: Gerrit replicas shared filesystem throughput, measured in MiB/s. Valid values are 1-1024.
     Type: Number
     Default: 256
+  ReplicaAutoScalingMinCapacity:
+    Type: Number
+    Description: The minimum number of tasks that replicas should scale in to
+    Default: 1
+  ReplicaAutoScalingDesiredCapacity:
+    Description: The desired number of replica tasks to run
+    Type: Number
+    Default: 1
+  ReplicaAutoScalingMaxCapacity:
+    Type: Number
+    Description: The maximum number of tasks that replicas should scale out to
+    Default: 2
 
 Conditions:
   isProvisionedThroughput: !Equals [!Ref PrimaryFileSystemThroughputMode, "provisioned"]
@@ -144,9 +156,9 @@
       VPCZoneIdentifier:
         - !If [NetworkStackNeeded, !GetAtt ECSTaskNetworkStack.Outputs.PublicSubnetOneRef, !Ref SubnetIdProp]
       LaunchConfigurationName: !Ref 'ReplicaLaunchConfiguration'
-      MinSize: '1'
-      MaxSize: '1'
-      DesiredCapacity: '1'
+      MinSize: !Ref ReplicaAutoScalingMinCapacity
+      MaxSize: !Ref ReplicaAutoScalingMaxCapacity
+      DesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
     CreationPolicy:
       ResourceSignal:
         Timeout: PT15M
diff --git a/dual-primary/cf-service-replica.yml b/dual-primary/cf-service-replica.yml
index ff59a9a..5cd9660 100644
--- a/dual-primary/cf-service-replica.yml
+++ b/dual-primary/cf-service-replica.yml
@@ -36,10 +36,6 @@
   DockerRegistryUrl:
       Description: Docker registry URL
       Type: String
-  DesiredCount:
-      Description: How many instances of this task should we run across our cluster?
-      Type: Number
-      Default: 1
   HTTPHostPort:
       Description: Gerrit Host HTTP port
       Type: Number
@@ -189,6 +185,30 @@
     Description: Comma separated list of regex patterns to exclude metrics reported to CloudWatch
     Type: CommaDelimitedList
     Default: ''
+  ReplicaAutoScalingMinCapacity:
+    Type: Number
+    Description: The minimum number of tasks that replicas should scale in to
+    Default: 1
+  ReplicaAutoScalingDesiredCapacity:
+    Description: The desired number of replica tasks to run
+    Type: Number
+    Default: 1
+  ReplicaAutoScalingMaxCapacity:
+    Type: Number
+    Description: The maximum number of tasks that replicas should scale out to
+    Default: 2
+  ReplicaAutoScalingScaleInCooldown:
+    Type: Number
+    Description: The amount of time, in seconds, after a scale-in activity completes before another scale-in activity can start
+    Default: 300
+  ReplicaAutoScalingScaleOutCooldown:
+    Type: Number
+    Description: The amount of time, in seconds, to wait for a previous scale-out activity to take effect.
+    Default: 300
+  ReplicaAutoScalingTargetCPUPercentage:
+    Type: Number
+    Description: Aggregate CPU utilization target for auto-scaling
+    Default: 75.0
 
 Resources:
     GerritService:
@@ -202,7 +222,7 @@
             Cluster:
               Fn::ImportValue:
                   !Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
-            DesiredCount: !Ref DesiredCount
+            DesiredCount: !Ref ReplicaAutoScalingDesiredCapacity
             TaskDefinition: !Ref GerritTaskDefinition
             LoadBalancers:
                 - ContainerName: !Ref GerritServiceName
@@ -385,6 +405,26 @@
                   Labels:
                     gerrit-logs: !Join ['-', [!Ref EnvironmentName, !Ref GerritLogsVolume]]
 
+    ReplicaCPUAutoScaling:  # Nested stack built from cf-ecs-service-cpu-autoscaling.yml (template not shown here; presumably defines the CPU-based scaling policy)
+      Type: AWS::CloudFormation::Stack
+      Properties:
+        TemplateURL: !Join [ '', ['https://', !Ref TemplateBucketName, '.s3.amazonaws.com/cf-ecs-service-cpu-autoscaling.yml'] ]  # URL points into the S3 bucket named by TemplateBucketName
+        TimeoutInMinutes: '5'
+        Parameters:  # capacity, cooldown and target-CPU values are forwarded unchanged from this stack's ReplicaAutoScaling* parameters
+          AutoScalingMinCapacity: !Ref ReplicaAutoScalingMinCapacity
+          AutoScalingDesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+          AutoScalingMaxCapacity: !Ref ReplicaAutoScalingMaxCapacity
+          AutoScalingScaleInCooldown: !Ref ReplicaAutoScalingScaleInCooldown
+          AutoScalingScaleOutCooldown: !Ref ReplicaAutoScalingScaleOutCooldown
+          AutoScalingTargetCPUPercentage: !Ref ReplicaAutoScalingTargetCPUPercentage
+          ResourceId:
+            !Join
+            - ''
+            - - 'service/'  # joined result is 'service/<ClusterName>/<GerritService name>'
+              - Fn::ImportValue: !Join [':', [!Ref 'ClusterStackName', 'ClusterName']]  # ClusterName is imported from the cluster stack's exports
+              - '/'
+              - !GetAtt GerritService.Name
+
     LoadBalancer:
         Type: AWS::ElasticLoadBalancingV2::LoadBalancer
         Properties:
diff --git a/dual-primary/setup.env.template b/dual-primary/setup.env.template
index 72bfd37..2f911d2 100644
--- a/dual-primary/setup.env.template
+++ b/dual-primary/setup.env.template
@@ -73,4 +73,12 @@
 
 REPLICA_FILESYSTEM_ID=""
 REPLICA_FILESYSTEM_THROUGHPUT_MODE="provisioned"
-REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS="256"
\ No newline at end of file
+REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS="256"
+
+REPLICA_AUTOSCALING_MIN_CAPACITY=1
+REPLICA_AUTOSCALING_DESIRED_CAPACITY=1
+REPLICA_AUTOSCALING_MAX_CAPACITY=1
+
+REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN=300
+REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN=300
+REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
\ No newline at end of file