Merge "Remove unneeded first time redirect"

commit: a304efe7430f7584e8a2f6f8447b1676be4916a7 [log] [tgz]
author: Luca Milanesio <luca.milanesio@gmail.com> Mon Mar 15 21:27:51 2021 +0000
committer: Gerrit Code Review <noreply-gerritcodereview@google.com> Mon Mar 15 21:27:51 2021 +0000
tree: 678557c1f29fa40f98ea4d8873308ad964465deb
parent: 9311a0531448cec3f2cd00adcbc76c6c9223ff86 [diff]
parent: a43152a9d4d4306d56a2c82334afbc022ab8fd8d [diff]
diff --git a/Makefile.common b/Makefile.common
index 78c1f9a..47c08a5 100644
--- a/Makefile.common
+++ b/Makefile.common

@@ -21,6 +21,7 @@
 	aws s3 cp ../common-templates/cf-gerrit-volume.yml s3://$(TEMPLATE_BUCKET_NAME)/
 	aws s3 cp ../common-templates/cf-primary-asg.yml s3://$(TEMPLATE_BUCKET_NAME)/
 	aws s3 cp ../common-templates/cf-efs-stack.yml s3://$(TEMPLATE_BUCKET_NAME)/
+	aws s3 cp ../common-templates/cf-ecs-service-cpu-autoscaling.yml s3://$(TEMPLATE_BUCKET_NAME)/
 
 set-optional-params-metrics-cloudwatch:
 ifdef METRICS_CLOUDWATCH_ENABLED
@@ -108,6 +109,42 @@
 		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM := $(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM) ParameterKey=ReplicaProvisionedThroughputInMibps,ParameterValue=$(REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS))
 endif
 
+set-optional-params-for-replica-auto-scaling-capacity:
+	$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY=)
+ifdef REPLICA_AUTOSCALING_MIN_CAPACITY
+		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) ParameterKey=ReplicaAutoScalingMinCapacity,ParameterValue=$(REPLICA_AUTOSCALING_MIN_CAPACITY))
+endif
+ifdef REPLICA_AUTOSCALING_DESIRED_CAPACITY
+		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) ParameterKey=ReplicaAutoScalingDesiredCapacity,ParameterValue=$(REPLICA_AUTOSCALING_DESIRED_CAPACITY))
+endif
+ifdef REPLICA_AUTOSCALING_MAX_CAPACITY
+		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) ParameterKey=ReplicaAutoScalingMaxCapacity,ParameterValue=$(REPLICA_AUTOSCALING_MAX_CAPACITY))
+endif
+
+set-optional-params-for-replica-auto-scaling-policy:
+	$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY=)
+ifdef REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN
+		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY) ParameterKey=ReplicaAutoScalingScaleInCooldown,ParameterValue=$(REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN))
+endif
+ifdef REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN
+		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY) ParameterKey=ReplicaAutoScalingScaleOutCooldown,ParameterValue=$(REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN))
+endif
+ifdef REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE
+		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY) ParameterKey=ReplicaAutoScalingTargetCPUPercentage,ParameterValue=$(REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE))
+endif
+
+set-optional-params-for-replica-capacity-provider:
+	$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER=)
+ifdef REPLICA_CAPACITY_PROVIDER_TARGET
+		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER := $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER) ParameterKey=ReplicaCapacityProviderTarget,ParameterValue=$(REPLICA_CAPACITY_PROVIDER_TARGET))
+endif
+ifdef REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE
+		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER := $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER) ParameterKey=ReplicaCapacityProviderMinStepSize,ParameterValue=$(REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE))
+endif
+ifdef REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE
+		$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER := $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER) ParameterKey=ReplicaCapacityProviderMaxStepSize,ParameterValue=$(REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE))
+endif
+
 confirm-persistent-stack-deletion:
 	@echo ""
 	@echo "* * * * WARNING * * * * this is going to completely destroy the stack, including git data."

diff --git a/common-templates/cf-ecs-service-cpu-autoscaling.yml b/common-templates/cf-ecs-service-cpu-autoscaling.yml
new file mode 100644
index 0000000..0c3b77b
--- /dev/null
+++ b/common-templates/cf-ecs-service-cpu-autoscaling.yml

@@ -0,0 +1,71 @@
+AWSTemplateFormatVersion: '2010-09-09'
+Description: Application Auto Scaling resources (scalable target, CPU target-tracking policy and IAM role) for an ECS service.
+Parameters:
+  AutoScalingMinCapacity:
+    Type: Number
+    Description: The minimum number of tasks the service should scale in to
+  AutoScalingDesiredCapacity:
+    Description: The desired number of tasks to run
+    Type: Number
+  AutoScalingMaxCapacity:
+    Type: Number
+    Description: The maximum number of tasks the service should scale out to
+  AutoScalingScaleInCooldown:
+    Type: Number
+    Description: The amount of time, in seconds, after a scale-in activity completes before another scale-in activity can start
+  AutoScalingScaleOutCooldown:
+    Type: Number
+    Description: The amount of time, in seconds, to wait for a previous scale-out activity to take effect.
+  AutoScalingTargetCPUPercentage:
+    Type: Number
+    Description: Aggregate CPU utilization target for auto-scaling
+  ResourceId:
+    Type: String
+    Description: The identifier of the resource associated with the scalable target.
+
+Resources:
+    GerritServiceScalingTarget:
+      Type: AWS::ApplicationAutoScaling::ScalableTarget
+      Properties:
+        MinCapacity: !Ref AutoScalingMinCapacity
+        MaxCapacity: !Ref AutoScalingMaxCapacity
+        ResourceId: !Ref ResourceId
+        RoleARN: !GetAtt [AutoscalingRole, Arn]
+        ScalableDimension: ecs:service:DesiredCount
+        ServiceNamespace: ecs
+
+    GerritServiceScalingPolicy:
+      Type: AWS::ApplicationAutoScaling::ScalingPolicy
+      Properties:
+        PolicyName: ReplicaCPUTrackingPolicy
+        PolicyType: TargetTrackingScaling
+        ScalingTargetId: !Ref GerritServiceScalingTarget
+        TargetTrackingScalingPolicyConfiguration:
+          PredefinedMetricSpecification:
+            PredefinedMetricType: ECSServiceAverageCPUUtilization
+          ScaleInCooldown: !Ref AutoScalingScaleInCooldown
+          ScaleOutCooldown: !Ref AutoScalingScaleOutCooldown
+          TargetValue: !Ref AutoScalingTargetCPUPercentage
+
+    AutoscalingRole:
+      Type: AWS::IAM::Role
+      Properties:
+        AssumeRolePolicyDocument:
+          Statement:
+            - Effect: Allow
+              Principal:
+                Service: [application-autoscaling.amazonaws.com]
+              Action: ['sts:AssumeRole']
+        Path: /
+        Policies:
+          - PolicyName: gerrit-service-autoscaling
+            PolicyDocument:
+              Statement:
+                - Effect: Allow
+                  Action:
+                    - 'application-autoscaling:*'
+                    - 'cloudwatch:DescribeAlarms'
+                    - 'cloudwatch:PutMetricAlarm'
+                    - 'ecs:DescribeServices'
+                    - 'ecs:UpdateService'
+                  Resource: '*'
\ No newline at end of file

diff --git a/dual-primary/Makefile b/dual-primary/Makefile
index 933ce14..8b44868 100644
--- a/dual-primary/Makefile
+++ b/dual-primary/Makefile

@@ -50,7 +50,10 @@
 						$(optional_git_gc_targets_creation) \
 						dns-routing wait-for-dns-routing-creation
 
-cluster: cluster-keys set-optional-gerrit-primary-volume set-optional-params-for-replica-filesystem
+cluster: cluster-keys set-optional-gerrit-primary-volume \
+			set-optional-params-for-replica-filesystem \
+			set-optional-params-for-replica-auto-scaling-capacity \
+			set-optional-params-for-replica-capacity-provider
 ifdef CLUSTER_INSTANCE_TYPE
 		$(eval CLUSTER_OPTIONAL_PARAMS := $(CLUSTER_OPTIONAL_PARAMS) ParameterKey=InstanceType,ParameterValue=$(CLUSTER_INSTANCE_TYPE))
 endif
@@ -92,7 +95,9 @@
 		ParameterKey=SubnetIdProp,ParameterValue=$(SUBNET_ID) \
 		$(CLUSTER_OPTIONAL_PARAMS) \
 		$(GERRIT_OPTIONAL_PRIMARY_VOLUME) \
-		$(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM)
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM) \
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER)
 
 service-primary-1: set-optional-params-metrics-cloudwatch set-optional-params-smtp \
 					set-optional-params-multisite set-ldap-account-pattern \
@@ -227,7 +232,13 @@
 		ParameterKey=HostedZoneName,ParameterValue=$(HOSTED_ZONE_NAME) \
 		ParameterKey=GitReplicationSubdomain,ParameterValue=$(GIT_REPLICATION_SUBDOMAIN)
 
-service-replica: set-optional-params-metrics-cloudwatch set-ldap-account-pattern set-optional-gerrit-ulimits set-optional-jgit-conf
+service-replica: set-optional-params-metrics-cloudwatch \
+					set-ldap-account-pattern \
+					set-optional-gerrit-ulimits \
+					set-optional-jgit-conf \
+					set-optional-params-for-replica-auto-scaling-capacity \
+					set-optional-params-for-replica-auto-scaling-policy
+
 ifdef GERRIT_REPLICA_INSTANCE_ID
 		$(eval REPLICA_SERVICE_OPTIONAL_PARAMS := $(REPLICA_SERVICE_OPTIONAL_PARAMS) ParameterKey=InstanceId,ParameterValue=$(GERRIT_REPLICA_INSTANCE_ID))
 endif
@@ -261,7 +272,9 @@
 		$(LDAP_ACCOUNT_PATTERN_PARAM) \
 		$(REPLICA_SERVICE_OPTIONAL_PARAMS) \
 		$(METRICS_CW_OPTIONAL_PARAMS) \
-		$(GERRIT_ULIMITS)
+		$(GERRIT_ULIMITS) \
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY)
 
 service-lb:
 ifdef LOAD_BALANCER_SCHEME

diff --git a/dual-primary/README.md b/dual-primary/README.md
index baa0874..0c0bc8e 100644
--- a/dual-primary/README.md
+++ b/dual-primary/README.md

@@ -242,6 +242,93 @@
 * `REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS`: Optional. Only used when `REPLICA_FILESYSTEM_THROUGHPUT_MODE` is set to `provisioned`.
 default: `256`.
 
+##### Auto Scaling of replicas instances
+
+Gerrit replicas have the ability to scale in or out automatically to accommodate
+increases or decreases in traffic. Such traffic typically comes from build or
+test jobs executed by some sort of automated build pipeline.
+
+Since they all [share the same git data over EFS](#shared-filesystem-for-replicas),
+replicas are immediately ready to serve traffic as soon as they come up and
+register behind the loadbalancer.
+
+There is a 1 to 1 relationship between replica and EC2 instances: on each EC2
+instance in the 'replica' ASG, runs one and only one replica task.
+Because of this, the capacity values specified for replicas (minimum, desired
+and maximum) configure both the number of tasks and the capacity of the ASG,
+since the two always need to be in sync.
+
+The scaling policy adds or removes capacity as required to keep the average CPU
+Usage (of the replica service) close to the specified target value.
+
+Now, tasks in the provisioning state that cannot find sufficient resources on
+the existing instances will automatically trigger the capacity provider to scale
+out the replica ASG. As more EC2 instances become available, tasks in the
+provisioning state will get placed onto those instances, reducing the number of
+tasks in provisioning.
+
+Conversely, as the average CPU usage (of the replica service) drops under the
+specified target value, and replica tasks get removed, the capacity provider
+will reduce the number of EC2 instances too.
+
+Note that only EC2 instances that are not running any replica task will scale in.
+
+These are the available settings:
+
+* `REPLICA_AUTOSCALING_MIN_CAPACITY` Optional. The minimum number of tasks that
+replicas should scale in to. This is also the minimum number of EC2 instances in
+the replica ASG
+default: *1*
+
+* `REPLICA_AUTOSCALING_DESIRED_CAPACITY` Optional. The desired number of
+replica tasks to run. This is also the desired number of EC2 instances in the
+replica ASG.
+default: *1*
+
+* `REPLICA_AUTOSCALING_MAX_CAPACITY` Optional. The maximum number of tasks that
+replicas should scale out to. This is also the maximum number of EC2 instances
+in the replica ASG
+default: *2*
+
+* `REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN` Optional. The amount of time, in
+seconds, after a scale-in activity completes before another scale-in activity
+can start
+default: *300* seconds
+
+* `REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN` Optional. The amount of time, in
+seconds, to wait for a previous scale-out activity to take effect
+default: *300* seconds
+
+* `REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE` Optional. Aggregate CPU
+utilization target for auto-scaling. Auto-scaling will add or remove tasks in
+the replica service to be as close as possible to this value. default: *75*
+
+* `REPLICA_CAPACITY_PROVIDER_TARGET` Optional. The target capacity value for the
+capacity provider of replicas (must be > 0 and <= 100).
+default: *100*
+
+   Setting this value to 100 means that there will be no _spare capacity_
+allocated on the replica ASG:
+
+   If 3 replica tasks are needed, then the ASG will adjust to have exactly 3 EC2 instances
+
+   Setting this value to less than 100 enables spare capacity in the ASG. For
+example, if you set this value to 50 the scaling policy will adjust the number
+of EC2 instances until it is exactly twice the number needed to run all of the
+tasks:
+
+   If 3 replica tasks are needed, then the ASG will adjust to 6 EC2 instances
+
+* `REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE` Optional. The minimum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
+* `REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE` Optional. The maximum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
 #### REPLICATION SERVICE
 
 * `REPLICATION_SERVICE_ENABLED`: Optional. Whether to expose a replication endpoint.

diff --git a/dual-primary/cf-cluster.yml b/dual-primary/cf-cluster.yml
index 17a9b9b..2797932 100644
--- a/dual-primary/cf-cluster.yml
+++ b/dual-primary/cf-cluster.yml

@@ -106,6 +106,39 @@
     Description: Gerrit replicas shared filesystem throughput, measured in MiB/s. Valid values are 1-1024.
     Type: Number
     Default: 256
+  ReplicaAutoScalingMinCapacity:
+    Type: Number
+    Description: The minimum number of tasks that replicas should scale in to
+    Default: 1
+  ReplicaAutoScalingDesiredCapacity:
+    Description: The desired number of replica tasks to run
+    Type: Number
+    Default: 1
+  ReplicaAutoScalingMaxCapacity:
+    Type: Number
+    Description: The maximum number of tasks that replicas should scale out to
+    Default: 2
+  ReplicaCapacityProviderTarget:
+    Type: Number
+    Description: The target capacity value for the capacity provider of replicas
+    ConstraintDescription: The specified value must be > 0 and <= 100
+    Default: 100
+    MinValue: 1
+    MaxValue: 100
+  ReplicaCapacityProviderMinStepSize:
+    Type: Number
+    Description: The minimum number of EC2 instances for replicas that will scale in or scale out at one time
+    ConstraintDescription: The specified value must be >= 1 and <= 10
+    Default: 1
+    MinValue: 1
+    MaxValue: 10
+  ReplicaCapacityProviderMaxStepSize:
+    Type: Number
+    Description: The maximum number of EC2 instances for replicas that will scale in or scale out at one time
+    ConstraintDescription: The specified value must be >= 1 and <= 10
+    Default: 1
+    MinValue: 1
+    MaxValue: 10
 
 Conditions:
   isProvisionedThroughput: !Equals [!Ref PrimaryFileSystemThroughputMode, "provisioned"]
@@ -121,6 +154,22 @@
   # ECS Resources
   ECSCluster:
     Type: AWS::ECS::Cluster
+    Properties:
+      ClusterName: !Sub '${AWS::StackName}-ECSCluster'
+      CapacityProviders: [ !Ref ReplicaCapacityProvider ]
+
+  ReplicaCapacityProvider:
+    Type: AWS::ECS::CapacityProvider
+    Properties:
+      Name: !Sub '${AWS::StackName}-ReplicaCapacityProvider'
+      AutoScalingGroupProvider:
+        AutoScalingGroupArn: !Ref ReplicaECSAutoScalingGroup
+        ManagedTerminationProtection: ENABLED
+        ManagedScaling:
+          MaximumScalingStepSize: !Ref ReplicaCapacityProviderMaxStepSize
+          MinimumScalingStepSize: !Ref ReplicaCapacityProviderMinStepSize
+          Status: ENABLED
+          TargetCapacity: !Ref ReplicaCapacityProviderTarget
 
   EcsHostSecurityGroup:
     Type: AWS::EC2::SecurityGroup
@@ -144,9 +193,10 @@
       VPCZoneIdentifier:
         - !If [NetworkStackNeeded, !GetAtt ECSTaskNetworkStack.Outputs.PublicSubnetOneRef, !Ref SubnetIdProp]
       LaunchConfigurationName: !Ref 'ReplicaLaunchConfiguration'
-      MinSize: '1'
-      MaxSize: '1'
-      DesiredCapacity: '1'
+      MinSize: !Ref ReplicaAutoScalingMinCapacity
+      MaxSize: !Ref ReplicaAutoScalingMaxCapacity
+      DesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+      NewInstancesProtectedFromScaleIn: true
     CreationPolicy:
       ResourceSignal:
         Timeout: PT15M
@@ -165,7 +215,7 @@
       UserData:
         Fn::Base64: !Sub |
           #!/bin/bash -xe
-          echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+          echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
           echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"replica\"} >> /etc/ecs/ecs.config
 
           # Make sure latest version of the helper scripts are installed as per recommendation:
@@ -206,7 +256,7 @@
       UserData:
         Fn::Base64: !Sub |
           #!/bin/bash -xe
-          echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+          echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
           echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"haproxy\"} >> /etc/ecs/ecs.config
 
           # Make sure latest version of the helper scripts are installed as per recommendation:

diff --git a/dual-primary/cf-service-replica.yml b/dual-primary/cf-service-replica.yml
index ff59a9a..85e9604 100644
--- a/dual-primary/cf-service-replica.yml
+++ b/dual-primary/cf-service-replica.yml

@@ -36,10 +36,6 @@
   DockerRegistryUrl:
       Description: Docker registry URL
       Type: String
-  DesiredCount:
-      Description: How many instances of this task should we run across our cluster?
-      Type: Number
-      Default: 1
   HTTPHostPort:
       Description: Gerrit Host HTTP port
       Type: Number
@@ -189,6 +185,30 @@
     Description: Comma separated list of regex patterns to exclude metrics reported to CloudWatch
     Type: CommaDelimitedList
     Default: ''
+  ReplicaAutoScalingMinCapacity:
+    Type: Number
+    Description: The minimum number of tasks that replicas should scale in to
+    Default: 1
+  ReplicaAutoScalingDesiredCapacity:
+    Description: The desired number of replica tasks to run
+    Type: Number
+    Default: 1
+  ReplicaAutoScalingMaxCapacity:
+    Type: Number
+    Description: The maximum number of tasks that replicas should scale out to
+    Default: 2
+  ReplicaAutoScalingScaleInCooldown:
+    Type: Number
+    Description: The amount of time, in seconds, after a scale-in activity completes before another scale-in activity can start
+    Default: 300
+  ReplicaAutoScalingScaleOutCooldown:
+    Type: Number
+    Description: The amount of time, in seconds, to wait for a previous scale-out activity to take effect.
+    Default: 300
+  ReplicaAutoScalingTargetCPUPercentage:
+    Type: Number
+    Description: Aggregate CPU utilization target for auto-scaling
+    Default: 75.0
 
 Resources:
     GerritService:
@@ -202,7 +222,10 @@
             Cluster:
               Fn::ImportValue:
                   !Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
-            DesiredCount: !Ref DesiredCount
+            CapacityProviderStrategy:
+              - CapacityProvider: !Sub '${ClusterStackName}-ReplicaCapacityProvider'
+                Weight: 100
+            DesiredCount: !Ref ReplicaAutoScalingDesiredCapacity
             TaskDefinition: !Ref GerritTaskDefinition
             LoadBalancers:
                 - ContainerName: !Ref GerritServiceName
@@ -385,6 +408,26 @@
                   Labels:
                     gerrit-logs: !Join ['-', [!Ref EnvironmentName, !Ref GerritLogsVolume]]
 
+    ReplicaCPUAutoScaling:
+      Type: AWS::CloudFormation::Stack
+      Properties:
+        TemplateURL: !Join [ '', ['https://', !Ref TemplateBucketName, '.s3.amazonaws.com/cf-ecs-service-cpu-autoscaling.yml'] ]
+        TimeoutInMinutes: '5'
+        Parameters:
+          AutoScalingMinCapacity: !Ref ReplicaAutoScalingMinCapacity
+          AutoScalingDesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+          AutoScalingMaxCapacity: !Ref ReplicaAutoScalingMaxCapacity
+          AutoScalingScaleInCooldown: !Ref ReplicaAutoScalingScaleInCooldown
+          AutoScalingScaleOutCooldown: !Ref ReplicaAutoScalingScaleOutCooldown
+          AutoScalingTargetCPUPercentage: !Ref ReplicaAutoScalingTargetCPUPercentage
+          ResourceId:
+            !Join
+            - ''
+            - - 'service/'
+              - Fn::ImportValue: !Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
+              - '/'
+              - !GetAtt GerritService.Name
+
     LoadBalancer:
         Type: AWS::ElasticLoadBalancingV2::LoadBalancer
         Properties:

diff --git a/dual-primary/setup.env.template b/dual-primary/setup.env.template
index 72bfd37..51b234f 100644
--- a/dual-primary/setup.env.template
+++ b/dual-primary/setup.env.template

@@ -73,4 +73,16 @@
 
 REPLICA_FILESYSTEM_ID=""
 REPLICA_FILESYSTEM_THROUGHPUT_MODE="provisioned"
-REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS="256"
\ No newline at end of file
+REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS="256"
+
+REPLICA_AUTOSCALING_MIN_CAPACITY=1
+REPLICA_AUTOSCALING_DESIRED_CAPACITY=1
+REPLICA_AUTOSCALING_MAX_CAPACITY=1
+
+REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN=300
+REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN=300
+REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
+
+REPLICA_CAPACITY_PROVIDER_TARGET=50
+REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE=1
+REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE=2
\ No newline at end of file

diff --git a/primary-replica/Makefile b/primary-replica/Makefile
index 4378812..d7f7313 100644
--- a/primary-replica/Makefile
+++ b/primary-replica/Makefile

@@ -31,7 +31,10 @@
 						$(optional_git_gc_targets_creation) \
 						dns-routing wait-for-dns-routing-creation
 
-cluster: cluster-keys set-optional-gerrit-primary-volume set-optional-params-for-replica-filesystem
+cluster: cluster-keys set-optional-gerrit-primary-volume \
+			set-optional-params-for-replica-filesystem \
+			set-optional-params-for-replica-auto-scaling-capacity \
+			set-optional-params-for-replica-capacity-provider
 ifdef CLUSTER_INSTANCE_TYPE
 		$(eval CLUSTER_OPTIONAL_PARAMS := $(CLUSTER_OPTIONAL_PARAMS) ParameterKey=InstanceType,ParameterValue=$(CLUSTER_INSTANCE_TYPE))
 endif
@@ -58,7 +61,9 @@
 		ParameterKey=SubnetIdProp,ParameterValue=$(SUBNET_ID) \
 		$(CLUSTER_OPTIONAL_PARAMS) \
 		$(GERRIT_OPTIONAL_PRIMARY_VOLUME) \
-		$(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM)
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM) \
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER)
 
 service-primary: set-optional-params-metrics-cloudwatch set-optional-params-smtp set-ldap-account-pattern set-optional-gerrit-ulimits set-optional-jgit-conf
 ifdef LOAD_BALANCER_SCHEME
@@ -103,7 +108,12 @@
 		$(GERRIT_ULIMITS)
 
 
-service-replica: set-optional-params-metrics-cloudwatch set-ldap-account-pattern set-optional-gerrit-ulimits set-optional-jgit-conf
+service-replica: set-optional-params-metrics-cloudwatch \
+					set-ldap-account-pattern \
+					set-optional-gerrit-ulimits set-optional-jgit-conf \
+					set-optional-params-for-replica-auto-scaling-capacity \
+					set-optional-params-for-replica-auto-scaling-policy
+
 ifdef LOAD_BALANCER_SCHEME
 		$(eval REPLICA_SERVICE_OPTIONAL_PARAMS := $(REPLICA_SERVICE_OPTIONAL_PARAMS) ParameterKey=LoadBalancerScheme,ParameterValue=$(LOAD_BALANCER_SCHEME))
 endif
@@ -137,7 +147,9 @@
 		$(LDAP_ACCOUNT_PATTERN_PARAM) \
 		$(REPLICA_SERVICE_OPTIONAL_PARAMS) \
 		$(METRICS_CW_OPTIONAL_PARAMS) \
-		$(GERRIT_ULIMITS)
+		$(GERRIT_ULIMITS) \
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+		$(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY)
 
 dns-routing:
 	$(AWS_FC_COMMAND) create-stack \

diff --git a/primary-replica/README.md b/primary-replica/README.md
index f7bcddf..ccb37d7 100644
--- a/primary-replica/README.md
+++ b/primary-replica/README.md

@@ -114,6 +114,93 @@
 * `REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS`: Optional. Only used when `REPLICA_FILESYSTEM_THROUGHPUT_MODE` is set to `provisioned`.
 default: `256`.
 
+##### Auto Scaling of replicas instances
+
+Gerrit replicas have the ability to scale in or out automatically to accommodate
+increases or decreases in traffic. Such traffic typically comes from build or
+test jobs executed by some sort of automated build pipeline.
+
+Since they all [share the same git data over EFS](#shared-filesystem-for-replicas),
+replicas are immediately ready to serve traffic as soon as they come up and
+register behind the loadbalancer.
+
+There is a 1 to 1 relationship between replica and EC2 instances: on each EC2
+instance in the 'replica' ASG, runs one and only one replica task.
+Because of this, the capacity values specified for replicas (minimum, desired
+and maximum) configure both the number of tasks and the capacity of the ASG,
+since the two always need to be in sync.
+
+The scaling policy adds or removes capacity as required to keep the average CPU
+Usage (of the replica service) close to the specified target value.
+
+Now, tasks in the provisioning state that cannot find sufficient resources on
+the existing instances will automatically trigger the capacity provider to scale
+out the replica ASG. As more EC2 instances become available, tasks in the
+provisioning state will get placed onto those instances, reducing the number of
+tasks in provisioning.
+
+Conversely, as the average CPU usage (of the replica service) drops under the
+specified target value, and replica tasks get removed, the capacity provider
+will reduce the number of EC2 instances too.
+
+Note that only EC2 instances that are not running any replica task will scale in.
+
+These are the available settings:
+
+* `REPLICA_AUTOSCALING_MIN_CAPACITY` Optional. The minimum number of tasks that
+replicas should scale in to. This is also the minimum number of EC2 instances in
+the replica ASG
+default: *1*
+
+* `REPLICA_AUTOSCALING_DESIRED_CAPACITY` Optional. The desired number of
+replica tasks to run. This is also the desired number of EC2 instances in the
+replica ASG.
+default: *1*
+
+* `REPLICA_AUTOSCALING_MAX_CAPACITY` Optional. The maximum number of tasks that
+replicas should scale out to. This is also the maximum number of EC2 instances
+in the replica ASG
+default: *2*
+
+* `REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN` Optional. The amount of time, in
+seconds, after a scale-in activity completes before another scale-in activity
+can start
+default: *300* seconds
+
+* `REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN` Optional. The amount of time, in
+seconds, to wait for a previous scale-out activity to take effect
+default: *300* seconds
+
+* `REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE` Optional. Aggregate CPU
+utilization target for auto-scaling. Auto-scaling will add or remove tasks in
+the replica service to be as close as possible to this value. default: *75*
+
+* `REPLICA_CAPACITY_PROVIDER_TARGET` Optional. The target capacity value for the
+capacity provider of replicas (must be > 0 and <= 100).
+default: *100*
+
+   Setting this value to 100 means that there will be no _spare capacity_
+allocated on the replica ASG:
+
+   If 3 replica tasks are needed, then the ASG will adjust to have exactly 3 EC2 instances
+
+   Setting this value to less than 100 enables spare capacity in the ASG. For
+example, if you set this value to 50 the scaling policy will adjust the number
+of EC2 instances until it is exactly twice the number needed to run all of the
+tasks:
+
+   If 3 replica tasks are needed, then the ASG will adjust to 6 EC2 instances
+
+* `REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE` Optional. The minimum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
+* `REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE` Optional. The maximum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
 ### 2 - Deploy
 
 * Create the cluster, services and DNS routing stacks:

diff --git a/primary-replica/cf-cluster.yml b/primary-replica/cf-cluster.yml
index 8360292..3523810 100644
--- a/primary-replica/cf-cluster.yml
+++ b/primary-replica/cf-cluster.yml

@@ -84,6 +84,39 @@
     Description: Gerrit replicas shared filesystem throughput, measured in MiB/s. Valid values are 1-1024.
     Type: Number
     Default: 256
+  ReplicaAutoScalingMinCapacity:
+    Type: Number
+    Description: The minimum number of EC2 instances in the replica ASG
+    Default: 1
+  ReplicaAutoScalingDesiredCapacity:
+    Description: The desired number of EC2 instances in the replica ASG
+    Type: Number
+    Default: 1
+  ReplicaAutoScalingMaxCapacity:
+    Type: Number
+    Description: The maximum number of EC2 instances in the replica ASG
+    Default: 2
+  ReplicaCapacityProviderTarget:
+    Type: Number
+    Description: The target capacity value for the capacity provider of replicas
+    ConstraintDescription: The specified value must be > 0 and <= 100
+    Default: 100
+    MinValue: 1
+    MaxValue: 100
+  ReplicaCapacityProviderMinStepSize:
+    Type: Number
+    Description: The minimum number of EC2 instances for replicas that will scale in or scale out at one time
+    ConstraintDescription: The specified value must be >= 1 and <= 10
+    Default: 1
+    MinValue: 1
+    MaxValue: 10
+  ReplicaCapacityProviderMaxStepSize:
+    Type: Number
+    Description: The maximum number of EC2 instances for replicas that will scale in or scale out at one time
+    ConstraintDescription: The specified value must be >= 1 and <= 10
+    Default: 1
+    MinValue: 1
+    MaxValue: 10
 
 Conditions:
   CreateReplicaEFS: !Equals [!Ref ReplicaFileSystemID, ""]
@@ -97,6 +130,22 @@
   # ECS Resources
   ECSCluster:
     Type: AWS::ECS::Cluster
+    Properties:
+      ClusterName: !Sub '${AWS::StackName}-ECSCluster'
+      CapacityProviders: [ !Ref ReplicaCapacityProvider ]
+
+  ReplicaCapacityProvider:
+    Type: AWS::ECS::CapacityProvider
+    Properties:
+      Name: !Sub '${AWS::StackName}-ReplicaCapacityProvider'
+      AutoScalingGroupProvider:
+        AutoScalingGroupArn: !Ref ReplicaASG
+        ManagedTerminationProtection: ENABLED
+        ManagedScaling:
+          MaximumScalingStepSize: !Ref ReplicaCapacityProviderMaxStepSize
+          MinimumScalingStepSize: !Ref ReplicaCapacityProviderMinStepSize
+          Status: ENABLED
+          TargetCapacity: !Ref ReplicaCapacityProviderTarget
 
   EcsHostSecurityGroup:
     Type: AWS::EC2::SecurityGroup
@@ -141,7 +190,7 @@
       UserData:
         Fn::Base64: !Sub |
           #!/bin/bash -xe
-          echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+          echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
           echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"primary\"} >> /etc/ecs/ecs.config
           # Make sure latest version of the helper scripts are installed as per recommendation:
           # https://github.com/awsdocs/aws-cloudformation-user-guide/blob/master/doc_source/cfn-helper-scripts-reference.md#using-the-latest-version
@@ -238,9 +287,10 @@
       VPCZoneIdentifier:
         - !If [NetworkStackNeeded, !GetAtt ECSTaskNetworkStack.Outputs.PublicSubnetOneRef, !Ref SubnetIdProp]
       LaunchConfigurationName: !Ref 'ReplicaLaunchConfiguration'
-      MinSize: '1'
-      MaxSize: '1'
-      DesiredCapacity: '1'
+      MinSize: !Ref ReplicaAutoScalingMinCapacity
+      MaxSize: !Ref ReplicaAutoScalingMaxCapacity
+      DesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+      NewInstancesProtectedFromScaleIn: true
     CreationPolicy:
       ResourceSignal:
         Timeout: PT15M
@@ -259,7 +309,7 @@
       UserData:
         Fn::Base64: !Sub |
           #!/bin/bash -xe
-          echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+          echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
           echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"replica\"} >> /etc/ecs/ecs.config
 
           # Make sure latest version of the helper scripts are installed as per recommendation:

diff --git a/primary-replica/cf-service-replica.yml b/primary-replica/cf-service-replica.yml
index fdc8350..2e3d1c6 100644
--- a/primary-replica/cf-service-replica.yml
+++ b/primary-replica/cf-service-replica.yml

@@ -36,10 +36,6 @@
   DockerRegistryUrl:
       Description: Docker registry URL
       Type: String
-  DesiredCount:
-      Description: How many instances of this task should we run across our cluster?
-      Type: Number
-      Default: 1
   HTTPHostPort:
       Description: Gerrit Host HTTP port
       Type: Number
@@ -189,6 +185,30 @@
     Description: Comma separated list of regex patterns to exclude metrics reported to CloudWatch
     Type: CommaDelimitedList
     Default: ''
+  ReplicaAutoScalingMinCapacity:
+    Type: Number
+    Description: The minimum number of tasks that replicas should scale in to
+    Default: 1
+  ReplicaAutoScalingDesiredCapacity:
+    Description: The desired number of replica tasks to run
+    Type: Number
+    Default: 1
+  ReplicaAutoScalingMaxCapacity:
+    Type: Number
+    Description: The maximum number of tasks that replicas should scale out to
+    Default: 2
+  ReplicaAutoScalingScaleInCooldown:
+    Type: Number
+    Description: The amount of time, in seconds, after a scale-in activity completes before another scale-in activity can start
+    Default: 300
+  ReplicaAutoScalingScaleOutCooldown:
+    Type: Number
+    Description: The amount of time, in seconds, to wait for a previous scale-out activity to take effect.
+    Default: 300
+  ReplicaAutoScalingTargetCPUPercentage:
+    Type: Number
+    Description: Aggregate CPU utilization target for auto-scaling
+    Default: 75.0
 
 Resources:
     GerritService:
@@ -202,7 +222,10 @@
             Cluster:
               Fn::ImportValue:
                   !Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
-            DesiredCount: !Ref DesiredCount
+            CapacityProviderStrategy:
+              - CapacityProvider: !Sub '${ClusterStackName}-ReplicaCapacityProvider'
+                Weight: 100
+            DesiredCount: !Ref ReplicaAutoScalingDesiredCapacity
             TaskDefinition: !Ref GerritTaskDefinition
             LoadBalancers:
                 - ContainerName: !Ref GerritServiceName
@@ -385,6 +408,26 @@
                   Labels:
                     gerrit-logs: !Join ['-', [!Ref EnvironmentName, !Ref GerritLogsVolume]]
 
+    ReplicaCPUAutoScaling:
+      Type: AWS::CloudFormation::Stack
+      Properties:
+        TemplateURL: !Join [ '', ['https://', !Ref TemplateBucketName, '.s3.amazonaws.com/cf-ecs-service-cpu-autoscaling.yml'] ]
+        TimeoutInMinutes: '5'
+        Parameters:
+          AutoScalingMinCapacity: !Ref ReplicaAutoScalingMinCapacity
+          AutoScalingDesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+          AutoScalingMaxCapacity: !Ref ReplicaAutoScalingMaxCapacity
+          AutoScalingScaleInCooldown: !Ref ReplicaAutoScalingScaleInCooldown
+          AutoScalingScaleOutCooldown: !Ref ReplicaAutoScalingScaleOutCooldown
+          AutoScalingTargetCPUPercentage: !Ref ReplicaAutoScalingTargetCPUPercentage
+          ResourceId:
+            !Join
+            - ''
+            - - 'service/'
+              - Fn::ImportValue: !Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
+              - '/'
+              - !GetAtt GerritService.Name
+
     LoadBalancer:
         Type: AWS::ElasticLoadBalancingV2::LoadBalancer
         Properties:

diff --git a/primary-replica/setup.env.template b/primary-replica/setup.env.template
index 5d2418c..7a3e5f7 100644
--- a/primary-replica/setup.env.template
+++ b/primary-replica/setup.env.template

@@ -48,7 +48,19 @@
 SERVICE_GIT_GC_STACK_NAME=$(AWS_PREFIX)-scheduled-gc
 GIT_GC_CRON_EXPRESSION="0 2 ? * SAT *"
 GIT_GC_PROJECT_LIST="All-Users"
-             
+
 REPLICA_FILESYSTEM_ID=""
 REPLICA_FILESYSTEM_THROUGHPUT_MODE="provisioned"
-REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS="256"
\ No newline at end of file
+REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS="256"
+
+REPLICA_AUTOSCALING_MIN_CAPACITY=1
+REPLICA_AUTOSCALING_DESIRED_CAPACITY=1
+REPLICA_AUTOSCALING_MAX_CAPACITY=1
+
+REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN=300
+REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN=300
+REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
+
+REPLICA_CAPACITY_PROVIDER_TARGET=50
+REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE=1
+REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE=2
\ No newline at end of file
commit	a304efe7430f7584e8a2f6f8447b1676be4916a7	[log] [tgz]
author	Luca Milanesio <luca.milanesio@gmail.com>	Mon Mar 15 21:27:51 2021 +0000
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>	Mon Mar 15 21:27:51 2021 +0000
tree	678557c1f29fa40f98ea4d8873308ad964465deb
parent	9311a0531448cec3f2cd00adcbc76c6c9223ff86 [diff]
parent	a43152a9d4d4306d56a2c82334afbc022ab8fd8d [diff]