Merge "Remove unneeded first time redirect"
diff --git a/Makefile.common b/Makefile.common
index 78c1f9a..47c08a5 100644
--- a/Makefile.common
+++ b/Makefile.common
@@ -21,6 +21,7 @@
aws s3 cp ../common-templates/cf-gerrit-volume.yml s3://$(TEMPLATE_BUCKET_NAME)/
aws s3 cp ../common-templates/cf-primary-asg.yml s3://$(TEMPLATE_BUCKET_NAME)/
aws s3 cp ../common-templates/cf-efs-stack.yml s3://$(TEMPLATE_BUCKET_NAME)/
+ aws s3 cp ../common-templates/cf-ecs-service-cpu-autoscaling.yml s3://$(TEMPLATE_BUCKET_NAME)/
set-optional-params-metrics-cloudwatch:
ifdef METRICS_CLOUDWATCH_ENABLED
@@ -108,6 +109,42 @@
$(eval GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM := $(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM) ParameterKey=ReplicaProvisionedThroughputInMibps,ParameterValue=$(REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS))
endif
+set-optional-params-for-replica-auto-scaling-capacity:
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY=)
+ifdef REPLICA_AUTOSCALING_MIN_CAPACITY
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) ParameterKey=ReplicaAutoScalingMinCapacity,ParameterValue=$(REPLICA_AUTOSCALING_MIN_CAPACITY))
+endif
+ifdef REPLICA_AUTOSCALING_DESIRED_CAPACITY
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) ParameterKey=ReplicaAutoScalingDesiredCapacity,ParameterValue=$(REPLICA_AUTOSCALING_DESIRED_CAPACITY))
+endif
+ifdef REPLICA_AUTOSCALING_MAX_CAPACITY
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) ParameterKey=ReplicaAutoScalingMaxCapacity,ParameterValue=$(REPLICA_AUTOSCALING_MAX_CAPACITY))
+endif
+
+set-optional-params-for-replica-auto-scaling-policy:
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY=)
+ifdef REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY) ParameterKey=ReplicaAutoScalingScaleInCooldown,ParameterValue=$(REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN))
+endif
+ifdef REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY) ParameterKey=ReplicaAutoScalingScaleOutCooldown,ParameterValue=$(REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN))
+endif
+ifdef REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY := $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY) ParameterKey=ReplicaAutoScalingTargetCPUPercentage,ParameterValue=$(REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE))
+endif
+
+set-optional-params-for-replica-capacity-provider:
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER=)
+ifdef REPLICA_CAPACITY_PROVIDER_TARGET
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER := $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER) ParameterKey=ReplicaCapacityProviderTarget,ParameterValue=$(REPLICA_CAPACITY_PROVIDER_TARGET))
+endif
+ifdef REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER := $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER) ParameterKey=ReplicaCapacityProviderMinStepSize,ParameterValue=$(REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE))
+endif
+ifdef REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE
+ $(eval GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER := $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER) ParameterKey=ReplicaCapacityProviderMaxStepSize,ParameterValue=$(REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE))
+endif
+
confirm-persistent-stack-deletion:
@echo ""
@echo "* * * * WARNING * * * * this is going to completely destroy the stack, including git data."
diff --git a/common-templates/cf-ecs-service-cpu-autoscaling.yml b/common-templates/cf-ecs-service-cpu-autoscaling.yml
new file mode 100644
index 0000000..0c3b77b
--- /dev/null
+++ b/common-templates/cf-ecs-service-cpu-autoscaling.yml
@@ -0,0 +1,71 @@
+AWSTemplateFormatVersion: '2010-09-09'
+Description: Resources for CPU-based target-tracking auto-scaling of an ECS service.
+Parameters:
+ AutoScalingMinCapacity:
+ Type: Number
+ Description: The minimum number of tasks the service should scale in to
+ AutoScalingDesiredCapacity:
+ Description: The desired number of tasks to run
+ Type: Number
+ AutoScalingMaxCapacity:
+ Type: Number
+ Description: The maximum number of tasks the service should scale out to
+ AutoScalingScaleInCooldown:
+ Type: Number
+ Description: The amount of time, in seconds, after a scale-in activity completes before another scale-in activity can start
+ AutoScalingScaleOutCooldown:
+ Type: Number
+ Description: The amount of time, in seconds, to wait for a previous scale-out activity to take effect.
+ AutoScalingTargetCPUPercentage:
+ Type: Number
+ Description: Aggregate CPU utilization target for auto-scaling
+ ResourceId:
+ Type: String
+ Description: The identifier of the resource associated with the scalable target.
+
+Resources:
+ GerritServiceScalingTarget:
+ Type: AWS::ApplicationAutoScaling::ScalableTarget
+ Properties:
+ MinCapacity: !Ref AutoScalingMinCapacity
+ MaxCapacity: !Ref AutoScalingMaxCapacity
+ ResourceId: !Ref ResourceId
+ RoleARN: !GetAtt [AutoscalingRole, Arn]
+ ScalableDimension: ecs:service:DesiredCount
+ ServiceNamespace: ecs
+
+ GerritServiceScalingPolicy:
+ Type: AWS::ApplicationAutoScaling::ScalingPolicy
+ Properties:
+ PolicyName: ReplicaCPUTrackingPolicy
+ PolicyType: TargetTrackingScaling
+ ScalingTargetId: !Ref GerritServiceScalingTarget
+ TargetTrackingScalingPolicyConfiguration:
+ PredefinedMetricSpecification:
+ PredefinedMetricType: ECSServiceAverageCPUUtilization
+ ScaleInCooldown: !Ref AutoScalingScaleInCooldown
+ ScaleOutCooldown: !Ref AutoScalingScaleOutCooldown
+ TargetValue: !Ref AutoScalingTargetCPUPercentage
+
+ AutoscalingRole:
+ Type: AWS::IAM::Role
+ Properties:
+ AssumeRolePolicyDocument:
+ Statement:
+ - Effect: Allow
+ Principal:
+ Service: [application-autoscaling.amazonaws.com]
+ Action: ['sts:AssumeRole']
+ Path: /
+ Policies:
+ - PolicyName: gerrit-service-autoscaling
+ PolicyDocument:
+ Statement:
+ - Effect: Allow
+ Action:
+ - 'application-autoscaling:*'
+ - 'cloudwatch:DescribeAlarms'
+ - 'cloudwatch:PutMetricAlarm'
+ - 'ecs:DescribeServices'
+ - 'ecs:UpdateService'
+ Resource: '*'
\ No newline at end of file
diff --git a/dual-primary/Makefile b/dual-primary/Makefile
index 933ce14..8b44868 100644
--- a/dual-primary/Makefile
+++ b/dual-primary/Makefile
@@ -50,7 +50,10 @@
$(optional_git_gc_targets_creation) \
dns-routing wait-for-dns-routing-creation
-cluster: cluster-keys set-optional-gerrit-primary-volume set-optional-params-for-replica-filesystem
+cluster: cluster-keys set-optional-gerrit-primary-volume \
+ set-optional-params-for-replica-filesystem \
+ set-optional-params-for-replica-auto-scaling-capacity \
+ set-optional-params-for-replica-capacity-provider
ifdef CLUSTER_INSTANCE_TYPE
$(eval CLUSTER_OPTIONAL_PARAMS := $(CLUSTER_OPTIONAL_PARAMS) ParameterKey=InstanceType,ParameterValue=$(CLUSTER_INSTANCE_TYPE))
endif
@@ -92,7 +95,9 @@
ParameterKey=SubnetIdProp,ParameterValue=$(SUBNET_ID) \
$(CLUSTER_OPTIONAL_PARAMS) \
$(GERRIT_OPTIONAL_PRIMARY_VOLUME) \
- $(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM)
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM) \
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER)
service-primary-1: set-optional-params-metrics-cloudwatch set-optional-params-smtp \
set-optional-params-multisite set-ldap-account-pattern \
@@ -227,7 +232,13 @@
ParameterKey=HostedZoneName,ParameterValue=$(HOSTED_ZONE_NAME) \
ParameterKey=GitReplicationSubdomain,ParameterValue=$(GIT_REPLICATION_SUBDOMAIN)
-service-replica: set-optional-params-metrics-cloudwatch set-ldap-account-pattern set-optional-gerrit-ulimits set-optional-jgit-conf
+service-replica: set-optional-params-metrics-cloudwatch \
+ set-ldap-account-pattern \
+ set-optional-gerrit-ulimits \
+ set-optional-jgit-conf \
+ set-optional-params-for-replica-auto-scaling-capacity \
+ set-optional-params-for-replica-auto-scaling-policy
+
ifdef GERRIT_REPLICA_INSTANCE_ID
$(eval REPLICA_SERVICE_OPTIONAL_PARAMS := $(REPLICA_SERVICE_OPTIONAL_PARAMS) ParameterKey=InstanceId,ParameterValue=$(GERRIT_REPLICA_INSTANCE_ID))
endif
@@ -261,7 +272,9 @@
$(LDAP_ACCOUNT_PATTERN_PARAM) \
$(REPLICA_SERVICE_OPTIONAL_PARAMS) \
$(METRICS_CW_OPTIONAL_PARAMS) \
- $(GERRIT_ULIMITS)
+ $(GERRIT_ULIMITS) \
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY)
service-lb:
ifdef LOAD_BALANCER_SCHEME
diff --git a/dual-primary/README.md b/dual-primary/README.md
index baa0874..0c0bc8e 100644
--- a/dual-primary/README.md
+++ b/dual-primary/README.md
@@ -242,6 +242,93 @@
* `REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS`: Optional. Only used when `REPLICA_FILESYSTEM_THROUGHPUT_MODE` is set to `provisioned`.
default: `256`.
+##### Auto Scaling of replica instances
+
+Gerrit replicas can scale in or out automatically to accommodate increases or
+decreases in traffic. This traffic typically comes from build or test jobs
+executed by some sort of automated build pipeline.
+
+Since they all [share the same git data over EFS](#shared-filesystem-for-replicas),
+replicas are immediately ready to serve traffic as soon as they come up and
+register behind the loadbalancer.
+
+There is a 1 to 1 relationship between replica tasks and EC2 instances: each EC2
+instance in the 'replica' ASG runs one and only one replica task.
+Because of this, the capacity values specified for replicas (minimum, desired
+and maximum) configure both the number of tasks and the size of the ASG, since
+the two always need to be in sync.
+
+The scaling policy adds or removes capacity as required to keep the average CPU
+Usage (of the replica service) close to the specified target value.
+
+Now, tasks in the provisioning state that cannot find sufficient resources on
+the existing instances will automatically trigger the capacity provider to scale
+out the replica ASG. As more EC2 instances become available, tasks in the
+provisioning state will get placed onto those instances, reducing the number of
+tasks in provisioning.
+
+Conversely, as the average CPU usage (of the replica service) drops under the
+specified target value, and replica tasks get removed, the capacity provider
+will reduce the number of EC2 instances too.
+
+Note that only EC2 instances that are not running any replica task will scale in.
+
+These are the available settings:
+
+* `REPLICA_AUTOSCALING_MIN_CAPACITY` Optional. The minimum number of tasks that
+replicas should scale in to. This is also the minimum number of EC2 instances in
+the replica ASG
+default: *1*
+
+* `REPLICA_AUTOSCALING_DESIRED_CAPACITY` Optional. The desired number of
+replica tasks to run. This is also the desired number of EC2 instances in the
+replica ASG.
+default: *1*
+
+* `REPLICA_AUTOSCALING_MAX_CAPACITY` Optional. The maximum number of tasks that
+replicas should scale out to. This is also the maximum number of EC2 instances
+in the replica ASG
+default: *2*
+
+* `REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN` Optional. The amount of time, in
+seconds, after a scale-in activity completes before another scale-in activity
+can start
+default: *300* seconds
+
+* `REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN` Optional. The amount of time, in
+seconds, to wait for a previous scale-out activity to take effect
+default: *300* seconds
+
+* `REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE` Optional. Aggregate CPU utilization target for auto-scaling.
+Auto-scaling will add or remove tasks in the replica service to be as close as possible to this value
+default: *75*
+
+* `REPLICA_CAPACITY_PROVIDER_TARGET` Optional. The target capacity value for the
+capacity provider of replicas (must be > 0 and <= 100).
+default: *100*
+
+ Setting this value to 100 means that there will be no _spare capacity_
+allocated on the replica ASG:
+
+ If 3 replica tasks are needed, then the ASG will adjust to have exactly 3 EC2 instances
+
+ Setting this value to less than 100 enables spare capacity in the ASG. For
+example, if you set this value to 50 the scaling policy will adjust the number
+of EC2 instances until it is exactly twice the number needed to run all of the
+tasks:
+
+ If 3 replica tasks are needed, then the ASG will adjust to 6 EC2 instances
+
+* `REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE` Optional. The minimum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
+* `REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE` Optional. The maximum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
#### REPLICATION SERVICE
* `REPLICATION_SERVICE_ENABLED`: Optional. Whether to expose a replication endpoint.
diff --git a/dual-primary/cf-cluster.yml b/dual-primary/cf-cluster.yml
index 17a9b9b..2797932 100644
--- a/dual-primary/cf-cluster.yml
+++ b/dual-primary/cf-cluster.yml
@@ -106,6 +106,39 @@
Description: Gerrit replicas shared filesystem throughput, measured in MiB/s. Valid values are 1-1024.
Type: Number
Default: 256
+ ReplicaAutoScalingMinCapacity:
+ Type: Number
+ Description: The minimum number of EC2 instances in the replica ASG
+ Default: 1
+ ReplicaAutoScalingDesiredCapacity:
+ Description: The desired number of EC2 instances in the replica ASG
+ Type: Number
+ Default: 1
+ ReplicaAutoScalingMaxCapacity:
+ Type: Number
+ Description: The maximum number of EC2 instances in the replica ASG
+ Default: 2
+ ReplicaCapacityProviderTarget:
+ Type: Number
+ Description: The target capacity value for the capacity provider of replicas
+ ConstraintDescription: The specified value must be > 0 and <= 100
+ Default: 100
+ MinValue: 1
+ MaxValue: 100
+ ReplicaCapacityProviderMinStepSize:
+ Type: Number
+ Description: The minimum number of EC2 instances for replicas that will scale in or scale out at one time
+ ConstraintDescription: The specified value must be >= 1 and <= 10
+ Default: 1
+ MinValue: 1
+ MaxValue: 10
+ ReplicaCapacityProviderMaxStepSize:
+ Type: Number
+ Description: The maximum number of EC2 instances for replicas that will scale in or scale out at one time
+ ConstraintDescription: The specified value must be >= 1 and <= 10
+ Default: 1
+ MinValue: 1
+ MaxValue: 10
Conditions:
isProvisionedThroughput: !Equals [!Ref PrimaryFileSystemThroughputMode, "provisioned"]
@@ -121,6 +154,22 @@
# ECS Resources
ECSCluster:
Type: AWS::ECS::Cluster
+ Properties:
+ ClusterName: !Sub '${AWS::StackName}-ECSCluster'
+ CapacityProviders: [ !Ref ReplicaCapacityProvider ]
+
+ ReplicaCapacityProvider:
+ Type: AWS::ECS::CapacityProvider
+ Properties:
+ Name: !Sub '${AWS::StackName}-ReplicaCapacityProvider'
+ AutoScalingGroupProvider:
+ AutoScalingGroupArn: !Ref ReplicaECSAutoScalingGroup
+ ManagedTerminationProtection: ENABLED
+ ManagedScaling:
+ MaximumScalingStepSize: !Ref ReplicaCapacityProviderMaxStepSize
+ MinimumScalingStepSize: !Ref ReplicaCapacityProviderMinStepSize
+ Status: ENABLED
+ TargetCapacity: !Ref ReplicaCapacityProviderTarget
EcsHostSecurityGroup:
Type: AWS::EC2::SecurityGroup
@@ -144,9 +193,10 @@
VPCZoneIdentifier:
- !If [NetworkStackNeeded, !GetAtt ECSTaskNetworkStack.Outputs.PublicSubnetOneRef, !Ref SubnetIdProp]
LaunchConfigurationName: !Ref 'ReplicaLaunchConfiguration'
- MinSize: '1'
- MaxSize: '1'
- DesiredCapacity: '1'
+ MinSize: !Ref ReplicaAutoScalingMinCapacity
+ MaxSize: !Ref ReplicaAutoScalingMaxCapacity
+ DesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+ NewInstancesProtectedFromScaleIn: true
CreationPolicy:
ResourceSignal:
Timeout: PT15M
@@ -165,7 +215,7 @@
UserData:
Fn::Base64: !Sub |
#!/bin/bash -xe
- echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+ echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"replica\"} >> /etc/ecs/ecs.config
# Make sure latest version of the helper scripts are installed as per recommendation:
@@ -206,7 +256,7 @@
UserData:
Fn::Base64: !Sub |
#!/bin/bash -xe
- echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+ echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"haproxy\"} >> /etc/ecs/ecs.config
# Make sure latest version of the helper scripts are installed as per recommendation:
diff --git a/dual-primary/cf-service-replica.yml b/dual-primary/cf-service-replica.yml
index ff59a9a..85e9604 100644
--- a/dual-primary/cf-service-replica.yml
+++ b/dual-primary/cf-service-replica.yml
@@ -36,10 +36,6 @@
DockerRegistryUrl:
Description: Docker registry URL
Type: String
- DesiredCount:
- Description: How many instances of this task should we run across our cluster?
- Type: Number
- Default: 1
HTTPHostPort:
Description: Gerrit Host HTTP port
Type: Number
@@ -189,6 +185,30 @@
Description: Comma separated list of regex patterns to exclude metrics reported to CloudWatch
Type: CommaDelimitedList
Default: ''
+ ReplicaAutoScalingMinCapacity:
+ Type: Number
+ Description: The minimum number of tasks that replicas should scale in to
+ Default: 1
+ ReplicaAutoScalingDesiredCapacity:
+ Description: The desired number of replica tasks to run
+ Type: Number
+ Default: 1
+ ReplicaAutoScalingMaxCapacity:
+ Type: Number
+ Description: The maximum number of tasks that replicas should scale out to
+ Default: 2
+ ReplicaAutoScalingScaleInCooldown:
+ Type: Number
+ Description: The amount of time, in seconds, after a scale-in activity completes before another scale-in activity can start
+ Default: 300
+ ReplicaAutoScalingScaleOutCooldown:
+ Type: Number
+ Description: The amount of time, in seconds, to wait for a previous scale-out activity to take effect.
+ Default: 300
+ ReplicaAutoScalingTargetCPUPercentage:
+ Type: Number
+ Description: Aggregate CPU utilization target for auto-scaling
+ Default: 75.0
Resources:
GerritService:
@@ -202,7 +222,10 @@
Cluster:
Fn::ImportValue:
!Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
- DesiredCount: !Ref DesiredCount
+ CapacityProviderStrategy:
+ - CapacityProvider: !Sub '${ClusterStackName}-ReplicaCapacityProvider'
+ Weight: 100
+ DesiredCount: !Ref ReplicaAutoScalingDesiredCapacity
TaskDefinition: !Ref GerritTaskDefinition
LoadBalancers:
- ContainerName: !Ref GerritServiceName
@@ -385,6 +408,26 @@
Labels:
gerrit-logs: !Join ['-', [!Ref EnvironmentName, !Ref GerritLogsVolume]]
+ ReplicaCPUAutoScaling:
+ Type: AWS::CloudFormation::Stack
+ Properties:
+ TemplateURL: !Join [ '', ['https://', !Ref TemplateBucketName, '.s3.amazonaws.com/cf-ecs-service-cpu-autoscaling.yml'] ]
+ TimeoutInMinutes: '5'
+ Parameters:
+ AutoScalingMinCapacity: !Ref ReplicaAutoScalingMinCapacity
+ AutoScalingDesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+ AutoScalingMaxCapacity: !Ref ReplicaAutoScalingMaxCapacity
+ AutoScalingScaleInCooldown: !Ref ReplicaAutoScalingScaleInCooldown
+ AutoScalingScaleOutCooldown: !Ref ReplicaAutoScalingScaleOutCooldown
+ AutoScalingTargetCPUPercentage: !Ref ReplicaAutoScalingTargetCPUPercentage
+ ResourceId:
+ !Join
+ - ''
+ - - 'service/'
+ - Fn::ImportValue: !Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
+ - '/'
+ - !GetAtt GerritService.Name
+
LoadBalancer:
Type: AWS::ElasticLoadBalancingV2::LoadBalancer
Properties:
diff --git a/dual-primary/setup.env.template b/dual-primary/setup.env.template
index 72bfd37..51b234f 100644
--- a/dual-primary/setup.env.template
+++ b/dual-primary/setup.env.template
@@ -73,4 +73,16 @@
REPLICA_FILESYSTEM_ID=""
REPLICA_FILESYSTEM_THROUGHPUT_MODE="provisioned"
-REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS="256"
\ No newline at end of file
+REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS="256"
+
+REPLICA_AUTOSCALING_MIN_CAPACITY=1
+REPLICA_AUTOSCALING_DESIRED_CAPACITY=1
+REPLICA_AUTOSCALING_MAX_CAPACITY=1
+
+REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN=300
+REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN=300
+REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
+
+REPLICA_CAPACITY_PROVIDER_TARGET=50
+REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE=1
+REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE=2
\ No newline at end of file
diff --git a/primary-replica/Makefile b/primary-replica/Makefile
index 4378812..d7f7313 100644
--- a/primary-replica/Makefile
+++ b/primary-replica/Makefile
@@ -31,7 +31,10 @@
$(optional_git_gc_targets_creation) \
dns-routing wait-for-dns-routing-creation
-cluster: cluster-keys set-optional-gerrit-primary-volume set-optional-params-for-replica-filesystem
+cluster: cluster-keys set-optional-gerrit-primary-volume \
+ set-optional-params-for-replica-filesystem \
+ set-optional-params-for-replica-auto-scaling-capacity \
+ set-optional-params-for-replica-capacity-provider
ifdef CLUSTER_INSTANCE_TYPE
$(eval CLUSTER_OPTIONAL_PARAMS := $(CLUSTER_OPTIONAL_PARAMS) ParameterKey=InstanceType,ParameterValue=$(CLUSTER_INSTANCE_TYPE))
endif
@@ -58,7 +61,9 @@
ParameterKey=SubnetIdProp,ParameterValue=$(SUBNET_ID) \
$(CLUSTER_OPTIONAL_PARAMS) \
$(GERRIT_OPTIONAL_PRIMARY_VOLUME) \
- $(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM)
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_FILESYSTEM) \
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_CAPACITY_PROVIDER)
service-primary: set-optional-params-metrics-cloudwatch set-optional-params-smtp set-ldap-account-pattern set-optional-gerrit-ulimits set-optional-jgit-conf
ifdef LOAD_BALANCER_SCHEME
@@ -103,7 +108,12 @@
$(GERRIT_ULIMITS)
-service-replica: set-optional-params-metrics-cloudwatch set-ldap-account-pattern set-optional-gerrit-ulimits set-optional-jgit-conf
+service-replica: set-optional-params-metrics-cloudwatch \
+ set-ldap-account-pattern \
+ set-optional-gerrit-ulimits set-optional-jgit-conf \
+ set-optional-params-for-replica-auto-scaling-capacity \
+ set-optional-params-for-replica-auto-scaling-policy
+
ifdef LOAD_BALANCER_SCHEME
$(eval REPLICA_SERVICE_OPTIONAL_PARAMS := $(REPLICA_SERVICE_OPTIONAL_PARAMS) ParameterKey=LoadBalancerScheme,ParameterValue=$(LOAD_BALANCER_SCHEME))
endif
@@ -137,7 +147,9 @@
$(LDAP_ACCOUNT_PATTERN_PARAM) \
$(REPLICA_SERVICE_OPTIONAL_PARAMS) \
$(METRICS_CW_OPTIONAL_PARAMS) \
- $(GERRIT_ULIMITS)
+ $(GERRIT_ULIMITS) \
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_CAPACITY) \
+ $(GERRIT_OPTIONAL_PARAMS_REPLICA_AUTO_SCALING_POLICY)
dns-routing:
$(AWS_FC_COMMAND) create-stack \
diff --git a/primary-replica/README.md b/primary-replica/README.md
index f7bcddf..ccb37d7 100644
--- a/primary-replica/README.md
+++ b/primary-replica/README.md
@@ -114,6 +114,93 @@
* `REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS`: Optional. Only used when `REPLICA_FILESYSTEM_THROUGHPUT_MODE` is set to `provisioned`.
default: `256`.
+##### Auto Scaling of replica instances
+
+Gerrit replicas can scale in or out automatically to accommodate increases or
+decreases in traffic. This traffic typically comes from build or test jobs
+executed by some sort of automated build pipeline.
+
+Since they all [share the same git data over EFS](#shared-filesystem-for-replicas),
+replicas are immediately ready to serve traffic as soon as they come up and
+register behind the loadbalancer.
+
+There is a 1 to 1 relationship between replica tasks and EC2 instances: each EC2
+instance in the 'replica' ASG runs one and only one replica task.
+Because of this, the capacity values specified for replicas (minimum, desired
+and maximum) configure both the number of tasks and the size of the ASG, since
+the two always need to be in sync.
+
+The scaling policy adds or removes capacity as required to keep the average CPU
+Usage (of the replica service) close to the specified target value.
+
+Now, tasks in the provisioning state that cannot find sufficient resources on
+the existing instances will automatically trigger the capacity provider to scale
+out the replica ASG. As more EC2 instances become available, tasks in the
+provisioning state will get placed onto those instances, reducing the number of
+tasks in provisioning.
+
+Conversely, as the average CPU usage (of the replica service) drops under the
+specified target value, and replica tasks get removed, the capacity provider
+will reduce the number of EC2 instances too.
+
+Note that only EC2 instances that are not running any replica task will scale in.
+
+These are the available settings:
+
+* `REPLICA_AUTOSCALING_MIN_CAPACITY` Optional. The minimum number of tasks that
+replicas should scale in to. This is also the minimum number of EC2 instances in
+the replica ASG
+default: *1*
+
+* `REPLICA_AUTOSCALING_DESIRED_CAPACITY` Optional. The desired number of
+replica tasks to run. This is also the desired number of EC2 instances in the
+replica ASG.
+default: *1*
+
+* `REPLICA_AUTOSCALING_MAX_CAPACITY` Optional. The maximum number of tasks that
+replicas should scale out to. This is also the maximum number of EC2 instances
+in the replica ASG
+default: *2*
+
+* `REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN` Optional. The amount of time, in
+seconds, after a scale-in activity completes before another scale-in activity
+can start
+default: *300* seconds
+
+* `REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN` Optional. The amount of time, in
+seconds, to wait for a previous scale-out activity to take effect
+default: *300* seconds
+
+* `REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE` Optional. Aggregate CPU utilization target for auto-scaling.
+Auto-scaling will add or remove tasks in the replica service to be as close as possible to this value
+default: *75*
+
+* `REPLICA_CAPACITY_PROVIDER_TARGET` Optional. The target capacity value for the
+capacity provider of replicas (must be > 0 and <= 100).
+default: *100*
+
+ Setting this value to 100 means that there will be no _spare capacity_
+allocated on the replica ASG:
+
+ If 3 replica tasks are needed, then the ASG will adjust to have exactly 3 EC2 instances
+
+ Setting this value to less than 100 enables spare capacity in the ASG. For
+example, if you set this value to 50 the scaling policy will adjust the number
+of EC2 instances until it is exactly twice the number needed to run all of the
+tasks:
+
+ If 3 replica tasks are needed, then the ASG will adjust to 6 EC2 instances
+
+* `REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE` Optional. The minimum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
+* `REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE` Optional. The maximum number of EC2
+instances for replicas that will scale in or scale out at one time (must be >= 1
+and <= 10)
+default: *1*
+
### 2 - Deploy
* Create the cluster, services and DNS routing stacks:
diff --git a/primary-replica/cf-cluster.yml b/primary-replica/cf-cluster.yml
index 8360292..3523810 100644
--- a/primary-replica/cf-cluster.yml
+++ b/primary-replica/cf-cluster.yml
@@ -84,6 +84,39 @@
Description: Gerrit replicas shared filesystem throughput, measured in MiB/s. Valid values are 1-1024.
Type: Number
Default: 256
+ ReplicaAutoScalingMinCapacity:
+ Type: Number
+ Description: The minimum number of EC2 instances in the replica ASG
+ Default: 1
+ ReplicaAutoScalingDesiredCapacity:
+ Description: The desired number of EC2 instances in the replica ASG
+ Type: Number
+ Default: 1
+ ReplicaAutoScalingMaxCapacity:
+ Type: Number
+ Description: The maximum number of EC2 instances in the replica ASG
+ Default: 2
+ ReplicaCapacityProviderTarget:
+ Type: Number
+ Description: The target capacity value for the capacity provider of replicas
+ ConstraintDescription: The specified value must be > 0 and <= 100
+ Default: 100
+ MinValue: 1
+ MaxValue: 100
+ ReplicaCapacityProviderMinStepSize:
+ Type: Number
+ Description: The minimum number of EC2 instances for replicas that will scale in or scale out at one time
+ ConstraintDescription: The specified value must be >= 1 and <= 10
+ Default: 1
+ MinValue: 1
+ MaxValue: 10
+ ReplicaCapacityProviderMaxStepSize:
+ Type: Number
+ Description: The maximum number of EC2 instances for replicas that will scale in or scale out at one time
+ ConstraintDescription: The specified value must be >= 1 and <= 10
+ Default: 1
+ MinValue: 1
+ MaxValue: 10
Conditions:
CreateReplicaEFS: !Equals [!Ref ReplicaFileSystemID, ""]
@@ -97,6 +130,22 @@
# ECS Resources
ECSCluster:
Type: AWS::ECS::Cluster
+ Properties:
+ ClusterName: !Sub '${AWS::StackName}-ECSCluster'
+ CapacityProviders: [ !Ref ReplicaCapacityProvider ]
+
+ ReplicaCapacityProvider:
+ Type: AWS::ECS::CapacityProvider
+ Properties:
+ Name: !Sub '${AWS::StackName}-ReplicaCapacityProvider'
+ AutoScalingGroupProvider:
+ AutoScalingGroupArn: !Ref ReplicaASG
+ ManagedTerminationProtection: ENABLED
+ ManagedScaling:
+ MaximumScalingStepSize: !Ref ReplicaCapacityProviderMaxStepSize
+ MinimumScalingStepSize: !Ref ReplicaCapacityProviderMinStepSize
+ Status: ENABLED
+ TargetCapacity: !Ref ReplicaCapacityProviderTarget
EcsHostSecurityGroup:
Type: AWS::EC2::SecurityGroup
@@ -141,7 +190,7 @@
UserData:
Fn::Base64: !Sub |
#!/bin/bash -xe
- echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+ echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"primary\"} >> /etc/ecs/ecs.config
# Make sure latest version of the helper scripts are installed as per recommendation:
# https://github.com/awsdocs/aws-cloudformation-user-guide/blob/master/doc_source/cfn-helper-scripts-reference.md#using-the-latest-version
@@ -238,9 +287,10 @@
VPCZoneIdentifier:
- !If [NetworkStackNeeded, !GetAtt ECSTaskNetworkStack.Outputs.PublicSubnetOneRef, !Ref SubnetIdProp]
LaunchConfigurationName: !Ref 'ReplicaLaunchConfiguration'
- MinSize: '1'
- MaxSize: '1'
- DesiredCapacity: '1'
+ MinSize: !Ref ReplicaAutoScalingMinCapacity
+ MaxSize: !Ref ReplicaAutoScalingMaxCapacity
+ DesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+ NewInstancesProtectedFromScaleIn: true
CreationPolicy:
ResourceSignal:
Timeout: PT15M
@@ -259,7 +309,7 @@
UserData:
Fn::Base64: !Sub |
#!/bin/bash -xe
- echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
+ echo ECS_CLUSTER=${AWS::StackName}-ECSCluster >> /etc/ecs/ecs.config
echo ECS_INSTANCE_ATTRIBUTES={\"target_group\":\"replica\"} >> /etc/ecs/ecs.config
# Make sure latest version of the helper scripts are installed as per recommendation:
diff --git a/primary-replica/cf-service-replica.yml b/primary-replica/cf-service-replica.yml
index fdc8350..2e3d1c6 100644
--- a/primary-replica/cf-service-replica.yml
+++ b/primary-replica/cf-service-replica.yml
@@ -36,10 +36,6 @@
DockerRegistryUrl:
Description: Docker registry URL
Type: String
- DesiredCount:
- Description: How many instances of this task should we run across our cluster?
- Type: Number
- Default: 1
HTTPHostPort:
Description: Gerrit Host HTTP port
Type: Number
@@ -189,6 +185,30 @@
Description: Comma separated list of regex patterns to exclude metrics reported to CloudWatch
Type: CommaDelimitedList
Default: ''
+ ReplicaAutoScalingMinCapacity:
+ Type: Number
+ Description: The minimum number of tasks that replicas should scale in to
+ Default: 1
+ ReplicaAutoScalingDesiredCapacity:
+ Description: The desired number of replica tasks to run
+ Type: Number
+ Default: 1
+ ReplicaAutoScalingMaxCapacity:
+ Type: Number
+ Description: The maximum number of tasks that replicas should scale out to
+ Default: 2
+ ReplicaAutoScalingScaleInCooldown:
+ Type: Number
+ Description: The amount of time, in seconds, after a scale-in activity completes before another scale-in activity can start
+ Default: 300
+ ReplicaAutoScalingScaleOutCooldown:
+ Type: Number
+ Description: The amount of time, in seconds, to wait for a previous scale-out activity to take effect.
+ Default: 300
+ ReplicaAutoScalingTargetCPUPercentage:
+ Type: Number
+ Description: Aggregate CPU utilization target for auto-scaling
+ Default: 75.0
Resources:
GerritService:
@@ -202,7 +222,10 @@
Cluster:
Fn::ImportValue:
!Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
- DesiredCount: !Ref DesiredCount
+ CapacityProviderStrategy:
+ - CapacityProvider: !Sub '${ClusterStackName}-ReplicaCapacityProvider'
+ Weight: 100
+ DesiredCount: !Ref ReplicaAutoScalingDesiredCapacity
TaskDefinition: !Ref GerritTaskDefinition
LoadBalancers:
- ContainerName: !Ref GerritServiceName
@@ -385,6 +408,26 @@
Labels:
gerrit-logs: !Join ['-', [!Ref EnvironmentName, !Ref GerritLogsVolume]]
+ ReplicaCPUAutoScaling:
+ Type: AWS::CloudFormation::Stack
+ Properties:
+ TemplateURL: !Join [ '', ['https://', !Ref TemplateBucketName, '.s3.amazonaws.com/cf-ecs-service-cpu-autoscaling.yml'] ]
+ TimeoutInMinutes: '5'
+ Parameters:
+ AutoScalingMinCapacity: !Ref ReplicaAutoScalingMinCapacity
+ AutoScalingDesiredCapacity: !Ref ReplicaAutoScalingDesiredCapacity
+ AutoScalingMaxCapacity: !Ref ReplicaAutoScalingMaxCapacity
+ AutoScalingScaleInCooldown: !Ref ReplicaAutoScalingScaleInCooldown
+ AutoScalingScaleOutCooldown: !Ref ReplicaAutoScalingScaleOutCooldown
+ AutoScalingTargetCPUPercentage: !Ref ReplicaAutoScalingTargetCPUPercentage
+ ResourceId:
+ !Join
+ - ''
+ - - 'service/'
+ - Fn::ImportValue: !Join [':', [!Ref 'ClusterStackName', 'ClusterName']]
+ - '/'
+ - !GetAtt GerritService.Name
+
LoadBalancer:
Type: AWS::ElasticLoadBalancingV2::LoadBalancer
Properties:
diff --git a/primary-replica/setup.env.template b/primary-replica/setup.env.template
index 5d2418c..7a3e5f7 100644
--- a/primary-replica/setup.env.template
+++ b/primary-replica/setup.env.template
@@ -48,7 +48,19 @@
SERVICE_GIT_GC_STACK_NAME=$(AWS_PREFIX)-scheduled-gc
GIT_GC_CRON_EXPRESSION="0 2 ? * SAT *"
GIT_GC_PROJECT_LIST="All-Users"
-
+
REPLICA_FILESYSTEM_ID=""
REPLICA_FILESYSTEM_THROUGHPUT_MODE="provisioned"
-REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS="256"
\ No newline at end of file
+REPLICA_FILESYSTEM_PROVISIONED_THROUGHPUT_IN_MIBPS="256"
+
+REPLICA_AUTOSCALING_MIN_CAPACITY=1
+REPLICA_AUTOSCALING_DESIRED_CAPACITY=1
+REPLICA_AUTOSCALING_MAX_CAPACITY=1
+
+REPLICA_AUTOSCALING_SCALE_IN_COOLDOWN=300
+REPLICA_AUTOSCALING_SCALE_OUT_COOLDOWN=300
+REPLICA_AUTOSCALING_TARGET_CPU_PERCENTAGE=75
+
+REPLICA_CAPACITY_PROVIDER_TARGET=50
+REPLICA_CAPACITY_PROVIDER_MIN_STEP_SIZE=1
+REPLICA_CAPACITY_PROVIDER_MAX_STEP_SIZE=2
\ No newline at end of file