Skip to content
2 changes: 1 addition & 1 deletion cli/cmd/lib_cluster_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ func getInstallClusterConfig(awsClient *aws.Client, clusterConfigFile string, di
return nil, err
}

err = clusterConfig.Validate(awsClient, false)
err = clusterConfig.Validate(awsClient)
if err != nil {
err = errors.Append(err, fmt.Sprintf("\n\ncluster configuration schema can be found at https://docs.cortex.dev/v/%s/", consts.CortexVersionMinor))
return nil, errors.Wrap(err, clusterConfigFile)
Expand Down
22 changes: 22 additions & 0 deletions pkg/lib/aws/ec2.go
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,28 @@ func (c *Client) DescribeVpcs() ([]ec2.Vpc, error) {
return vpcs, nil
}

// DescribeSecurityGroups calls the paginated EC2 DescribeSecurityGroups API
// with an empty input and returns every non-nil security group found across
// the pages. An error from the paginated call is returned wrapped with
// errors.WithStack, alongside a nil slice.
func (c *Client) DescribeSecurityGroups() ([]ec2.SecurityGroup, error) {
	var groups []ec2.SecurityGroup

	// collectPage is invoked once per page; returning false stops the pagination.
	collectPage := func(page *ec2.DescribeSecurityGroupsOutput, lastPage bool) bool {
		if page == nil {
			return false
		}
		for _, group := range page.SecurityGroups {
			// skip nil entries rather than dereferencing them
			if group != nil {
				groups = append(groups, *group)
			}
		}
		return true
	}

	if err := c.EC2().DescribeSecurityGroupsPages(&ec2.DescribeSecurityGroupsInput{}, collectPage); err != nil {
		return nil, errors.WithStack(err)
	}

	return groups, nil
}

func (c *Client) ListVolumes(tags ...ec2.Tag) ([]ec2.Volume, error) {
var volumes []ec2.Volume
err := c.EC2().DescribeVolumesPages(&ec2.DescribeVolumesInput{}, func(output *ec2.DescribeVolumesOutput, lastPage bool) bool {
Expand Down
18 changes: 18 additions & 0 deletions pkg/lib/aws/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ const (
ErrEIPLimitExceeded = "aws.eip_limit_exceeded"
ErrInternetGatewayLimitExceeded = "aws.internet_gateway_limit_exceeded"
ErrVPCLimitExceeded = "aws.vpc_limit_exceeded"
ErrSecurityGroupRulesExceeded = "aws.security_group_rules_exceeded"
ErrSecurityGroupLimitExceeded = "aws.security_group_limit_exceeded"
)

func IsAWSError(err error) bool {
Expand Down Expand Up @@ -232,3 +234,19 @@ func ErrorVPCLimitExceeded(currentLimit, additionalQuotaRequired int, region str
Message: fmt.Sprintf("VPC limit of %d exceeded in region %s; remove some of the existing VPCs or increase your quota for VPCs by at least %d here: %s (if your request was recently approved, please allow ~30 minutes for AWS to reflect this change)", currentLimit, region, additionalQuotaRequired, url),
})
}

// ErrorSecurityGroupRulesExceeded returns an error of kind
// ErrSecurityGroupRulesExceeded. The message reports the current per-security-group
// rules limit, the region, how much additional quota is needed, and a link to
// the VPC service quotas console page.
func ErrorSecurityGroupRulesExceeded(currentLimit, additionalQuotaRequired int, region string) error {
	const quotasURL = "https://console.aws.amazon.com/servicequotas/home?#!/services/vpc/quotas"

	message := fmt.Sprintf(
		"security group rules limit of %d exceeded in region %s; use fewer availability zones, remove some node groups from your cluster config, reduce the number of CIDR white lists (if you have any), or increase your quota for inbound/outbound rules per security group by at least %d here: %s (if your request was recently approved, please allow ~30 minutes for AWS to reflect this change)",
		currentLimit, region, additionalQuotaRequired, quotasURL,
	)

	return errors.WithStack(&errors.Error{
		Kind:    ErrSecurityGroupRulesExceeded,
		Message: message,
	})
}

// ErrorSecurityGroupLimitExceeded returns an error of kind
// ErrSecurityGroupLimitExceeded. The message reports the current security group
// limit, the region, how much additional quota is needed, and a link to the
// VPC service quotas console page.
func ErrorSecurityGroupLimitExceeded(currentLimit, additionalQuotaRequired int, region string) error {
	const quotasURL = "https://console.aws.amazon.com/servicequotas/home?#!/services/vpc/quotas"

	message := fmt.Sprintf(
		"security group limit of %d exceeded in region %s; remove some node groups from your cluster config or increase your quota for security groups by at least %d here: %s (if your request was recently approved, please allow ~30 minutes for AWS to reflect this change)",
		currentLimit, region, additionalQuotaRequired, quotasURL,
	)

	return errors.WithStack(&errors.Error{
		Kind:    ErrSecurityGroupLimitExceeded,
		Message: message,
	})
}
82 changes: 73 additions & 9 deletions pkg/lib/aws/servicequotas.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,18 @@ var _standardInstanceCategories = strset.New("a", "c", "d", "h", "i", "m", "r",
var _knownInstanceCategories = strset.Union(_standardInstanceCategories, strset.New("p", "g", "inf", "x", "f"))

// Service Quotas codes used as keys of the quota map built in
// VerifyNetworkQuotas, followed by the fixed counts used to estimate how many
// security groups and security group rules a cluster needs.
const (
_elasticIPsQuotaCode = "L-0263D0A3"
_internetGatewayQuotaCode = "L-A4707A72"
_natGatewayQuotaCode = "L-FE5A380F"
_vpcQuotaCode = "L-F678F1CE"
_elasticIPsQuotaCode = "L-0263D0A3"
_internetGatewayQuotaCode = "L-A4707A72"
_natGatewayQuotaCode = "L-FE5A380F"
_vpcQuotaCode = "L-F678F1CE"
// quota code for the number of security groups
_securityGroupsQuotaCode = "L-E79EC296"
// quota code for the number of rules per security group
_securityGroupRulesQuotaCode = "L-0EA8095F"

// inbound rules counted for a node group security group regardless of the
// number of availability zones or the CIDR whitelist length
_baseInboundRulesForNodeGroup = 11
// additional inbound rules counted for each availability zone
_inboundRulesPerAZ = 8
// ClusterSharedNodeSecurityGroup, ControlPlaneSecurityGroup, eks-cluster-sg-<cluster-name>, and operator security group
// (one more security group is counted on top of these for every node group)
_baseNumberOfSecurityGroups = 4
)

type InstanceTypeRequests struct {
Expand Down Expand Up @@ -145,12 +153,21 @@ func (c *Client) VerifyInstanceQuota(instances []InstanceTypeRequests) error {
return nil
}

func (c *Client) VerifyNetworkQuotas(requiredInternetGateways int, natGatewayRequired bool, highlyAvailableNATGateway bool, requiredVPCs int, availabilityZones strset.Set) error {
func (c *Client) VerifyNetworkQuotas(
requiredInternetGateways int,
natGatewayRequired bool,
highlyAvailableNATGateway bool,
requiredVPCs int,
availabilityZones strset.Set,
numNodeGroups int,
longestCIDRWhiteList int) error {
quotaCodeToValueMap := map[string]int{
_elasticIPsQuotaCode: 0, // elastic IP quota code
_internetGatewayQuotaCode: 0, // internet gw quota code
_natGatewayQuotaCode: 0, // nat gw quota code
_vpcQuotaCode: 0, // vpc quota code
_elasticIPsQuotaCode: 0, // elastic IP quota code
_internetGatewayQuotaCode: 0, // internet gw quota code
_natGatewayQuotaCode: 0, // nat gw quota code
_vpcQuotaCode: 0, // vpc quota code
_securityGroupsQuotaCode: 0, // security groups quota code
_securityGroupRulesQuotaCode: 0, // security group rules quota code
}

err := c.ServiceQuotas().ListServiceQuotasPages(
Expand Down Expand Up @@ -285,5 +302,52 @@ func (c *Client) VerifyNetworkQuotas(requiredInternetGateways int, natGatewayReq
}
}

// check rules quota for nodegroup SGs
requiredRulesForSG := requiredRulesForNodeGroupSecurityGroup(len(availabilityZones), longestCIDRWhiteList)
if requiredRulesForSG > quotaCodeToValueMap[_securityGroupRulesQuotaCode] {
additionalQuotaRequired := requiredRulesForSG - quotaCodeToValueMap[_securityGroupRulesQuotaCode]
return ErrorSecurityGroupRulesExceeded(quotaCodeToValueMap[_securityGroupRulesQuotaCode], additionalQuotaRequired, c.Region)
}

// check rules quota for control plane SG
requiredRulesForCPSG := requiredRulesForControlPlaneSecurityGroup(numNodeGroups)
if requiredRulesForCPSG > quotaCodeToValueMap[_securityGroupRulesQuotaCode] {
additionalQuotaRequired := requiredRulesForCPSG - quotaCodeToValueMap[_securityGroupRulesQuotaCode]
return ErrorSecurityGroupRulesExceeded(quotaCodeToValueMap[_securityGroupRulesQuotaCode], additionalQuotaRequired, c.Region)
}

// check security groups quota
requiredSecurityGroups := requiredSecurityGroups(numNodeGroups)
sgs, err := c.DescribeSecurityGroups()
if err != nil {
return err
}
if quotaCodeToValueMap[_securityGroupsQuotaCode]-len(sgs)-requiredSecurityGroups < 0 {
additionalQuotaRequired := len(sgs) + requiredSecurityGroups - quotaCodeToValueMap[_securityGroupsQuotaCode]
return ErrorSecurityGroupLimitExceeded(quotaCodeToValueMap[_securityGroupsQuotaCode], additionalQuotaRequired, c.Region)

}

return nil
}

// requiredRulesForNodeGroupSecurityGroup estimates the number of inbound rules
// needed on a node group security group: a fixed base, a fixed amount per
// availability zone, and the rules derived from the CIDR whitelist length.
func requiredRulesForNodeGroupSecurityGroup(numAZs, whitelistLength int) int {
	total := _baseInboundRulesForNodeGroup + numAZs*_inboundRulesPerAZ

	// the first whitelist entry counts as one rule and every additional entry
	// counts as five; an empty (or negative) length contributes nothing
	if whitelistLength >= 1 {
		total += 1 + 5*(whitelistLength-1)
	}

	return total
}

// requiredRulesForControlPlaneSecurityGroup estimates the number of outbound
// rules needed on the control plane security group for the given number of
// node groups.
func requiredRulesForControlPlaneSecurityGroup(numNodeGroups int) int {
	// the operator node group is counted in addition to the configured node groups
	groupsIncludingOperator := numNodeGroups + 1

	// two outbound rules per node group; there are half as many inbound rules,
	// so the outbound count is the limiting factor
	return groupsIncludingOperator * 2
}

// requiredSecurityGroups returns the number of security groups a cluster with
// the given number of node groups is expected to need.
func requiredSecurityGroups(numNodeGroups int) int {
	// start from the fixed set of security groups, then add one per node group
	total := _baseNumberOfSecurityGroups
	total += numNodeGroups
	return total
}
31 changes: 14 additions & 17 deletions pkg/types/clusterconfig/cluster_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -781,7 +781,7 @@ func (cc *CoreConfig) SQSNamePrefix() string {
}

// this validates the user-provided cluster config
func (cc *Config) Validate(awsClient *aws.Client, skipQuotaVerification bool) error {
func (cc *Config) Validate(awsClient *aws.Client) error {
fmt.Print("verifying your configuration ...\n\n")

numNodeGroups := len(cc.NodeGroups)
Expand Down Expand Up @@ -817,12 +817,10 @@ func (cc *Config) Validate(awsClient *aws.Client, skipQuotaVerification bool) er
})
}

if !skipQuotaVerification {
if err := awsClient.VerifyInstanceQuota(instances); err != nil {
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
if !aws.IsAWSError(err) {
return errors.Wrap(err, NodeGroupsKey)
}
if err := awsClient.VerifyInstanceQuota(instances); err != nil {
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
if !aws.IsAWSError(err) {
return errors.Wrap(err, NodeGroupsKey)
}
}

Expand Down Expand Up @@ -909,16 +907,15 @@ func (cc *Config) Validate(awsClient *aws.Client, skipQuotaVerification bool) er
}
}

if !skipQuotaVerification {
var requiredVPCs int
if len(cc.Subnets) == 0 {
requiredVPCs = 1
}
if err := awsClient.VerifyNetworkQuotas(1, cc.NATGateway != NoneNATGateway, cc.NATGateway == HighlyAvailableNATGateway, requiredVPCs, strset.FromSlice(cc.AvailabilityZones)); err != nil {
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
if !aws.IsAWSError(err) {
return err
}
var requiredVPCs int
if len(cc.Subnets) == 0 {
requiredVPCs = 1
}
longestCIDRWhiteList := libmath.MaxInt(len(cc.APILoadBalancerCIDRWhiteList), len(cc.OperatorLoadBalancerCIDRWhiteList))
if err := awsClient.VerifyNetworkQuotas(1, cc.NATGateway != NoneNATGateway, cc.NATGateway == HighlyAvailableNATGateway, requiredVPCs, strset.FromSlice(cc.AvailabilityZones), len(cc.NodeGroups), longestCIDRWhiteList); err != nil {
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
if !aws.IsAWSError(err) {
return err
}
}

Expand Down