Skip to content

Commit

Permalink
Don't pile up successive full refreshes during AWS scaledowns
Browse files Browse the repository at this point in the history
Force refreshing everything at every DeleteNodes calls causes slow down
and throttling on large clusters with many ASGs (and lot of activity).

That function might be called several times in a row during scale-down
(once for each ASG having a node to be removed). Each time the forced
refresh will re-discover all ASGs, all LaunchConfigurations, then re-list all
instances from discovered ASGs.

That immediate refresh isn't required anyway, as the cache's DeleteInstances
concrete implementation will decrement the nodegroup size, and we can
schedule a grouped refresh for the next loop iteration.
  • Loading branch information
bpineau committed Apr 19, 2021
1 parent 7761d70 commit 037dc73
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ func TestDeleteNodes(t *testing.T) {
err = asgs[0].DeleteNodes([]*apiv1.Node{node})
assert.NoError(t, err)
service.AssertNumberOfCalls(t, "TerminateInstanceInAutoScalingGroup", 1)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 2)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 1)

newSize, err := asgs[0].TargetSize()
assert.NoError(t, err)
Expand Down Expand Up @@ -516,7 +516,7 @@ func TestDeleteNodesWithPlaceholder(t *testing.T) {
err = asgs[0].DeleteNodes([]*apiv1.Node{node})
assert.NoError(t, err)
service.AssertNumberOfCalls(t, "SetDesiredCapacity", 1)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 2)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 1)

newSize, err := asgs[0].TargetSize()
assert.NoError(t, err)
Expand Down
5 changes: 3 additions & 2 deletions cluster-autoscaler/cloudprovider/aws/aws_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,8 +294,9 @@ func (m *AwsManager) DeleteInstances(instances []*AwsInstanceRef) error {
if err := m.asgCache.DeleteInstances(instances); err != nil {
return err
}
klog.V(2).Infof("Some ASG instances might have been deleted, forcing ASG list refresh")
return m.forceRefresh()
klog.V(2).Infof("DeleteInstances was called: scheduling an ASG list refresh for next main loop evaluation")
m.lastRefresh = time.Now().Add(-refreshInterval)
return nil
}

// GetAsgNodes returns Asg nodes.
Expand Down

0 comments on commit 037dc73

Please sign in to comment.