Skip to content

Commit

Permalink
Implement CL2_RATE_LIMIT_POD_CREATION that drops artificial rate limi…
Browse files Browse the repository at this point in the history
…ting
  • Loading branch information
mborsz committed Aug 24, 2022
1 parent 91d0966 commit fa62c29
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ type waitForControlledPodsRunningMeasurement struct {
kind string
selector *util.ObjectSelector
operationTimeout time.Duration
// exactOperationTimeout controls whether to use 2x timeout during scale down/deletion.
exactOperationTimeout bool
// countErrorMargin orders measurement to wait for number of pods to be in
// <desired count - countErrorMargin, desired count> range
// When using preemptibles on large scale, number of ready nodes is not stable
Expand Down Expand Up @@ -170,6 +172,13 @@ func (w *waitForControlledPodsRunningMeasurement) Execute(config *measurement.Co
if err != nil {
return nil, err
}
// exactOperationTimeout controls whether we should skip multiplying by two operationTimeout on scale down/deletion.
// Defaults to false for backward compatibility.
// TODO(mborsz): Change default to true and remove.
w.exactOperationTimeout, err = util.GetBoolOrDefault(config.Params, "exactOperationTimeout", false)
if err != nil {
return nil, err
}
w.countErrorMargin, err = util.GetIntOrDefault(config.Params, "countErrorMargin", 0)
if err != nil {
return nil, err
Expand Down Expand Up @@ -438,7 +447,7 @@ func (w *waitForControlledPodsRunningMeasurement) handleObjectLocked(oldObj, new
}

operationTimeout := w.operationTimeout
if isObjDeleted || isScalingDown {
if !w.exactOperationTimeout && (isObjDeleted || isScalingDown) {
// In case of deleting pods, twice as much time is required.
// The pod deletion throughput equals half of the pod creation throughput.
// NOTE: Starting from k8s 1.23 it's not true anymore, at least not in all cases.
Expand Down
19 changes: 19 additions & 0 deletions clusterloader2/testing/load/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
{{$PODS_PER_NODE := DefaultParam .PODS_PER_NODE 30}}
{{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}}
{{$DELETE_TEST_THROUGHPUT := DefaultParam .CL2_DELETE_TEST_THROUGHPUT $LOAD_TEST_THROUGHPUT}}
{{$RATE_LIMIT_POD_CREATION := DefaultParam .CL2_RATE_LIMIT_POD_CREATION true}}
{{$BIG_GROUP_SIZE := DefaultParam .BIG_GROUP_SIZE 250}}
{{$MEDIUM_GROUP_SIZE := DefaultParam .MEDIUM_GROUP_SIZE 30}}
{{$SMALL_GROUP_SIZE := DefaultParam .SMALL_GROUP_SIZE 5}}
Expand Down Expand Up @@ -168,7 +169,13 @@ steps:
params:
actionName: "create"
namespaces: {{$namespaces}}
# With rate limiting enabled (CL2_RATE_LIMIT_POD_CREATION, default true), creation is spread
# by the time-limited randomized tuning set and waits at most 15m. Otherwise objects are
# created at Global100qps and the timeout is the saturation time plus 900 seconds.
{{if $RATE_LIMIT_POD_CREATION}}
tuningSet: RandomizedSaturationTimeLimited
operationTimeout: 15m
{{else}}
tuningSet: Global100qps
operationTimeout: {{AddInt $saturationTime 900}}s
{{end}}
testMaxReplicaFactor: {{$RANDOM_SCALE_FACTOR}}
# We rely on the fact that daemonset is using the same image as the 'pod-startup-latency' module.
# The goal is to cache the image to all nodes before we start any latency pod,
Expand Down Expand Up @@ -277,7 +284,13 @@ steps:
params:
actionName: "scale and update"
namespaces: {{$namespaces}}
# With rate limiting enabled (CL2_RATE_LIMIT_POD_CREATION, default true), scaling is spread
# by the time-limited randomized tuning set and waits at most 15m. Otherwise operations run
# at Global100qps and the timeout is a quarter of the saturation time plus 900 seconds.
{{if $RATE_LIMIT_POD_CREATION}}
tuningSet: RandomizedScalingTimeLimited
operationTimeout: 15m
{{else}}
tuningSet: Global100qps
operationTimeout: {{AddInt (DivideInt $saturationTime 4) 900}}s
{{end}}
randomScaleFactor: {{$RANDOM_SCALE_FACTOR}}
testMaxReplicaFactor: {{$RANDOM_SCALE_FACTOR}}
daemonSetImage: {{$latencyPodImage}}
Expand Down Expand Up @@ -305,7 +318,13 @@ steps:
params:
actionName: "delete"
namespaces: {{$namespaces}}
# With rate limiting enabled (CL2_RATE_LIMIT_POD_CREATION, default true), deletion is spread
# by the time-limited randomized tuning set and waits at most 15m. Otherwise objects are
# deleted at Global100qps and the timeout is the deletion time plus 900 seconds.
{{if $RATE_LIMIT_POD_CREATION}}
tuningSet: RandomizedDeletionTimeLimited
operationTimeout: 15m
{{else}}
tuningSet: Global100qps
operationTimeout: {{AddInt $deletionTime 900}}s
{{end}}
testMaxReplicaFactor: {{$RANDOM_SCALE_FACTOR}}
daemonSetReplicas: 0
bigDeploymentSize: {{$BIG_GROUP_SIZE}}
Expand Down
10 changes: 6 additions & 4 deletions clusterloader2/testing/load/modules/reconcile-objects.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
{{$minReplicaFactor := SubtractFloat 1 $randomScaleFactor}}
{{$maxReplicaFactor := AddFloat 1 $randomScaleFactor}}
{{$testMaxReplicaFactor := AddFloat 1 .testMaxReplicaFactor}}
{{$operationTimeout := .operationTimeout}}

# DaemonSets
{{$daemonSetImage := DefaultParam .daemonSetImage "k8s.gcr.io/pause:3.0"}}
Expand Down Expand Up @@ -75,7 +76,8 @@ steps:
action: start
checkIfPodsAreUpdated: {{$CHECK_IF_PODS_ARE_UPDATED}}
labelSelector: group = load
operationTimeout: 15m
operationTimeout: {{$operationTimeout}}
exactOperationTimeout: true

- name: {{$actionName}}
phases:
Expand Down Expand Up @@ -212,7 +214,7 @@ steps:
min: 1
max: {{$namespaces}}
replicasPerNamespace: 0
tuningSet: RandomizedDeletionTimeLimited
tuningSet: {{$tuningSet}}
objectBundle:
{{range $ssIndex := Loop $pvSmallStatefulSetSize}}
- basename: pv-small-statefulset-{{$ssIndex}}
Expand All @@ -226,7 +228,7 @@ steps:
min: 1
max: {{$namespaces}}
replicasPerNamespace: 0
tuningSet: RandomizedDeletionTimeLimited
tuningSet: {{$tuningSet}}
objectBundle:
{{range $ssIndex := Loop $pvMediumStatefulSetSize}}
- basename: pv-medium-statefulset-{{$ssIndex}}
Expand Down Expand Up @@ -254,5 +256,5 @@ steps:
Params:
desiredPVCCount: 0
labelSelector: group = load
timeout: 15m
timeout: {{$operationTimeout}}
{{end}}

0 comments on commit fa62c29

Please sign in to comment.