From b10bc24c936e6019f0c9fdc9726816b5cf87c194 Mon Sep 17 00:00:00 2001 From: aksel-skaar-leirvaag Date: Sun, 13 Oct 2024 08:37:06 +0200 Subject: [PATCH] Added exponential backoff on reconciliation failure --- api/v1alpha2/terraform_types.go | 44 +++++++++- api/v1alpha2/terraform_types_test.go | 81 +++++++++++++++++++ api/v1alpha2/zz_generated.deepcopy.go | 5 ++ charts/tofu-controller/crds/crds.yaml | 17 ++++ .../infra.contrib.fluxcd.io_terraforms.yaml | 17 ++++ docs/References/terraform.md | 72 +++++++++++++++++ 6 files changed, 232 insertions(+), 4 deletions(-) create mode 100644 api/v1alpha2/terraform_types_test.go diff --git a/api/v1alpha2/terraform_types.go b/api/v1alpha2/terraform_types.go index 9998bda2b..6a1376d21 100644 --- a/api/v1alpha2/terraform_types.go +++ b/api/v1alpha2/terraform_types.go @@ -19,12 +19,12 @@ package v1alpha2 import ( "bytes" "fmt" + "math" "net" "strings" "time" "unicode/utf8" - "github.com/flux-iac/tofu-controller/api/planid" "github.com/fluxcd/pkg/apis/meta" sourcev1 "github.com/fluxcd/source-controller/api/v1" corev1 "k8s.io/api/core/v1" @@ -33,6 +33,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/serializer" + + "github.com/flux-iac/tofu-controller/api/planid" ) const ( @@ -145,6 +147,21 @@ type TerraformSpec struct { // +optional RetryInterval *metav1.Duration `json:"retryInterval,omitempty"` + // The strategy to use when retrying a previously failed reconciliation. + // The default strategy is StaticInterval and the retry interval is based on the RetryInterval value. + // The ExponentialBackoff strategy uses the formula: 2^reconciliationFailures * RetryInterval with a + // maximum requeue duration of MaxRetryInterval. 
+	// +kubebuilder:validation:Enum=StaticInterval;ExponentialBackoff
+	// +kubebuilder:default:string=StaticInterval
+	// +optional
+	RetryStrategy RetryStrategyEnum `json:"retryStrategy,omitempty"`
+
+	// The maximum requeue duration after a previously failed reconciliation.
+	// Only applicable when RetryStrategy is set to ExponentialBackoff.
+	// The default value is 24 hours when not specified.
+	// +optional
+	MaxRetryInterval *metav1.Duration `json:"maxRetryInterval,omitempty"`
+
 	// Path to the directory containing Terraform (.tf) files.
 	// Defaults to 'None', which translates to the root path of the SourceRef.
 	// +optional
@@ -521,6 +538,13 @@ const (
 	ForceUnlockEnumNo ForceUnlockEnum = "no"
 )
 
+type RetryStrategyEnum string
+
+const (
+	StaticInterval     RetryStrategyEnum = "StaticInterval"
+	ExponentialBackoff RetryStrategyEnum = "ExponentialBackoff"
+)
+
 const (
 	TerraformKind = "Terraform"
 	TerraformFinalizer = "finalizers.tf.contrib.fluxcd.io"
@@ -892,12 +916,34 @@ func (in Terraform) GetDependsOn() []meta.NamespacedObjectReference {
 
 // GetRetryInterval returns the retry interval
+//
+// The base interval is Spec.RetryInterval, or 15 seconds when that is unset.
+// Any strategy other than ExponentialBackoff returns the base interval as is.
+// ExponentialBackoff returns base * 2^Status.ReconciliationFailures, capped at
+// Spec.MaxRetryInterval, or at 24 hours when no maximum is set.
 func (in Terraform) GetRetryInterval() time.Duration {
+	retryInterval := 15 * time.Second
 	if in.Spec.RetryInterval != nil {
-		return in.Spec.RetryInterval.Duration
+		retryInterval = in.Spec.RetryInterval.Duration
+	}
+
+	if in.Spec.RetryStrategy != ExponentialBackoff {
+		return retryInterval
+	}
+
+	maxRetryInterval := 24 * time.Hour
+	if in.Spec.MaxRetryInterval != nil {
+		maxRetryInterval = in.Spec.MaxRetryInterval.Duration
 	}
 
-	// The default retry interval is 15 seconds.
-	return 15 * time.Second
+	// Scale in float64: a huge failure count saturates towards +Inf there, while
+	// the int64 multiplication would wrap around and slip past the cap.
+	backoff := float64(retryInterval) * math.Pow(2, float64(in.Status.ReconciliationFailures))
+	if backoff < float64(maxRetryInterval) {
+		return time.Duration(backoff)
+	}
+
+	// Reached at or above the cap, and for a NaN product (zero interval times +Inf).
+	return maxRetryInterval
 }
 
 // GetStatusConditions returns a pointer to the Status.Conditions slice.
diff --git a/api/v1alpha2/terraform_types_test.go b/api/v1alpha2/terraform_types_test.go
new file mode 100644
index 000000000..a293b9873
--- /dev/null
+++ b/api/v1alpha2/terraform_types_test.go
@@ -0,0 +1,84 @@
+package v1alpha2
+
+import (
+	"testing"
+	"time"
+
+	. "github.com/onsi/gomega"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// TestGetRetryInterval checks the interval returned by GetRetryInterval for the
+// default, static and exponential backoff configurations.
+func TestGetRetryInterval(t *testing.T) {
+	tests := []struct {
+		name                  string
+		terraform             Terraform
+		expectedRetryInterval time.Duration
+	}{
+		{
+			name: "default retry interval",
+			terraform: Terraform{
+				Spec: TerraformSpec{},
+			},
+			expectedRetryInterval: 15 * time.Second,
+		},
+		{
+			name: "custom retry interval",
+			terraform: Terraform{
+				Spec: TerraformSpec{
+					RetryInterval: &metav1.Duration{Duration: 30 * time.Second},
+				},
+			},
+			expectedRetryInterval: 30 * time.Second,
+		},
+		{
+			name: "exponential backoff with default retry interval",
+			terraform: Terraform{
+				Spec: TerraformSpec{
+					RetryStrategy: ExponentialBackoff,
+				},
+				Status: TerraformStatus{
+					ReconciliationFailures: 2,
+				},
+			},
+			expectedRetryInterval: 60 * time.Second,
+		},
+		{
+			name: "exponential backoff",
+			terraform: Terraform{
+				Spec: TerraformSpec{
+					RetryStrategy: ExponentialBackoff,
+					RetryInterval: &metav1.Duration{Duration: 60 * time.Second},
+				},
+				Status: TerraformStatus{
+					ReconciliationFailures: 4,
+				},
+			},
+			expectedRetryInterval: 960 * time.Second,
+		},
+		{
+			name: "exponential backoff with max retry interval",
+			terraform: Terraform{
+				Spec: TerraformSpec{
+					RetryStrategy:    ExponentialBackoff,
+					RetryInterval:    &metav1.Duration{Duration: 60 * time.Second},
+					MaxRetryInterval: &metav1.Duration{Duration: 45 * time.Second},
+				},
+				Status: TerraformStatus{
+					ReconciliationFailures: 4,
+				},
+			},
+			expectedRetryInterval: 45 * time.Second,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Bind Gomega to the subtest's t so a failed expectation is reported
+			// against, and only aborts, the case that produced it.
+			g := NewGomegaWithT(t)
+			g.Expect(tt.terraform.GetRetryInterval()).To(Equal(tt.expectedRetryInterval))
+		})
+	}
+}
diff --git 
a/api/v1alpha2/zz_generated.deepcopy.go b/api/v1alpha2/zz_generated.deepcopy.go index 680f75402..90e94d769 100644 --- a/api/v1alpha2/zz_generated.deepcopy.go +++ b/api/v1alpha2/zz_generated.deepcopy.go @@ -533,6 +533,11 @@ func (in *TerraformSpec) DeepCopyInto(out *TerraformSpec) { *out = new(v1.Duration) **out = **in } + if in.MaxRetryInterval != nil { + in, out := &in.MaxRetryInterval, &out.MaxRetryInterval + *out = new(v1.Duration) + **out = **in + } out.SourceRef = in.SourceRef if in.ReadInputsFromSecrets != nil { in, out := &in.ReadInputsFromSecrets, &out.ReadInputsFromSecrets diff --git a/charts/tofu-controller/crds/crds.yaml b/charts/tofu-controller/crds/crds.yaml index 2f6cd7245..03aee5479 100644 --- a/charts/tofu-controller/crds/crds.yaml +++ b/charts/tofu-controller/crds/crds.yaml @@ -5536,6 +5536,12 @@ spec: interval: description: The interval at which to reconcile the Terraform. type: string + maxRetryInterval: + description: |- + The maximum requeue duration after a previously failed reconciliation. + Only applicable when RetryStrategy is set to ExponentialBackoff. + The default value is 24 hours when not specified. + type: string parallelism: default: 0 description: Parallelism limits the number of concurrent operations @@ -5587,6 +5593,17 @@ spec: The interval at which to retry a previously failed reconciliation. The default value is 15 when not specified. type: string + retryStrategy: + default: StaticInterval + description: |- + The strategy to use when retrying a previously failed reconciliation. + The default strategy is StaticInterval and the retry interval is based on the RetryInterval value. + The ExponentialBackoff strategy uses the formula: 2^reconciliationFailures * RetryInterval with a + maximum requeue duration of MaxRetryInterval. 
+ enum: + - StaticInterval + - ExponentialBackoff + type: string runnerPodTemplate: properties: metadata: diff --git a/config/crd/bases/infra.contrib.fluxcd.io_terraforms.yaml b/config/crd/bases/infra.contrib.fluxcd.io_terraforms.yaml index 2f6cd7245..03aee5479 100644 --- a/config/crd/bases/infra.contrib.fluxcd.io_terraforms.yaml +++ b/config/crd/bases/infra.contrib.fluxcd.io_terraforms.yaml @@ -5536,6 +5536,12 @@ spec: interval: description: The interval at which to reconcile the Terraform. type: string + maxRetryInterval: + description: |- + The maximum requeue duration after a previously failed reconciliation. + Only applicable when RetryStrategy is set to ExponentialBackoff. + The default value is 24 hours when not specified. + type: string parallelism: default: 0 description: Parallelism limits the number of concurrent operations @@ -5587,6 +5593,17 @@ spec: The interval at which to retry a previously failed reconciliation. The default value is 15 when not specified. type: string + retryStrategy: + default: StaticInterval + description: |- + The strategy to use when retrying a previously failed reconciliation. + The default strategy is StaticInterval and the retry interval is based on the RetryInterval value. + The ExponentialBackoff strategy uses the formula: 2^reconciliationFailures * RetryInterval with a + maximum requeue duration of MaxRetryInterval. + enum: + - StaticInterval + - ExponentialBackoff + type: string runnerPodTemplate: properties: metadata: diff --git a/docs/References/terraform.md b/docs/References/terraform.md index b447fe30a..103660a70 100644 --- a/docs/References/terraform.md +++ b/docs/References/terraform.md @@ -794,6 +794,12 @@ string +

RetryStrategyEnum +(string alias)

+

+(Appears on: +TerraformSpec) +

RunnerPodMetadata

@@ -1568,6 +1574,39 @@ The default value is 15 when not specified.

+retryStrategy
+ + +RetryStrategyEnum + + + + +(Optional) +

The strategy to use when retrying a previously failed reconciliation. +The default strategy is StaticInterval and the retry interval is based on the RetryInterval value. +The ExponentialBackoff strategy will double the RetryInterval on each failed reconciliation +until the MaxRetryInterval is reached.

+ + + + +maxRetryInterval
+ + +Kubernetes meta/v1.Duration + + + + +(Optional) +

The maximum requeue duration after a previously failed reconciliation. +Only applicable when RetryStrategy is set to ExponentialBackoff. +The default value is 24 hours when not specified.

+ + + + path
string @@ -2131,6 +2170,39 @@ The default value is 15 when not specified.

+retryStrategy
+ + +RetryStrategyEnum + + + + +(Optional) +

The strategy to use when retrying a previously failed reconciliation. +The default strategy is StaticInterval and the retry interval is based on the RetryInterval value. +The ExponentialBackoff strategy will double the RetryInterval on each failed reconciliation +until the MaxRetryInterval is reached.

+ + + + +maxRetryInterval
+ + +Kubernetes meta/v1.Duration + + + + +(Optional) +

The maximum requeue duration after a previously failed reconciliation. +Only applicable when RetryStrategy is set to ExponentialBackoff. +The default value is 24 hours when not specified.

+ + + + path
string