Skip to content
This repository has been archived by the owner on Jun 29, 2022. It is now read-only.

cert-rotator: Add retry to cluster upgrade #1513

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ run-e2e-tests:
# This test should be run at the end to reduce disruption to other tests, because
# it will delete a node.
# The timeout is made longer as it will perform actions that will take longer than the default 10 minutes.
KUBECONFIG=${kubeconfig} PLATFORM=${platform} go test -timeout 30m -mod=$(MOD) -tags="$(platform),disruptivee2e" -covermode=atomic -buildmode=exe -v -count=1 ./test/...
KUBECONFIG=${kubeconfig} PLATFORM=${platform} go test -timeout 90m -mod=$(MOD) -tags="$(platform),disruptivee2e" -covermode=atomic -buildmode=exe -v -count=1 ./test/...

.PHONY: all
all: build test
Expand Down
24 changes: 22 additions & 2 deletions cli/cmd/cluster/certificate-rotator.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,18 @@ import (

log "github.com/sirupsen/logrus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"

"github.com/kinvolk/lokomotive/pkg/k8sutil"
"github.com/kinvolk/lokomotive/pkg/platform"
)

// retryInterval is the polling interval passed to wait.PollImmediate when
// retrying the control plane upgrade in rotateControlPlaneCerts.
const retryInterval = 10 * time.Second

// retryTimeout is the overall timeout passed to wait.PollImmediate for the
// control plane upgrade retries in rotateControlPlaneCerts.
const retryTimeout = 30 * time.Minute

type certificateRotator struct {
clientSet *kubernetes.Clientset
newCACert string
Expand Down Expand Up @@ -99,8 +105,22 @@ func rotateControlPlaneCerts(contextLogger *log.Entry, cc clusterConfig) error {

contextLogger.Log(log.InfoLevel, "Applying a controlplane update with the new CA")

if err := c.upgradeControlPlane(contextLogger, kubeconfig); err != nil {
return fmt.Errorf("running controlplane upgrade: %v", err)
var upgradeErr error

err = wait.PollImmediate(retryInterval, retryTimeout, func() (bool, error) {
if upgradeErr = c.upgradeControlPlane(contextLogger, kubeconfig); upgradeErr != nil {
return false, nil
}

return true, nil
})

if upgradeErr != nil {
return fmt.Errorf("running controlplane upgrade: %w", upgradeErr)
}

if err != nil {
return fmt.Errorf("control plane did not upgrade after multiple retries: %w", err)
}

cs, err := k8sutil.NewClientset(kubeconfig)
Expand Down
58 changes: 55 additions & 3 deletions cli/cmd/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package cluster
import (
"encoding/base64"
"fmt"
"math"
"path/filepath"

"github.com/hashicorp/hcl/v2"
Expand All @@ -25,6 +26,7 @@ import (
"helm.sh/helm/v3/pkg/action"
"helm.sh/helm/v3/pkg/chart"
"helm.sh/helm/v3/pkg/chart/loader"
"helm.sh/helm/v3/pkg/release"
"helm.sh/helm/v3/pkg/storage/driver"
"sigs.k8s.io/yaml"

Expand Down Expand Up @@ -286,13 +288,63 @@ func (c controlplaneUpdater) upgradeComponent(component, namespace string) error

fmt.Printf("Ensuring controlplane component '%s' is up to date... ", component)

if _, err := update.Run(component, helmChart, values); err != nil {
updateComplete := false
counter := 0

var updateErr error

for !updateComplete && counter < 10 {
counter++

// Try to update.
if _, updateErr = update.Run(component, helmChart, values); updateErr == nil {
updateComplete = true
}

// Update failed for some reason, so roll it back.
fmt.Println("Failed!")
fmt.Printf("updating controlplane component: %v\n", updateErr)

// Get the entire history associated with this release.
histClient := action.NewHistory(actionConfig)
histMax := math.MaxInt32

histories, err := helm.GetHistory(histClient, component, histMax)
if err != nil && err != driver.ErrReleaseNotFound {
fmt.Printf("checking for chart history of failed update: %v\n", err)

continue
}

var history *release.Release

return fmt.Errorf("updating controlplane component: %w", err)
// Search for the last successful deploy from all the histories.
for _, history = range histories {
if history.Info.Status.IsPending() {
continue
}

// Found a non-pending history.
break
}

// TODO: Run the rollback in a loop. There is no point in retrying the update when the rollback has failed.
// Roll back to this history.
rollback := action.NewRollback(actionConfig)
rollback.Wait = true
rollback.Version = history.Version

if err := rollback.Run(component); err != nil {
fmt.Println("Failed!")
fmt.Printf("rolling back failed update: %v\n", err)

continue
}
}

fmt.Println("Done.")
if updateErr != nil {
return fmt.Errorf("updating controlplane component: %w", updateErr)
}

return nil
}
Expand Down