From f2c22fa024350340aaa838a219622a9fdbbd4b47 Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Wed, 22 Nov 2023 19:16:08 +0530 Subject: [PATCH 01/22] preliminary changes --- pkg/util/provider/machinecontroller/controller.go | 11 +++++------ pkg/util/provider/machinecontroller/machine.go | 14 +++++++------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pkg/util/provider/machinecontroller/controller.go b/pkg/util/provider/machinecontroller/controller.go index a9ff3c941..863d65961 100644 --- a/pkg/util/provider/machinecontroller/controller.go +++ b/pkg/util/provider/machinecontroller/controller.go @@ -135,7 +135,6 @@ func NewController( controller.pvcLister = pvcInformer.Lister() controller.pvLister = pvInformer.Lister() controller.secretLister = secretInformer.Lister() - // TODO: Need to handle K8s versions below 1.13 differently controller.volumeAttachementLister = volumeAttachmentInformer.Lister() controller.machineClassLister = machineClassInformer.Lister() controller.nodeLister = nodeInformer.Lister() @@ -158,7 +157,7 @@ func NewController( controller.pdbV1beta1Synced = pdbV1beta1Informer.Informer().HasSynced } - // Secret Controller Informers + // Secret Controller's Informers secretInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: controller.secretAdd, DeleteFunc: controller.secretDelete, @@ -170,7 +169,7 @@ func NewController( DeleteFunc: controller.machineClassToSecretDelete, }) - // Machine Class Controller Informers + // Machine Class Controller's Informers machineInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: controller.machineToMachineClassAdd, UpdateFunc: controller.machineToMachineClassUpdate, @@ -183,7 +182,7 @@ func NewController( DeleteFunc: controller.machineClassDelete, }) - // Machine Controller Informers + // Machine Controller's Informers nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: controller.addNodeToMachine, UpdateFunc: controller.updateNodeToMachine, @@ -196,7 +195,7 @@ func NewController( DeleteFunc: controller.deleteMachine, }) - // MachineSafety Controller Informers + // MachineSafety Controller's Informers // We follow the kubernetes way of reconciling the safety controller // done by adding empty key objects. We initialize it, to trigger // running of different safety loop on MCM startup. @@ -209,7 +208,7 @@ func NewController( DeleteFunc: controller.deleteMachineToSafety, }) - // Drain Controller Informers + // Drain Controller's Informers if k8sutils.IsResourceSupported( targetCoreClient, schema.GroupResource{ diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index dbd95b480..5d6268afe 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -61,26 +61,26 @@ func (c *controller) deleteMachine(obj interface{}) { c.enqueueMachine(obj) } -// isToBeEnqueued returns true if the key is to be managed by this controller -func (c *controller) isToBeEnqueued(obj interface{}) (bool, string) { +// getKeyForObj returns key for object, else returns false +func (c *controller) getKeyForObj(obj interface{}) (string, bool) { key, err := cache.MetaNamespaceKeyFunc(obj) if err != nil { klog.Errorf("Couldn't get key for object %+v: %v", obj, err) - return false, "" + return "", false } - - return true, key + return key, true } + func (c *controller) enqueueMachine(obj interface{}) { - if toBeEnqueued, key := c.isToBeEnqueued(obj); toBeEnqueued { + if key, ok := c.getKeyForObj(obj); ok { klog.V(5).Infof("Adding machine object to the queue %q", key) c.machineQueue.Add(key) } } func (c *controller) enqueueMachineAfter(obj interface{}, after time.Duration) { - if toBeEnqueued, key := c.isToBeEnqueued(obj); toBeEnqueued { + if key, ok := c.getKeyForObj(obj); ok { klog.V(5).Infof("Adding machine object to the queue %q after %s", key, after) c.machineQueue.AddAfter(key, after) } From b5feb2d15a25e9a2d12e8fb170d398c4bf6b8f6b Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Wed, 22 Nov 2023 19:29:35 +0530 Subject: [PATCH 02/22] reconcile machineClass only on addition/deletion of machines --- .../provider/machinecontroller/controller.go | 1 - .../machinecontroller/machineclass.go | 30 ------------------- 2 files changed, 31 deletions(-) diff --git a/pkg/util/provider/machinecontroller/controller.go b/pkg/util/provider/machinecontroller/controller.go index 863d65961..e4b458371 100644 --- a/pkg/util/provider/machinecontroller/controller.go +++ b/pkg/util/provider/machinecontroller/controller.go @@ -172,7 +172,6 @@ func NewController( // Machine Class Controller's Informers machineInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: controller.machineToMachineClassAdd, - UpdateFunc: controller.machineToMachineClassUpdate, DeleteFunc: controller.machineToMachineClassDelete, }) diff --git a/pkg/util/provider/machinecontroller/machineclass.go b/pkg/util/provider/machinecontroller/machineclass.go index 6ba849975..f7c43e1a2 100644 --- a/pkg/util/provider/machinecontroller/machineclass.go +++ b/pkg/util/provider/machinecontroller/machineclass.go @@ -45,36 +45,6 @@ func (c *controller) machineToMachineClassAdd(obj interface{}) { } } -func (c *controller) machineToMachineClassUpdate(oldObj, newObj interface{}) { - oldMachine, ok := oldObj.(*v1alpha1.Machine) - if oldMachine == nil || !ok { - klog.Warningf("Couldn't get machine from object: %+v", oldObj) - return - } - newMachine, ok := newObj.(*v1alpha1.Machine) - if newMachine == nil || !ok { - klog.Warningf("Couldn't get machine from object: %+v", newObj) - return - } - - if oldMachine.Spec.Class.Kind == newMachine.Spec.Class.Kind { - if newMachine.Spec.Class.Kind == machineutils.MachineClassKind { - // Both old and new machine refer to the same machineClass object - // And the correct kind so enqueuing only one of them. - c.machineClassQueue.Add(newMachine.Spec.Class.Name) - } - } else { - // If both are pointing to different machineClasses - // we might have to enqueue both. - if oldMachine.Spec.Class.Kind == machineutils.MachineClassKind { - c.machineClassQueue.Add(oldMachine.Spec.Class.Name) - } - if newMachine.Spec.Class.Kind == machineutils.MachineClassKind { - c.machineClassQueue.Add(newMachine.Spec.Class.Name) - } - } -} - func (c *controller) machineToMachineClassDelete(obj interface{}) { c.machineToMachineClassAdd(obj) } From f8bd6aba1a70ac1ad85d56939c45b0fe02c6a0b9 Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Wed, 22 Nov 2023 19:37:03 +0530 Subject: [PATCH 03/22] removed double check in secret reconciler --- pkg/util/provider/machinecontroller/secret.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pkg/util/provider/machinecontroller/secret.go b/pkg/util/provider/machinecontroller/secret.go index b620ae0a4..7e279a809 100644 --- a/pkg/util/provider/machinecontroller/secret.go +++ b/pkg/util/provider/machinecontroller/secret.go @@ -80,10 +80,6 @@ func (c *controller) reconcileClusterSecret(ctx context.Context, secret *corev1. return err } } else { - if finalizers := sets.NewString(secret.Finalizers...); !finalizers.Has(MCFinalizerName) { - // Finalizer doesn't exist, simply return nil - return nil - } err = c.deleteSecretFinalizers(ctx, secret) if err != nil { return err From db98422e3b45e4696b30a3131eb01df8fe989cda Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Thu, 23 Nov 2023 00:48:04 +0530 Subject: [PATCH 04/22] small refactoring --- pkg/apis/machine/v1alpha1/machine_types.go | 20 ++++++------ .../machinecontroller/machineclass.go | 12 ++++--- .../machinecontroller/machineclass_util.go | 31 ------------------- 3 files changed, 18 insertions(+), 45 deletions(-) diff --git a/pkg/apis/machine/v1alpha1/machine_types.go b/pkg/apis/machine/v1alpha1/machine_types.go index b5920e570..00f87aaf1 100644 --- a/pkg/apis/machine/v1alpha1/machine_types.go +++ b/pkg/apis/machine/v1alpha1/machine_types.go @@ -136,10 +136,10 @@ type LastOperation struct { Type MachineOperationType `json:"type,omitempty"` } -// MachinePhase is a label for the condition of a machines at the current time. +// MachinePhase is a label for the condition of a machine at the current time. type MachinePhase string -// These are the valid statuses of machines. +// These are the valid phases of machines. const ( // MachinePending means that the machine is being created MachinePending MachinePhase = "Pending" @@ -156,25 +156,25 @@ const ( // MachineUnknown indicates that the node is not ready at the movement MachineUnknown MachinePhase = "Unknown" - // MachineFailed means operation failed leading to machine status failure + // MachineFailed means operation timeod out MachineFailed MachinePhase = "Failed" - // MachineCrashLoopBackOff means creation or deletion of the machine is failing. It means that machine object is present but there is no corresponding VM. + // MachineCrashLoopBackOff means creation machine is failing. It means that machine object is present but there is no corresponding VM. MachineCrashLoopBackOff MachinePhase = "CrashLoopBackOff" ) -// MachineState is a current state of the machine. +// MachineState is a current state of the operation. type MachineState string -// These are the valid statuses of machines. +// These are the valid states of the operation performed on the machine. const ( - // MachineStatePending means there are operations pending on this machine state + // MachineStateProcessing means operation is not yet complete MachineStateProcessing MachineState = "Processing" - // MachineStateFailed means operation failed leading to machine status failure + // MachineStateFailed means operation failed MachineStateFailed MachineState = "Failed" - // MachineStateSuccessful indicates that the node is not ready at the moment + // MachineStateSuccessful means operation completed successfully MachineStateSuccessful MachineState = "Successful" ) @@ -189,7 +189,7 @@ const ( // MachineOperationUpdate indicates that the operation was an update MachineOperationUpdate MachineOperationType = "Update" - // MachineOperationHealthCheck indicates that the operation was a create + // MachineOperationHealthCheck indicates that the operation was a health check of node object MachineOperationHealthCheck MachineOperationType = "HealthCheck" // MachineOperationDelete indicates that the operation was a delete diff --git a/pkg/util/provider/machinecontroller/machineclass.go b/pkg/util/provider/machinecontroller/machineclass.go index f7c43e1a2..13e1dc031 100644 --- a/pkg/util/provider/machinecontroller/machineclass.go +++ b/pkg/util/provider/machinecontroller/machineclass.go @@ -171,16 +171,16 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1 func (c *controller) addMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass) error { finalizers := sets.NewString(class.Finalizers...) finalizers.Insert(MCMFinalizerName) - return c.updateMachineClassFinalizers(ctx, class, finalizers.List()) + return c.updateMachineClassFinalizers(ctx, class, finalizers.List(),true) } func (c *controller) deleteMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass) error { finalizers := sets.NewString(class.Finalizers...) finalizers.Delete(MCMFinalizerName) - return c.updateMachineClassFinalizers(ctx, class, finalizers.List()) + return c.updateMachineClassFinalizers(ctx, class, finalizers.List(),false) } -func (c *controller) updateMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass, finalizers []string) error { +func (c *controller) updateMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass, finalizers []string, addFinalizers bool) error { // Get the latest version of the class so that we can avoid conflicts class, err := c.controlMachineClient.MachineClasses(class.Namespace).Get(ctx, class.Name, metav1.GetOptions{}) if err != nil { @@ -194,7 +194,11 @@ func (c *controller) updateMachineClassFinalizers(ctx context.Context, class *v1 klog.Warning("Updating machineClass failed, retrying. ", class.Name, err) return err } - klog.V(3).Infof("Successfully added/removed finalizer on the machineclass %q", class.Name) + if addFinalizers{ + klog.V(3).Infof("Successfully added finalizer on the machineclass %q", class.Name) + }else{ + klog.V(3).Infof("Successfully removed finalizer on the machineclass %q", class.Name) + } return err } diff --git a/pkg/util/provider/machinecontroller/machineclass_util.go b/pkg/util/provider/machinecontroller/machineclass_util.go index fdd15ea15..7d2250faf 100644 --- a/pkg/util/provider/machinecontroller/machineclass_util.go +++ b/pkg/util/provider/machinecontroller/machineclass_util.go @@ -22,37 +22,6 @@ import ( "k8s.io/apimachinery/pkg/labels" ) -/* -TODO: Move this code to MCM/MachineSet controller as well? -func (c *controller) findMachineDeploymentsForClass(kind, name string) ([]*v1alpha1.MachineDeployment, error) { - machineDeployments, err := c.machineDeploymentLister.List(labels.Everything()) - if err != nil { - return nil, err - } - var filtered []*v1alpha1.MachineDeployment - for _, machineDeployment := range machineDeployments { - if machineDeployment.Spec.Template.Spec.Class.Kind == kind && machineDeployment.Spec.Template.Spec.Class.Name == name { - filtered = append(filtered, machineDeployment) - } - } - return filtered, nil -} - -func (c *controller) findMachineSetsForClass(kind, name string) ([]*v1alpha1.MachineSet, error) { - machineSets, err := c.machineSetLister.List(labels.Everything()) - if err != nil { - return nil, err - } - var filtered []*v1alpha1.MachineSet - for _, machineSet := range machineSets { - if machineSet.Spec.Template.Spec.Class.Kind == kind && machineSet.Spec.Template.Spec.Class.Name == name { - filtered = append(filtered, machineSet) - } - } - return filtered, nil -} -*/ - func (c *controller) findMachinesForClass(kind, name string) ([]*v1alpha1.Machine, error) { machines, err := c.machineLister.List(labels.Everything()) if err != nil { From 5bdd6bc1efa1c3a9ec7144a751afaa3ecfe0f43a Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Thu, 23 Nov 2023 00:49:59 +0530 Subject: [PATCH 05/22] introduced machine conditions diff logging --- .../machinecontroller/machine_util.go | 111 +++++++++--------- 1 file changed, 56 insertions(+), 55 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index a9b75e750..7a8f4300b 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -60,6 +60,7 @@ import ( // emptyMap is a dummy emptyMap to compare with var emptyMap = make(map[string]string) +var errSuccessfulPhaseUpdate = errors.New("machine creation is successful. Machine Phase/Conditions have been UPDATED") const ( maxReplacements = 1 @@ -68,43 +69,6 @@ const ( cacheUpdateTimeout = 1 * time.Second ) -// TODO: use client library instead when it starts to support update retries -// -// see https://github.com/kubernetes/kubernetes/issues/21479 -// type updateMachineFunc func(machine *v1alpha1.Machine) error - -/* -// UpdateMachineWithRetries updates a machine with given applyUpdate function. Note that machine not found error is ignored. -// The returned bool value can be used to tell if the machine is actually updated. -func UpdateMachineWithRetries(machineClient v1alpha1client.MachineInterface, machineLister v1alpha1listers.MachineLister, namespace, name string, applyUpdate updateMachineFunc) (*v1alpha1.Machine, error) { - var machine *v1alpha1.Machine - - retryErr := retry.RetryOnConflict(retry.DefaultBackoff, func() error { - var err error - machine, err = machineLister.Machines(namespace).Get(name) - if err != nil { - return err - } - machine = machine.DeepCopy() - // Apply the update, then attempt to push it to the apiserver. - if applyErr := applyUpdate(machine); applyErr != nil { - return applyErr - } - machine, err = machineClient.Update(machine) - return err - }) - - // Ignore the precondition violated error, this machine is already updated - // with the desired label. - if retryErr == errorsutil.ErrPreconditionViolated { - klog.V(4).Infof("Machine %s precondition doesn't hold, skip updating it.", name) - retryErr = nil - } - - return machine, retryErr -} -*/ - // ValidateMachineClass validates the machine class. func (c *controller) ValidateMachineClass(_ context.Context, classSpec *v1alpha1.ClassSpec) (*v1alpha1.MachineClass, map[string][]byte, machineutils.RetryPeriod, error) { var ( @@ -219,19 +183,38 @@ func (c *controller) getSecret(ref *v1.SecretReference, MachineClassName string) return secretRef, err } -// nodeConditionsHaveChanged compares two node statuses to see if any of the statuses have changed -func nodeConditionsHaveChanged(machineConditions []v1.NodeCondition, nodeConditions []v1.NodeCondition) bool { - if len(machineConditions) != len(nodeConditions) { - return true +// nodeConditionsHaveChanged compares two node status.conditions to see if any of the statuses have changed +func nodeConditionsHaveChanged(oldConditions []v1.NodeCondition, newConditions []v1.NodeCondition) ([]v1.NodeCondition, []v1.NodeCondition, bool) { + var ( + oldConditionsByType = make(map[v1.NodeConditionType]v1.NodeCondition, len(oldConditions)) + newConditionsByType = make(map[v1.NodeConditionType]v1.NodeCondition, len(newConditions)) + addedOrUpdatedConditions = make([]v1.NodeCondition, 0, len(newConditions)) + removedConditions = make([]v1.NodeCondition, 0, len(oldConditions)) + ) + + for _, c := range oldConditions { + oldConditionsByType[c.Type] = c + } + for _, c := range newConditions { + newConditionsByType[c.Type] = c + } + + // checking for any added/updated new condition + for _, c := range newConditions { + oldC, exists := oldConditionsByType[c.Type] + if !exists || (oldC.Status != c.Status) { + addedOrUpdatedConditions = append(addedOrUpdatedConditions, c) + } } - for i := range nodeConditions { - if nodeConditions[i].Status != machineConditions[i].Status { - return true + // checking for any deleted condition + for _, c := range oldConditions { + if _, exists := newConditionsByType[c.Type]; !exists { + removedConditions = append(removedConditions, c) } } - return false + return addedOrUpdatedConditions, removedConditions, len(addedOrUpdatedConditions) != 0 || len(removedConditions) != 0 } func mergeDataMaps(in map[string][]byte, maps ...map[string][]byte) map[string][]byte { @@ -601,14 +584,13 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph // and corresponding node object went missing // and if machine object still reports healthy description = fmt.Sprintf( - "Node object went missing. Machine %s is unhealthy - changing MachineState to Unknown", + "Node object went missing. Machine %s is unhealthy - changing MachinePhase to Unknown", machine.Name, ) klog.Warning(description) clone.Status.CurrentStatus = v1alpha1.CurrentStatus{ - Phase: v1alpha1.MachineUnknown, - // TimeoutActive: true, + Phase: v1alpha1.MachineUnknown, LastUpdateTime: metav1.Now(), } clone.Status.LastOperation = v1alpha1.LastOperation{ @@ -625,9 +607,11 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph return machineutils.ShortRetry, err } } else { - if nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) { + populatedConditions, removedConditions, isChanged := nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) + if isChanged { clone.Status.Conditions = node.Status.Conditions - klog.V(3).Infof("Conditions of Machine %q with providerID %q and backing node %q are changing", machine.Name, getProviderID(machine), getNodeName(machine)) + + klog.V(3).Infof("Conditions of Machine %q with providerID %q and backing node %q are changing.\nAdded/Updated Conditions:\n\n%s\nRemoved Conditions:\n\n%s", machine.Name, getProviderID(machine), getNodeName(machine), getFormattedNodeConditions(populatedConditions), getFormattedNodeConditions(removedConditions)) cloneDirty = true } @@ -756,12 +740,12 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph if cloneDirty { _, err = c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(ctx, clone, metav1.UpdateOptions{}) if err != nil { - // Keep retrying until update goes through - klog.Errorf("Update failed for machine %q. Retrying, error: %q", machine.Name, err) + // Keep retrying across reconciles until update goes through + klog.Errorf("Update of Phase/Conditions failed for machine %q. Retrying, error: %q", machine.Name, err) } else { - klog.V(2).Infof("Machine State has been updated for %q with providerID %q and backing node %q", machine.Name, getProviderID(machine), getNodeName(machine)) - // Return error for continuing in next iteration - err = fmt.Errorf("machine creation is successful. Machine State has been UPDATED") + klog.V(2).Infof("Machine Phase/Conditions have been updated for %q with providerID %q and backing node %q", machine.Name, getProviderID(machine), getNodeName(machine)) + // Return error to end the reconcile + err = errSuccessfulPhaseUpdate } return machineutils.ShortRetry, err @@ -770,6 +754,23 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph return machineutils.LongRetry, nil } +func NodeConditionsHaveChanged(oldConditions []v1.NodeCondition, newConditions []v1.NodeCondition) ([]v1.NodeCondition, []v1.NodeCondition, bool) { + return nodeConditionsHaveChanged(oldConditions, newConditions) +} + +func GetFormattedNodeConditions(conditions []v1.NodeCondition) string { + return getFormattedNodeConditions(conditions) +} + +func getFormattedNodeConditions(conditions []v1.NodeCondition) string { + var result string + for _, c := range conditions { + result = fmt.Sprintf("%sType: %s | Status: %s | Reason: %s | Message: %s\n", result, c.Type, c.Status, c.Reason, c.Message) + } + + return result +} + /* SECTION Manipulate Finalizers From e1849e4eca6e1a25fbe278e86aad2bf0d6bf0d98 Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Thu, 23 Nov 2023 10:08:45 +0530 Subject: [PATCH 06/22] removed old in-tree related code --- pkg/util/provider/machineutils/utils.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go index 8bc546a0f..8e906f588 100644 --- a/pkg/util/provider/machineutils/utils.go +++ b/pkg/util/provider/machineutils/utils.go @@ -54,9 +54,6 @@ const ( // MachineClassKind is used to identify the machineClassKind for generic machineClasses MachineClassKind = "MachineClass" - // MigratedMachineClass annotation helps in identifying machineClasses who have been migrated by migration controller - MigratedMachineClass = "machine.sapcloud.io/migrated" - // NotManagedByMCM annotation helps in identifying the nodes which are not handled by MCM NotManagedByMCM = "node.machine.sapcloud.io/not-managed-by-mcm" From b50ff02b3547f1aefe5b594af5f7d94002737530 Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Thu, 23 Nov 2023 10:09:30 +0530 Subject: [PATCH 07/22] adapted unit tests ot new const errSuccessfulPhaseUpdate --- .../machinecontroller/machine_util_test.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine_util_test.go b/pkg/util/provider/machinecontroller/machine_util_test.go index cadd5b550..ea8a82731 100644 --- a/pkg/util/provider/machinecontroller/machine_util_test.go +++ b/pkg/util/provider/machinecontroller/machine_util_test.go @@ -2177,7 +2177,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine State has been UPDATED"), + err: errSuccessfulPhaseUpdate, expectedPhase: v1alpha1.MachineFailed, }, }), @@ -2196,7 +2196,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine State has been UPDATED"), + err: errSuccessfulPhaseUpdate, expectedPhase: v1alpha1.MachineRunning, }, }), @@ -2215,7 +2215,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine State has been UPDATED"), + err: errSuccessfulPhaseUpdate, expectedPhase: v1alpha1.MachineUnknown, }, }), @@ -2234,7 +2234,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine State has been UPDATED"), + err: errSuccessfulPhaseUpdate, expectedPhase: v1alpha1.MachineUnknown, }, }), @@ -2250,7 +2250,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine State has been UPDATED"), + err: errSuccessfulPhaseUpdate, expectedPhase: v1alpha1.MachineUnknown, }, }), @@ -2363,7 +2363,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine State has been UPDATED"), + err: errors.New("machine creation is successful. Machine Phase or conditions have been UPDATED"), expectedPhase: v1alpha1.MachineRunning, }, }), @@ -2390,7 +2390,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine State has been UPDATED"), + err: errors.New("machine creation is successful. Machine Phase or conditions have been UPDATED"), expectedPhase: v1alpha1.MachineRunning, }, }), @@ -2424,7 +2424,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine State has been UPDATED"), + err: errors.New("machine creation is successful. Machine Phase or conditions have been UPDATED"), expectedPhase: v1alpha1.MachineRunning, }, }), From 0c81be07d5b87815f882d06665d99fb0a8b05fcc Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Fri, 1 Dec 2023 17:36:44 +0530 Subject: [PATCH 08/22] added new retry period ConflictRetry --- pkg/util/provider/machineutils/utils.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go index 8e906f588..332349206 100644 --- a/pkg/util/provider/machineutils/utils.go +++ b/pkg/util/provider/machineutils/utils.go @@ -79,8 +79,10 @@ type RetryPeriod time.Duration // These are the valid values for RetryPeriod const ( + // ConflictRetry tells the controller to retry quickly - 200 milliseconds + ConflictRetry RetryPeriod = RetryPeriod(200 * time.Millisecond) // ShortRetry tells the controller to retry after a short duration - 15 seconds - ShortRetry RetryPeriod = RetryPeriod(15 * time.Second) + ShortRetry RetryPeriod = RetryPeriod(5 * time.Second) // MediumRetry tells the controller to retry after a medium duration - 2 minutes MediumRetry RetryPeriod = RetryPeriod(3 * time.Minute) // LongRetry tells the controller to retry after a long duration - 10 minutes From 61ceccd495056e8fc885207610f5f4b6f7652277 Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Fri, 1 Dec 2023 17:37:41 +0530 Subject: [PATCH 09/22] updated log level for better readability --- cmd/machine-controller-manager/app/controllermanager.go | 8 ++++---- pkg/util/provider/drain/drain.go | 2 +- pkg/util/provider/drain/volume_attachment.go | 4 ++-- pkg/util/provider/machinecontroller/secret.go | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cmd/machine-controller-manager/app/controllermanager.go b/cmd/machine-controller-manager/app/controllermanager.go index 2f9aec1f3..3128acd74 100644 --- a/cmd/machine-controller-manager/app/controllermanager.go +++ b/cmd/machine-controller-manager/app/controllermanager.go @@ -208,7 +208,7 @@ func StartControllers(s *options.MCMServer, recorder record.EventRecorder, stop <-chan struct{}) error { - klog.V(5).Info("Getting available resources") + klog.V(4).Info("Getting available resources") availableResources, err := getAvailableResources(controlCoreClientBuilder) if err != nil { return err @@ -229,7 +229,7 @@ func StartControllers(s *options.MCMServer, } if availableResources[machineGVR] || availableResources[machineSetGVR] || availableResources[machineDeploymentGVR] { - klog.V(5).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod) + klog.V(4).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod) controlMachineInformerFactory := machineinformers.NewFilteredSharedInformerFactory( controlMachineClientBuilder.ClientOrDie("control-machine-shared-informers"), @@ -253,7 +253,7 @@ func StartControllers(s *options.MCMServer, // All shared informers are v1alpha1 API level machineSharedInformers := controlMachineInformerFactory.Machine().V1alpha1() - klog.V(5).Infof("Creating controllers...") + klog.V(4).Infof("Creating controllers...") mcmController, err := mcmcontroller.NewController( s.Namespace, controlMachineClient, @@ -276,7 +276,7 @@ func StartControllers(s *options.MCMServer, controlCoreInformerFactory.Start(stop) targetCoreInformerFactory.Start(stop) - klog.V(5).Info("Running controller") + klog.V(4).Info("Running controller") go mcmController.Run(int(s.ConcurrentNodeSyncs), stop) } else { diff --git a/pkg/util/provider/drain/drain.go b/pkg/util/provider/drain/drain.go index 0b5a9f6e4..8b607056e 100644 --- a/pkg/util/provider/drain/drain.go +++ b/pkg/util/provider/drain/drain.go @@ -934,7 +934,7 @@ func (o *Options) waitForReattach(ctx context.Context, podVolumeInfo PodVolumeIn case incomingEvent := <-volumeAttachmentEventCh: persistentVolumeName := *incomingEvent.Spec.Source.PersistentVolumeName - klog.V(5).Infof("VolumeAttachment event received for PV: %s", persistentVolumeName) + klog.V(4).Infof("VolumeAttachment event received for PV: %s", persistentVolumeName) // Checking if event for an PV that is being waited on if _, present := pvsWaitingForReattachments[persistentVolumeName]; present { diff --git a/pkg/util/provider/drain/volume_attachment.go b/pkg/util/provider/drain/volume_attachment.go index f3dcc24ec..5c35afa3a 100644 --- a/pkg/util/provider/drain/volume_attachment.go +++ b/pkg/util/provider/drain/volume_attachment.go @@ -70,13 +70,13 @@ func (v *VolumeAttachmentHandler) dispatch(obj interface{}) { // AddVolumeAttachment is the event handler for VolumeAttachment add func (v *VolumeAttachmentHandler) AddVolumeAttachment(obj interface{}) { - klog.V(5).Infof("Adding volume attachment object") + klog.V(4).Infof("Adding volume attachment object") v.dispatch(obj) } // UpdateVolumeAttachment is the event handler for VolumeAttachment update func (v *VolumeAttachmentHandler) UpdateVolumeAttachment(oldObj, newObj interface{}) { - klog.V(5).Info("Updating volume attachment object") + klog.V(4).Info("Updating volume attachment object") v.dispatch(newObj) } diff --git a/pkg/util/provider/machinecontroller/secret.go b/pkg/util/provider/machinecontroller/secret.go index 7e279a809..6e072fb54 100644 --- a/pkg/util/provider/machinecontroller/secret.go +++ b/pkg/util/provider/machinecontroller/secret.go @@ -61,10 +61,10 @@ func (c *controller) reconcileClusterSecretKey(key string) error { func (c *controller) reconcileClusterSecret(ctx context.Context, secret *corev1.Secret) error { startTime := time.Now() - klog.V(5).Infof("Start syncing %q", secret.Name) + klog.V(4).Infof("Start syncing %q", secret.Name) defer func() { c.enqueueSecretAfter(secret, 10*time.Minute) - klog.V(5).Infof("Finished syncing %q (%v)", secret.Name, time.Since(startTime)) + klog.V(4).Infof("Finished syncing %q (%v)", secret.Name, time.Since(startTime)) }() // Check if machineClasses are referring to this secret From aeb28f7377cf859ad1876fc04e2d80ba19f9e88d Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Fri, 1 Dec 2023 17:37:56 +0530 Subject: [PATCH 10/22] some more updated log levels --- pkg/util/provider/app/app.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/util/provider/app/app.go b/pkg/util/provider/app/app.go index 3b095fce8..c1129b147 100644 --- a/pkg/util/provider/app/app.go +++ b/pkg/util/provider/app/app.go @@ -209,7 +209,7 @@ func StartControllers(s *options.MCServer, recorder record.EventRecorder, stop <-chan struct{}) error { - klog.V(5).Info("Getting available resources") + klog.V(4).Info("Getting available resources") availableResources, err := getAvailableResources(controlCoreClientBuilder) if err != nil { return err @@ -239,7 +239,7 @@ func StartControllers(s *options.MCServer, } if availableResources[machineGVR] { - klog.V(5).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod) + klog.V(4).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod) controlMachineInformerFactory := machineinformers.NewFilteredSharedInformerFactory( controlMachineClientBuilder.ClientOrDie("control-machine-shared-informers"), @@ -274,7 +274,7 @@ func StartControllers(s *options.MCServer, // All shared informers are v1alpha1 API level machineSharedInformers := controlMachineInformerFactory.Machine().V1alpha1() - klog.V(5).Infof("Creating controllers...") + klog.V(4).Infof("Creating controllers...") machineController, err := machinecontroller.NewController( s.Namespace, controlMachineClient, @@ -305,7 +305,7 @@ func StartControllers(s *options.MCServer, controlCoreInformerFactory.Start(stop) targetCoreInformerFactory.Start(stop) - klog.V(5).Info("Running controller") + klog.V(4).Info("Running controller") go machineController.Run(int(s.ConcurrentNodeSyncs), stop) } else { From a672b570d9643424e4b361caadf5af1f0e3fa7ec Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Fri, 1 Dec 2023 17:38:28 +0530 Subject: [PATCH 11/22] adapted test cases --- pkg/util/provider/machinecontroller/machine_test.go | 2 +- .../provider/machinecontroller/machine_util_test.go | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 98707bc5d..23ab69061 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -2794,7 +2794,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf("Machine deletion in process. Deletion of node object was succesful"), + err: fmt.Errorf("Machine deletion in process. Deletion of node object was successful"), retry: machineutils.ShortRetry, nodeDeleted: true, machine: newMachine( diff --git a/pkg/util/provider/machinecontroller/machine_util_test.go b/pkg/util/provider/machinecontroller/machine_util_test.go index ea8a82731..8626c0f88 100644 --- a/pkg/util/provider/machinecontroller/machine_util_test.go +++ b/pkg/util/provider/machinecontroller/machine_util_test.go @@ -196,7 +196,7 @@ var _ = Describe("machine_util", func() { }, }, }, - err: fmt.Errorf("Machine ALTs have been reconciled"), + err: errSuccessfulALTsync, }, }), @@ -276,7 +276,7 @@ var _ = Describe("machine_util", func() { }, }, }, - err: fmt.Errorf("Machine ALTs have been reconciled"), + err: errSuccessfulALTsync, }, }), @@ -422,7 +422,7 @@ var _ = Describe("machine_util", func() { }, }, }, - err: fmt.Errorf("Machine ALTs have been reconciled"), + err: errSuccessfulALTsync, }, }), ) @@ -2363,7 +2363,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine Phase or conditions have been UPDATED"), + err: errors.New("machine creation is successful. Machine Phase/Conditions have been UPDATED"), expectedPhase: v1alpha1.MachineRunning, }, }), @@ -2390,7 +2390,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine Phase or conditions have been UPDATED"), + err: errors.New("machine creation is successful. Machine Phase/Conditions have been UPDATED"), expectedPhase: v1alpha1.MachineRunning, }, }), @@ -2424,7 +2424,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine Phase or conditions have been UPDATED"), + err: errors.New("machine creation is successful. Machine Phase/Conditions have been UPDATED"), expectedPhase: v1alpha1.MachineRunning, }, }), From a653be5383e46fc83ea80897fcbec44fe3177cbf Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Fri, 1 Dec 2023 17:39:45 +0530 Subject: [PATCH 12/22] no machine reconcile on status update --- .../provider/machinecontroller/machine.go | 128 +++++++++--------- 1 file changed, 61 insertions(+), 67 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 5d6268afe..8fbc82710 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -47,18 +47,28 @@ SECTION Machine controller - Machine add, update, delete watches */ func (c *controller) addMachine(obj interface{}) { - klog.V(5).Infof("Adding machine object") - c.enqueueMachine(obj) + c.enqueueMachine(obj, "handling machine obj ADD event") } func (c *controller) updateMachine(oldObj, newObj interface{}) { - klog.V(5).Info("Updating machine object") - c.enqueueMachine(newObj) + oldMachine := oldObj.(*v1alpha1.Machine) + newMachine := newObj.(*v1alpha1.Machine) + + if oldMachine == nil || newMachine == nil { + klog.Errorf("Couldn't convert to machine resource from object") + return + } + + if oldMachine.ResourceVersion != newMachine.ResourceVersion && oldMachine.Generation == newMachine.Generation { + klog.V(3).Infof("Skipping non-spec updates for machine %s", oldMachine.Name) + return + } + + c.enqueueMachine(newObj, "handling machine object UPDATE event") } func (c *controller) deleteMachine(obj interface{}) { - klog.V(5).Info("Deleting machine object") - c.enqueueMachine(obj) + c.enqueueMachine(obj, "handling machine object DELETE event") } // getKeyForObj returns key for object, else returns false @@ -71,17 +81,16 @@ func (c *controller) getKeyForObj(obj interface{}) (string, bool) { return key, true } - -func (c *controller) enqueueMachine(obj interface{}) { +func (c *controller) enqueueMachine(obj interface{}, reason string) { if key, ok := c.getKeyForObj(obj); ok { - klog.V(5).Infof("Adding machine object to the queue %q", key) + klog.V(3).Infof("Adding machine object to queue %q, reason: %s", key, reason) c.machineQueue.Add(key) } } -func (c *controller) enqueueMachineAfter(obj interface{}, after time.Duration) { +func (c *controller) enqueueMachineAfter(obj interface{}, after time.Duration, reason string) { if key, ok := c.getKeyForObj(obj); ok { - klog.V(5).Infof("Adding machine object to the queue %q after %s", key, after) + klog.V(3).Infof("Adding machine object to queue %q after %s, reason: %s", key, after, reason) c.machineQueue.AddAfter(key, after) } } @@ -105,16 +114,19 @@ func (c *controller) reconcileClusterMachineKey(key string) error { } retryPeriod, err := c.reconcileClusterMachine(ctx, machine) - klog.V(5).Info(err, retryPeriod) - c.enqueueMachineAfter(machine, time.Duration(retryPeriod)) + var reEnqueReason = "periodic reconcile" + if err != nil { + reEnqueReason = err.Error() + } + c.enqueueMachineAfter(machine, time.Duration(retryPeriod), reEnqueReason) return nil } func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alpha1.Machine) (machineutils.RetryPeriod, error) { - klog.V(5).Infof("Start Reconciling machine: %q , nodeName: %q ,providerID: %q", machine.Name, getNodeName(machine), getProviderID(machine)) - defer klog.V(5).Infof("Stop Reconciling machine %q, nodeName: %q ,providerID: %q", machine.Name, getNodeName(machine), getProviderID(machine)) + klog.V(2).Infof("reconcileClusterMachine: Start for %q with phase:%q, description:%q", machine.Name, machine.Status.CurrentStatus.Phase, machine.Status.LastOperation.Description) + defer klog.V(2).Infof("reconcileClusterMachine: Stop for %q", machine.Name) if c.safetyOptions.MachineControllerFrozen && machine.DeletionTimestamp == nil { // If Machine controller is frozen and @@ -132,7 +144,7 @@ func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alp validationerr := validation.ValidateMachine(internalMachine) if validationerr.ToAggregate() != nil && len(validationerr.ToAggregate().Errors()) > 0 { - err := fmt.Errorf("Validation of Machine failed %s", validationerr.ToAggregate().Error()) + err := fmt.Errorf("validation of Machine failed %s", validationerr.ToAggregate().Error()) klog.Error(err) return machineutils.LongRetry, err } @@ -140,7 +152,7 @@ func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alp // Validate MachineClass machineClass, secretData, retry, err := c.ValidateMachineClass(ctx, &machine.Spec.Class) if err != nil { - klog.Error(err) + klog.Errorf("Cannot reconcile machine %s: %s", machine.Name, err) return retry, err } @@ -193,8 +205,8 @@ SECTION Machine controller - nodeToMachine */ var ( - errMultipleMachineMatch = errors.New("Multiple machines matching node") - errNoMachineMatch = errors.New("No machines matching node found") + errMultipleMachineMatch = errors.New("multiple machines matching node") + errNoMachineMatch = errors.New("no machines matching node found") ) func (c *controller) addNodeToMachine(obj interface{}) { @@ -226,9 +238,12 @@ func (c *controller) addNodeToMachine(obj interface{}) { return } - if machine.Status.CurrentStatus.Phase != v1alpha1.MachineCrashLoopBackOff && nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) { - klog.V(5).Infof("Enqueue machine object %q as conditions of backing node %q have changed", machine.Name, getNodeName(machine)) - c.enqueueMachine(machine) + isMachineCrashLooping := machine.Status.CurrentStatus.Phase == v1alpha1.MachineCrashLoopBackOff + isMachineTerminating := machine.Status.CurrentStatus.Phase == v1alpha1.MachineTerminating + _, _, nodeConditionsHaveChanged := nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) + + if !isMachineCrashLooping && !isMachineTerminating && nodeConditionsHaveChanged { + c.enqueueMachine(machine, fmt.Sprintf("handling node UPDATE event. conditions of backing node %q have changed", getNodeName(machine))) } } @@ -242,7 +257,6 @@ func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) { // check for the TriggerDeletionByMCM annotation on the node object // if it is present then mark the machine object for deletion if value, ok := node.Annotations[machineutils.TriggerDeletionByMCM]; ok && value == "true" { - machine, err := c.getMachineFromNode(node.Name) if err != nil { klog.Errorf("Couldn't fetch machine %s, Error: %s", machine.Name, err) @@ -250,7 +264,7 @@ func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) { } if machine.DeletionTimestamp == nil { - klog.Infof("Node %s is annotated to trigger deletion by MCM.", node.Name) + klog.Infof("Node %s for machine %s is annotated to trigger deletion by MCM.", node.Name, machine.Name) if err := c.controlMachineClient.Machines(c.namespace).Delete(context.Background(), machine.Name, metav1.DeleteOptions{}); err != nil { klog.Errorf("Machine object %s backing the node %s could not be marked for deletion.", machine.Name, node.Name) return @@ -275,7 +289,10 @@ func (c *controller) deleteNodeToMachine(obj interface{}) { return } - c.enqueueMachine(machine) + // donot respond if machine obj is already in termination flow + if machine.DeletionTimestamp == nil { + c.enqueueMachine(machine, fmt.Sprintf("backing node obj %q got deleted", key)) + } } /* @@ -304,7 +321,7 @@ func (c *controller) getMachineFromNode(nodeName string) (*v1alpha1.Machine, err /* SECTION - Machine operations - Create, Update, Delete + Machine operations - Create, Delete */ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineRequest *driver.CreateMachineRequest) (machineutils.RetryPeriod, error) { @@ -397,7 +414,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque } // machine obj marked Failed for double security - c.machineStatusUpdate( + updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -412,6 +429,11 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque }, machine.Status.LastKnownState, ) + if apierrors.IsConflict(updateErr) { + return machineutils.ConflictRetry, err + } else if updateErr != nil { + return machineutils.ShortRetry, updateErr + } klog.V(2).Infof("Machine %q marked Failed as VM was referring to a stale node object", machine.Name) return machineutils.ShortRetry, err @@ -424,7 +446,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque case codes.Unknown, codes.DeadlineExceeded, codes.Aborted, codes.Unavailable: // GetMachineStatus() returned with one of the above error codes. // Retry operation. - c.machineStatusUpdate( + updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -439,11 +461,16 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque }, machine.Status.LastKnownState, ) + if apierrors.IsConflict(updateErr) { + return machineutils.ConflictRetry, err + } else if updateErr != nil { + return machineutils.ShortRetry, updateErr + } return machineutils.ShortRetry, err default: - c.machineStatusUpdate( + updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -458,6 +485,11 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque }, machine.Status.LastKnownState, ) + if apierrors.IsConflict(updateErr) { + return machineutils.ConflictRetry, err + } else if updateErr != nil { + return machineutils.MediumRetry, updateErr + } return machineutils.MediumRetry, err } @@ -528,44 +560,6 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque return machineutils.LongRetry, nil } -func (c *controller) triggerUpdationFlow(ctx context.Context, machine *v1alpha1.Machine, actualProviderID string) (machineutils.RetryPeriod, error) { - klog.V(2).Infof("Setting ProviderID of machine %s with backing node %s to %s", machine.Name, getNodeName(machine), actualProviderID) - - for { - machine, err := c.controlMachineClient.Machines(machine.Namespace).Get(ctx, machine.Name, metav1.GetOptions{}) - if err != nil { - klog.Warningf("Machine GET failed. Retrying, error: %s", err) - continue - } - - clone := machine.DeepCopy() - clone.Spec.ProviderID = actualProviderID - machine, err = c.controlMachineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) - if err != nil { - klog.Warningf("Machine UPDATE failed. Retrying, error: %s", err) - continue - } - - clone = machine.DeepCopy() - lastOperation := v1alpha1.LastOperation{ - Description: "Updated provider ID", - State: v1alpha1.MachineStateSuccessful, - Type: v1alpha1.MachineOperationUpdate, - LastUpdateTime: metav1.Now(), - } - clone.Status.LastOperation = lastOperation - _, err = c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(ctx, clone, metav1.UpdateOptions{}) - if err != nil { - klog.Warningf("Machine/status UPDATE failed. Retrying, error: %s", err) - continue - } - // Update went through, exit out of infinite loop - break - } - - return machineutils.LongRetry, nil -} - func (c *controller) triggerDeletionFlow(ctx context.Context, deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.RetryPeriod, error) { var ( machine = deleteMachineRequest.Machine From 0c0aaafd6f8f19ea4cc632296536b5c242d58d5d Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Fri, 1 Dec 2023 17:40:38 +0530 Subject: [PATCH 13/22] updated log messages and used ConflictRetry --- .../machinecontroller/machine_safety.go | 5 +- .../machinecontroller/machine_util.go | 101 +++++++++++++----- .../machinecontroller/machineclass.go | 16 +-- 3 files changed, 87 insertions(+), 35 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine_safety.go b/pkg/util/provider/machinecontroller/machine_safety.go index 23ce2e132..8c02bdcb7 100644 --- a/pkg/util/provider/machinecontroller/machine_safety.go +++ b/pkg/util/provider/machinecontroller/machine_safety.go @@ -107,8 +107,9 @@ func (c *controller) reconcileClusterMachineSafetyAPIServer(key string) error { klog.V(2).Infof("SafetyController: Reinitializing machine health check for machine: %q with backing node: %q and providerID: %q", machine.Name, getNodeName(machine), getProviderID(machine)) } - // En-queue after 30 seconds, to ensure all machine states are reconciled - c.enqueueMachineAfter(machine, 30*time.Second) + // En-queue after 30 seconds, to ensure all machine phases are reconciled to actual as all are + // reseted to `Running` currently + c.enqueueMachineAfter(machine, 30*time.Second, "kube-api-servers are up again, so reconcile of machine phase is needed") } c.safetyOptions.MachineControllerFrozen = false diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 7a8f4300b..a00c15f8e 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -60,7 +60,10 @@ import ( // emptyMap is a dummy emptyMap to compare with var emptyMap = make(map[string]string) -var errSuccessfulPhaseUpdate = errors.New("machine creation is successful. Machine Phase/Conditions have been UPDATED") +var ( + errSuccessfulALTsync = errors.New("machine ALTs have been reconciled") + errSuccessfulPhaseUpdate = errors.New("machine creation is successful. Machine Phase/Conditions have been UPDATED") +) const ( maxReplacements = 1 @@ -101,7 +104,6 @@ func (c *controller) ValidateMachineClass(_ context.Context, classSpec *v1alpha1 errMessage := fmt.Sprintf("The machine class %s has no finalizers set. So not reconciling the machine.", machineClass.Name) err := errors.New(errMessage) - klog.Warning(errMessage) return nil, nil, machineutils.ShortRetry, err } @@ -297,10 +299,13 @@ func (c *controller) syncMachineNodeTemplates(ctx context.Context, machine *v1al // Keep retrying until update goes through klog.Errorf("Updated failed for node object of machine %q. Retrying, error: %q", machine.Name, err) } else { - // Return error even when machine object is updated - err = fmt.Errorf("Machine ALTs have been reconciled") + // Return error to continue in next reconcile + err = errSuccessfulALTsync } + if apierrors.IsConflict(err) { + return machineutils.ConflictRetry, err + } return machineutils.ShortRetry, err } @@ -477,7 +482,7 @@ func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1a lastKnownState = createMachineResponse.LastKnownState } - c.machineStatusUpdate( + updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -493,8 +498,13 @@ func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1a }, lastKnownState, ) + if apierrors.IsConflict(updateErr) { + return machineutils.ConflictRetry, err + } else if updateErr != nil { + return retryRequired, updateErr + } - return retryRequired, nil + return retryRequired, err } func (c *controller) machineStatusUpdate( @@ -557,7 +567,7 @@ func (c *controller) getCreateFailurePhase(machine *v1alpha1.Machine) v1alpha1.M if timeOut > 0 { // Machine creation timeout occured while joining of machine // Machine set controller would replace this machine with a new one as phase is failed. - klog.V(2).Infof("Machine %q , providerID %q and backing node %q couldn't join in creation timeout of %s. Changing phase to failed.", machine.Name, getProviderID(machine), getNodeName(machine), timeOutDuration) + klog.V(2).Infof("Machine %q , providerID %q and backing node %q couldn't join in creation timeout of %s. Changing phase to Failed.", machine.Name, getProviderID(machine), getNodeName(machine), timeOutDuration) return v1alpha1.MachineFailed } @@ -611,7 +621,7 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph if isChanged { clone.Status.Conditions = node.Status.Conditions - klog.V(3).Infof("Conditions of Machine %q with providerID %q and backing node %q are changing.\nAdded/Updated Conditions:\n\n%s\nRemoved Conditions:\n\n%s", machine.Name, getProviderID(machine), getNodeName(machine), getFormattedNodeConditions(populatedConditions), getFormattedNodeConditions(removedConditions)) + klog.V(3).Infof("Conditions of node %q backing machine %q with providerID %q have changed.\nAdded/Updated Conditions:\n\n%s\nRemoved Conditions:\n\n%s\n", getNodeName(machine), machine.Name, getProviderID(machine), getFormattedNodeConditions(populatedConditions), getFormattedNodeConditions(removedConditions)) cloneDirty = true } @@ -633,7 +643,7 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph description = fmt.Sprintf("Machine %s successfully re-joined the cluster", clone.Name) lastOperationType = v1alpha1.MachineOperationHealthCheck } - klog.V(2).Info(description) + klog.V(2).Infof("%s with backing node %q and providerID %q", description, getNodeName(clone), getProviderID(clone)) // Machine is ready and has joined/re-joined the cluster clone.Status.LastOperation = v1alpha1.LastOperation{ @@ -651,9 +661,9 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph } } else { if clone.Status.CurrentStatus.Phase == v1alpha1.MachineRunning { - // If machine is not healthy, and current state is running, - // change the machinePhase to unknown and activate health check timeout - description = fmt.Sprintf("Machine %s is unhealthy - changing MachineState to Unknown. Node conditions: %+v", clone.Name, clone.Status.Conditions) + // If machine is not healthy, and current phase is Running, + // change the machinePhase to Unknown and activate health check timeout + description = fmt.Sprintf("Machine %s is unhealthy - changing MachinePhase to Unknown. Node conditions: %+v", clone.Name, clone.Status.Conditions) klog.Warning(description) clone.Status.CurrentStatus = v1alpha1.CurrentStatus{ @@ -719,7 +729,7 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph } else { // Timeout occurred due to machine being unhealthy for too long description = fmt.Sprintf( - "Machine %s is not healthy since %s minutes. Changing status to failed. Node Conditions: %+v", + "Machine %s health checks failing since last %s minutes. Updating machine phase to Failed. Node Conditions: %+v", machine.Name, timeOutDuration, machine.Status.Conditions, @@ -733,7 +743,8 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph } else { // If timeout has not occurred, re-enqueue the machine // after a specified sleep time - c.enqueueMachineAfter(machine, sleepTime) + klog.V(4).Infof("Creation/Health Timeout hasn't occured yet , will re-enqueue after %s", time.Duration(sleepTime)) + c.enqueueMachineAfter(machine, sleepTime, "re-check for creation/health timeout") } } @@ -742,8 +753,11 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph if err != nil { // Keep retrying across reconciles until update goes through klog.Errorf("Update of Phase/Conditions failed for machine %q. Retrying, error: %q", machine.Name, err) + if apierrors.IsConflict(err) { + return machineutils.ConflictRetry, err + } } else { - klog.V(2).Infof("Machine Phase/Conditions have been updated for %q with providerID %q and backing node %q", machine.Name, getProviderID(machine), getNodeName(machine)) + klog.V(2).Infof("Machine Phase/Conditions have been updated for %q with providerID %q and are in sync with backing node %q", machine.Name, getProviderID(machine), getNodeName(machine)) // Return error to end the reconcile err = errSuccessfulPhaseUpdate } @@ -764,10 +778,13 @@ func GetFormattedNodeConditions(conditions []v1.NodeCondition) string { func getFormattedNodeConditions(conditions []v1.NodeCondition) string { var result string + if len(conditions) == 0 { + return "\n" + } + for _, c := range conditions { result = fmt.Sprintf("%sType: %s | Status: %s | Reason: %s | Message: %s\n", result, c.Type, c.Status, c.Reason, c.Message) } - return result } @@ -857,7 +874,11 @@ func criticalComponentsNotReadyTaintPresent(node *v1.Node) bool { } func isPendingMachineWithCriticalComponentsNotReadyTaint(clone *v1alpha1.Machine, node *v1.Node) bool { - return clone.Status.CurrentStatus.Phase == v1alpha1.MachinePending && criticalComponentsNotReadyTaintPresent(node) + if clone.Status.CurrentStatus.Phase == v1alpha1.MachinePending && criticalComponentsNotReadyTaintPresent(node) { + klog.V(3).Infof("Critical component taint %q still present on node %q for machine %q", machineutils.TaintNodeCriticalComponentsNotReady, getNodeName(clone), clone.Name) + return true + } + return false } /* @@ -889,6 +910,10 @@ func (c *controller) setMachineTerminationStatus(ctx context.Context, deleteMach // Return error even when machine object is updated to ensure reconcilation is restarted err = fmt.Errorf("Machine deletion in process. Phase set to termination") } + + if apierrors.IsConflict(err) { + return machineutils.ConflictRetry, err + } return machineutils.ShortRetry, err } @@ -949,7 +974,7 @@ func (c *controller) getVMStatus(ctx context.Context, getMachineStatusRequest *d } } - c.machineStatusUpdate( + updateErr := c.machineStatusUpdate( ctx, getMachineStatusRequest.Machine, v1alpha1.LastOperation{ @@ -964,6 +989,11 @@ func (c *controller) getVMStatus(ctx context.Context, getMachineStatusRequest *d getMachineStatusRequest.Machine.Status.CurrentStatus, getMachineStatusRequest.Machine.Status.LastKnownState, ) + if apierrors.IsConflict(updateErr) { + return machineutils.ConflictRetry, err + } else if updateErr != nil { + return retry, updateErr + } return retry, err } @@ -1077,7 +1107,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver ) } - // update node with the machine's state prior to termination + // update node with the machine's phase prior to termination if err = c.UpdateNodeTerminationCondition(ctx, machine); err != nil { if forceDeleteMachine { klog.Warningf("Failed to update node conditions: %v. However, since it's a force deletion shall continue deletion of VM.", err) @@ -1148,7 +1178,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver } } - c.machineStatusUpdate( + updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1163,6 +1193,11 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver machine.Status.CurrentStatus, machine.Status.LastKnownState, ) + if apierrors.IsConflict(updateErr) { + return machineutils.ConflictRetry, err + } else if updateErr != nil { + return machineutils.ShortRetry, updateErr + } return machineutils.ShortRetry, err } @@ -1212,7 +1247,7 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine } now := metav1.Now() klog.V(4).Infof("(deleteVolumeAttachmentsForNode) For node %q, machine %q, set LastOperation.Description: %q", nodeName, machine.Name, description) - err = c.machineStatusUpdate( + updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1224,6 +1259,12 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine machine.Status.CurrentStatus, machine.Status.LastKnownState, ) + if apierrors.IsConflict(updateErr) { + return machineutils.ConflictRetry, err + } else if updateErr != nil { + return retryPeriod, updateErr + } + return retryPeriod, err } @@ -1275,7 +1316,7 @@ func (c *controller) deleteVM(ctx context.Context, deleteMachineRequest *driver. lastKnownState = deleteMachineResponse.LastKnownState } - c.machineStatusUpdate( + updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1290,6 +1331,11 @@ func (c *controller) deleteVM(ctx context.Context, deleteMachineRequest *driver. machine.Status.CurrentStatus, lastKnownState, ) + if apierrors.IsConflict(updateErr) { + return machineutils.ConflictRetry, err + } else if updateErr != nil { + return retryRequired, updateErr + } return retryRequired, err } @@ -1308,14 +1354,14 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac // Delete node object err = c.targetCoreClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) if err != nil && !apierrors.IsNotFound(err) { - // If its an error, and anyother error than object not found + // If its an error, and any other error than object not found description = fmt.Sprintf("Deletion of Node Object %q failed due to error: %s. %s", nodeName, err, machineutils.InitiateNodeDeletion) state = v1alpha1.MachineStateFailed } else if err == nil { description = fmt.Sprintf("Deletion of Node Object %q is successful. %s", nodeName, machineutils.InitiateFinalizerRemoval) state = v1alpha1.MachineStateProcessing - err = fmt.Errorf("Machine deletion in process. Deletion of node object was succesful") + err = fmt.Errorf("Machine deletion in process. Deletion of node object was successful") } else { description = fmt.Sprintf("No node object found for %q, continuing deletion flow. %s", nodeName, machineutils.InitiateFinalizerRemoval) state = v1alpha1.MachineStateProcessing @@ -1327,7 +1373,7 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac err = fmt.Errorf("Machine deletion in process. No node object found") } - c.machineStatusUpdate( + updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1342,6 +1388,11 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac machine.Status.CurrentStatus, machine.Status.LastKnownState, ) + if apierrors.IsConflict(updateErr) { + return machineutils.ConflictRetry, err + } else if updateErr != nil { + return machineutils.ShortRetry, updateErr + } return machineutils.ShortRetry, err } diff --git a/pkg/util/provider/machinecontroller/machineclass.go b/pkg/util/provider/machinecontroller/machineclass.go index 13e1dc031..c90c139aa 100644 --- a/pkg/util/provider/machinecontroller/machineclass.go +++ b/pkg/util/provider/machinecontroller/machineclass.go @@ -96,12 +96,12 @@ func (c *controller) reconcileClusterMachineClassKey(key string) error { err = c.reconcileClusterMachineClass(ctx, class) if err != nil { - // Re-enqueue after a 10s window - c.enqueueMachineClassAfter(class, 10*time.Second) + // Re-enqueue after a ShortRetry window + c.enqueueMachineClassAfter(class, time.Duration(machineutils.ShortRetry)) } else { // Re-enqueue periodically to avoid missing of events // TODO: Get ride of this logic - c.enqueueMachineClassAfter(class, 10*time.Minute) + c.enqueueMachineClassAfter(class, time.Duration(machineutils.LongRetry)) } return nil @@ -137,7 +137,7 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1 // Enqueue all machines once finalizer is added to machineClass // This is to allow processing of such machines for _, machine := range machines { - c.enqueueMachine(machine) + c.enqueueMachine(machine, "finalizer placed on machineClass") } } @@ -171,13 +171,13 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1 func (c *controller) addMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass) error { finalizers := sets.NewString(class.Finalizers...) finalizers.Insert(MCMFinalizerName) - return c.updateMachineClassFinalizers(ctx, class, finalizers.List(),true) + return c.updateMachineClassFinalizers(ctx, class, finalizers.List(), true) } func (c *controller) deleteMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass) error { finalizers := sets.NewString(class.Finalizers...) finalizers.Delete(MCMFinalizerName) - return c.updateMachineClassFinalizers(ctx, class, finalizers.List(),false) + return c.updateMachineClassFinalizers(ctx, class, finalizers.List(), false) } func (c *controller) updateMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass, finalizers []string, addFinalizers bool) error { @@ -194,9 +194,9 @@ func (c *controller) updateMachineClassFinalizers(ctx context.Context, class *v1 klog.Warning("Updating machineClass failed, retrying. ", class.Name, err) return err } - if addFinalizers{ + if addFinalizers { klog.V(3).Infof("Successfully added finalizer on the machineclass %q", class.Name) - }else{ + } else { klog.V(3).Infof("Successfully removed finalizer on the machineclass %q", class.Name) } return err From f606092e25f2f3ae538a3e50e1e88fdf2fdf6148 Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Fri, 1 Dec 2023 17:47:17 +0530 Subject: [PATCH 14/22] make generate && make check --- docs/documents/apis.md | 4 ++-- kubernetes/crds/machine.sapcloud.io_machines.yaml | 2 +- pkg/util/provider/machinecontroller/machine_util.go | 8 -------- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/docs/documents/apis.md b/docs/documents/apis.md index 6ea44ddc0..401062f35 100644 --- a/docs/documents/apis.md +++ b/docs/documents/apis.md @@ -1555,7 +1555,7 @@ to be.

CurrentStatus)

-

MachinePhase is a label for the condition of a machines at the current time.

+

MachinePhase is a label for the condition of a machine at the current time.


@@ -1992,7 +1992,7 @@ MachineConfiguration LastOperation)

-

MachineState is a current state of the machine.

+

MachineState is a current state of the operation.


diff --git a/kubernetes/crds/machine.sapcloud.io_machines.yaml b/kubernetes/crds/machine.sapcloud.io_machines.yaml index 1b800af25..744354ac5 100644 --- a/kubernetes/crds/machine.sapcloud.io_machines.yaml +++ b/kubernetes/crds/machine.sapcloud.io_machines.yaml @@ -245,7 +245,7 @@ spec: format: date-time type: string phase: - description: MachinePhase is a label for the condition of a machines + description: MachinePhase is a label for the condition of a machine at the current time. type: string timeoutActive: diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index a00c15f8e..6720859b3 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -768,14 +768,6 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph return machineutils.LongRetry, nil } -func NodeConditionsHaveChanged(oldConditions []v1.NodeCondition, newConditions []v1.NodeCondition) ([]v1.NodeCondition, []v1.NodeCondition, bool) { - return nodeConditionsHaveChanged(oldConditions, newConditions) -} - -func GetFormattedNodeConditions(conditions []v1.NodeCondition) string { - return getFormattedNodeConditions(conditions) -} - func getFormattedNodeConditions(conditions []v1.NodeCondition) string { var result string if len(conditions) == 0 { From 41b8a731a5649b7012fb3c223377b39ead654c62 Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Fri, 8 Dec 2023 16:33:44 +0530 Subject: [PATCH 15/22] reconcile on removal of critical component taint --- .../provider/machinecontroller/machine.go | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 8fbc82710..521f3288d 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -21,6 +21,7 @@ import ( "context" "errors" "fmt" + "reflect" "strings" "time" @@ -248,21 +249,22 @@ func (c *controller) addNodeToMachine(obj interface{}) { } func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) { + oldNode := oldObj.(*corev1.Node) node := newObj.(*corev1.Node) if node == nil { klog.Errorf("Couldn't convert to node from object") return } + machine, err := c.getMachineFromNode(node.Name) + if err != nil { + klog.Errorf("Couldn't fetch machine %s, Error: %s", machine.Name, err) + return + } + // check for the TriggerDeletionByMCM annotation on the node object // if it is present then mark the machine object for deletion if value, ok := node.Annotations[machineutils.TriggerDeletionByMCM]; ok && value == "true" { - machine, err := c.getMachineFromNode(node.Name) - if err != nil { - klog.Errorf("Couldn't fetch machine %s, Error: %s", machine.Name, err) - return - } - if machine.DeletionTimestamp == nil { klog.Infof("Node %s for machine %s is annotated to trigger deletion by MCM.", node.Name, machine.Name) if err := c.controlMachineClient.Machines(c.namespace).Delete(context.Background(), machine.Name, metav1.DeleteOptions{}); err != nil { @@ -273,6 +275,13 @@ func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) { } } + // to reconcile on removal of critical component taint + nodeSpecHasChanged := isNodeSpecChanged(oldNode, node) + if nodeSpecHasChanged { + c.enqueueMachine(machine, fmt.Sprintf("handling node UPDATE event. spec of backing node %q has changed", getNodeName(machine))) + return + } + c.addNodeToMachine(newObj) } @@ -319,6 +328,10 @@ func (c *controller) getMachineFromNode(nodeName string) (*v1alpha1.Machine, err return machines[0], nil } +func isNodeSpecChanged(oldNode *corev1.Node, node *corev1.Node) bool { + return !reflect.DeepEqual(oldNode.Spec, node.Spec) +} + /* SECTION Machine operations - Create, Delete From 2099f976ebed35db69e90e9a45ed0637914cff6c Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Fri, 8 Dec 2023 20:25:45 +0530 Subject: [PATCH 16/22] reconcile machine on specific node taint updates --- .../provider/machinecontroller/machine.go | 30 ++++++++++++++----- pkg/util/provider/machineutils/utils.go | 4 +++ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 521f3288d..b9cc4f1f9 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -21,7 +21,6 @@ import ( "context" "errors" "fmt" - "reflect" "strings" "time" @@ -275,10 +274,9 @@ func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) { } } - // to reconcile on removal of critical component taint - nodeSpecHasChanged := isNodeSpecChanged(oldNode, node) - if nodeSpecHasChanged { - c.enqueueMachine(machine, fmt.Sprintf("handling node UPDATE event. spec of backing node %q has changed", getNodeName(machine))) + // to reconcile on addition/removal of essential taints in machine lifecycle, example - critical component taint + if areEssentialTaintsUpdated(oldNode, node, machineutils.EssentialTaints) { + c.enqueueMachine(machine, fmt.Sprintf("handling node UPDATE event. atleast one of essential taints on backing node %q has changed", getNodeName(machine))) return } @@ -328,8 +326,26 @@ func (c *controller) getMachineFromNode(nodeName string) (*v1alpha1.Machine, err return machines[0], nil } -func isNodeSpecChanged(oldNode *corev1.Node, node *corev1.Node) bool { - return !reflect.DeepEqual(oldNode.Spec, node.Spec) +func areEssentialTaintsUpdated(oldNode, node *corev1.Node, taintKeys []string) bool { + mapOldNodeTaintKeys := make(map[string]bool) + mapNodeTaintKeys := make(map[string]bool) + + for _, t := range oldNode.Spec.Taints { + mapOldNodeTaintKeys[t.Key] = true + } + + for _, t := range node.Spec.Taints { + mapNodeTaintKeys[t.Key] = true + } + + for _, tk := range taintKeys { + _, oldNodeHasTaint := mapOldNodeTaintKeys[tk] + _, newNodeHasTaint := mapNodeTaintKeys[tk] + if oldNodeHasTaint != newNodeHasTaint { + return true + } + } + return false } /* diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go index 332349206..3d91e1a68 100644 --- a/pkg/util/provider/machineutils/utils.go +++ b/pkg/util/provider/machineutils/utils.go @@ -88,3 +88,7 @@ const ( // LongRetry tells the controller to retry after a long duration - 10 minutes LongRetry RetryPeriod = RetryPeriod(10 * time.Minute) ) + +// EssentialTaints are taints on node object which if added/removed, require an immediate reconcile by machine controller +// TODO: update this when taints for ALT updation and PostCreate operations is introduced. +var EssentialTaints = []string{TaintNodeCriticalComponentsNotReady} From 692e894494a74a52fb8223a69111afa9ddf87aed Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Tue, 12 Dec 2023 16:39:31 +0530 Subject: [PATCH 17/22] adapted unit tests for create errors --- pkg/util/provider/machinecontroller/machine_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 23ab69061..39334ee71 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -885,7 +885,7 @@ var _ = Describe("machine", func() { ErrorCode: codes.Internal.String(), }, }, nil, nil, nil, true, metav1.NewTime(metav1.Now().Add(-time.Hour))), - err: nil, + err: status.Error(codes.Internal, "Provider is returning error on create call"), retry: machineutils.MediumRetry, }, }), From 174a4b19841d7b900ffab686cf3e1eafc0b25b3c Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Wed, 13 Dec 2023 12:12:11 +0530 Subject: [PATCH 18/22] fixed some failing tests --- pkg/util/provider/machinecontroller/machine_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 39334ee71..9bfd4bf81 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -781,7 +781,7 @@ var _ = Describe("machine", func() { ErrorCode: codes.Internal.String(), }, }, nil, nil, nil, true, metav1.Now()), - err: nil, + err: status.Error(codes.Internal, "Provider is returning error on create call"), retry: machineutils.MediumRetry, }, }), @@ -833,7 +833,7 @@ var _ = Describe("machine", func() { ErrorCode: codes.ResourceExhausted.String(), }, }, nil, nil, nil, true, metav1.Now()), - err: nil, + err: status.Error(codes.ResourceExhausted, "Provider does not have capacity to create VM"), retry: machineutils.MediumRetry, }, }), From ca12dd0abca0efc07b9307c5b58c6a8eb80be3a6 Mon Sep 17 00:00:00 2001 From: Himanshu Sharma <79965161+himanshu-kun@users.noreply.github.com> Date: Fri, 15 Dec 2023 12:12:29 +0530 Subject: [PATCH 19/22] Apply suggestions from code review Co-authored-by: Aaron Francis Fernandes <79958509+aaronfern@users.noreply.github.com> Co-authored-by: Rishabh Patel <66425093+rishabh-11@users.noreply.github.com> --- pkg/apis/machine/v1alpha1/machine_types.go | 8 ++++---- pkg/util/provider/machinecontroller/machine.go | 6 +++--- pkg/util/provider/machinecontroller/machine_safety.go | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/apis/machine/v1alpha1/machine_types.go b/pkg/apis/machine/v1alpha1/machine_types.go index 00f87aaf1..d3541743c 100644 --- a/pkg/apis/machine/v1alpha1/machine_types.go +++ b/pkg/apis/machine/v1alpha1/machine_types.go @@ -156,19 +156,19 @@ const ( // MachineUnknown indicates that the node is not ready at the movement MachineUnknown MachinePhase = "Unknown" - // MachineFailed means operation timeod out + // MachineFailed means operation timed out MachineFailed MachinePhase = "Failed" - // MachineCrashLoopBackOff means creation machine is failing. It means that machine object is present but there is no corresponding VM. + // MachineCrashLoopBackOff means machine creation is failing. It means that machine object is present but there is no corresponding VM. MachineCrashLoopBackOff MachinePhase = "CrashLoopBackOff" ) // MachineState is a current state of the operation. type MachineState string -// These are the valid states of the operation performed on the machine. +// These are the valid states of operations performed on the machine. const ( - // MachineStateProcessing means operation is not yet complete + // MachineStateProcessing means operation is not yet complete MachineStateProcessing MachineState = "Processing" // MachineStateFailed means operation failed diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index b9cc4f1f9..7ebc8507a 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -55,7 +55,7 @@ func (c *controller) updateMachine(oldObj, newObj interface{}) { newMachine := newObj.(*v1alpha1.Machine) if oldMachine == nil || newMachine == nil { - klog.Errorf("Couldn't convert to machine resource from object") + klog.Errorf("couldn't convert to machine resource from object") return } @@ -152,7 +152,7 @@ func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alp // Validate MachineClass machineClass, secretData, retry, err := c.ValidateMachineClass(ctx, &machine.Spec.Class) if err != nil { - klog.Errorf("Cannot reconcile machine %s: %s", machine.Name, err) + klog.Errorf("cannot reconcile machine %s: %s", machine.Name, err) return retry, err } @@ -257,7 +257,7 @@ func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) { machine, err := c.getMachineFromNode(node.Name) if err != nil { - klog.Errorf("Couldn't fetch machine %s, Error: %s", machine.Name, err) + klog.Errorf("Unable to handle update event for node %s, couldn't fetch machine %s, Error: %s", machine.Name, err) return } diff --git a/pkg/util/provider/machinecontroller/machine_safety.go b/pkg/util/provider/machinecontroller/machine_safety.go index 8c02bdcb7..7da65e9b3 100644 --- a/pkg/util/provider/machinecontroller/machine_safety.go +++ b/pkg/util/provider/machinecontroller/machine_safety.go @@ -107,8 +107,8 @@ func (c *controller) reconcileClusterMachineSafetyAPIServer(key string) error { klog.V(2).Infof("SafetyController: Reinitializing machine health check for machine: %q with backing node: %q and providerID: %q", machine.Name, getNodeName(machine), getProviderID(machine)) } - // En-queue after 30 seconds, to ensure all machine phases are reconciled to actual as all are - // reseted to `Running` currently + // En-queue after 30 seconds, to ensure all machine phases are reconciled to their actual value + // as they are currently reset to `Running` c.enqueueMachineAfter(machine, 30*time.Second, "kube-api-servers are up again, so reconcile of machine phase is needed") } From b73c1476307bcf8d7e09e38aee513ad83d6c814c Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Fri, 15 Dec 2023 12:13:57 +0530 Subject: [PATCH 20/22] addresed Rishab's suggestion part 1 --- pkg/util/provider/machinecontroller/machine.go | 4 ++-- pkg/util/provider/machinecontroller/machine_util_test.go | 6 +++--- pkg/util/provider/machinecontroller/machineclass.go | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 7ebc8507a..dcd223d8c 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -275,7 +275,7 @@ func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) { } // to reconcile on addition/removal of essential taints in machine lifecycle, example - critical component taint - if areEssentialTaintsUpdated(oldNode, node, machineutils.EssentialTaints) { + if addedOrRemovedEssentialTaints(oldNode, node, machineutils.EssentialTaints) { c.enqueueMachine(machine, fmt.Sprintf("handling node UPDATE event. atleast one of essential taints on backing node %q has changed", getNodeName(machine))) return } @@ -326,7 +326,7 @@ func (c *controller) getMachineFromNode(nodeName string) (*v1alpha1.Machine, err return machines[0], nil } -func areEssentialTaintsUpdated(oldNode, node *corev1.Node, taintKeys []string) bool { +func addedOrRemovedEssentialTaints(oldNode, node *corev1.Node, taintKeys []string) bool { mapOldNodeTaintKeys := make(map[string]bool) mapNodeTaintKeys := make(map[string]bool) diff --git a/pkg/util/provider/machinecontroller/machine_util_test.go b/pkg/util/provider/machinecontroller/machine_util_test.go index 8626c0f88..2e83bfed2 100644 --- a/pkg/util/provider/machinecontroller/machine_util_test.go +++ b/pkg/util/provider/machinecontroller/machine_util_test.go @@ -2363,7 +2363,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine Phase/Conditions have been UPDATED"), + err: errSuccessfulPhaseUpdate, expectedPhase: v1alpha1.MachineRunning, }, }), @@ -2390,7 +2390,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine Phase/Conditions have been UPDATED"), + err: errSuccessfulPhaseUpdate, expectedPhase: v1alpha1.MachineRunning, }, }), @@ -2424,7 +2424,7 @@ var _ = Describe("machine_util", func() { }, expect: expect{ retryPeriod: machineutils.ShortRetry, - err: errors.New("machine creation is successful. Machine Phase/Conditions have been UPDATED"), + err: errSuccessfulPhaseUpdate, expectedPhase: v1alpha1.MachineRunning, }, }), diff --git a/pkg/util/provider/machinecontroller/machineclass.go b/pkg/util/provider/machinecontroller/machineclass.go index c90c139aa..91a1ab460 100644 --- a/pkg/util/provider/machinecontroller/machineclass.go +++ b/pkg/util/provider/machinecontroller/machineclass.go @@ -129,7 +129,7 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1 if finalizers := sets.NewString(class.Finalizers...); !finalizers.Has(MCMFinalizerName) { // Add machineClassFinalizer as if doesn't exist - err = c.addMachineClassFinalizers(ctx, class) + err = c.addMCMFinalizerToMachineClass(ctx, class) if err != nil { return err } @@ -157,7 +157,7 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1 if finalizers := sets.NewString(class.Finalizers...); finalizers.Has(MCMFinalizerName) { // Delete finalizer if exists on machineClass - return c.deleteMachineClassFinalizers(ctx, class) + return c.deleteMCMFinalizerFromMachineClass(ctx, class) } return nil @@ -168,13 +168,13 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1 Manipulate Finalizers */ -func (c *controller) addMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass) error { +func (c *controller) addMCMFinalizerToMachineClass(ctx context.Context, class *v1alpha1.MachineClass) error { finalizers := sets.NewString(class.Finalizers...) finalizers.Insert(MCMFinalizerName) return c.updateMachineClassFinalizers(ctx, class, finalizers.List(), true) } -func (c *controller) deleteMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass) error { +func (c *controller) deleteMCMFinalizerFromMachineClass(ctx context.Context, class *v1alpha1.MachineClass) error { finalizers := sets.NewString(class.Finalizers...) finalizers.Delete(MCMFinalizerName) return c.updateMachineClassFinalizers(ctx, class, finalizers.List(), false) From 9d41bf78e2a7a3f19d74888366730ac602f733b7 Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Fri, 15 Dec 2023 13:10:26 +0530 Subject: [PATCH 21/22] addressed Rishab's review comments part 2 --- .../provider/machinecontroller/machine.go | 28 ++++---- .../machinecontroller/machine_safety.go | 4 +- .../machinecontroller/machine_util.go | 64 +++++++++---------- 3 files changed, 45 insertions(+), 51 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index dcd223d8c..c85d658c5 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -59,7 +59,7 @@ func (c *controller) updateMachine(oldObj, newObj interface{}) { return } - if oldMachine.ResourceVersion != newMachine.ResourceVersion && oldMachine.Generation == newMachine.Generation { + if oldMachine.Generation == newMachine.Generation { klog.V(3).Infof("Skipping non-spec updates for machine %s", oldMachine.Name) return } @@ -342,6 +342,7 @@ func addedOrRemovedEssentialTaints(oldNode, node *corev1.Node, taintKeys []strin _, oldNodeHasTaint := mapOldNodeTaintKeys[tk] _, newNodeHasTaint := mapNodeTaintKeys[tk] if oldNodeHasTaint != newNodeHasTaint { + klog.V(2).Infof("Taint with key %q has been added/removed from the node %q", tk, node.Name) return true } } @@ -443,7 +444,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque } // machine obj marked Failed for double security - updateErr := c.machineStatusUpdate( + updateRetryRequired, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -458,10 +459,9 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque }, machine.Status.LastKnownState, ) - if apierrors.IsConflict(updateErr) { - return machineutils.ConflictRetry, err - } else if updateErr != nil { - return machineutils.ShortRetry, updateErr + + if updateErr != nil { + return updateRetryRequired, updateErr } klog.V(2).Infof("Machine %q marked Failed as VM was referring to a stale node object", machine.Name) @@ -475,7 +475,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque case codes.Unknown, codes.DeadlineExceeded, codes.Aborted, codes.Unavailable: // GetMachineStatus() returned with one of the above error codes. // Retry operation. - updateErr := c.machineStatusUpdate( + updateRetryRequired, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -490,16 +490,14 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque }, machine.Status.LastKnownState, ) - if apierrors.IsConflict(updateErr) { - return machineutils.ConflictRetry, err - } else if updateErr != nil { - return machineutils.ShortRetry, updateErr + if updateErr != nil { + return updateRetryRequired, updateErr } return machineutils.ShortRetry, err default: - updateErr := c.machineStatusUpdate( + updateRetryRequired, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -514,10 +512,8 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque }, machine.Status.LastKnownState, ) - if apierrors.IsConflict(updateErr) { - return machineutils.ConflictRetry, err - } else if updateErr != nil { - return machineutils.MediumRetry, updateErr + if updateErr != nil { + return updateRetryRequired, updateErr } return machineutils.MediumRetry, err diff --git a/pkg/util/provider/machinecontroller/machine_safety.go b/pkg/util/provider/machinecontroller/machine_safety.go index 7da65e9b3..44b99d286 100644 --- a/pkg/util/provider/machinecontroller/machine_safety.go +++ b/pkg/util/provider/machinecontroller/machine_safety.go @@ -107,8 +107,8 @@ func (c *controller) reconcileClusterMachineSafetyAPIServer(key string) error { klog.V(2).Infof("SafetyController: Reinitializing machine health check for machine: %q with backing node: %q and providerID: %q", machine.Name, getNodeName(machine), getProviderID(machine)) } - // En-queue after 30 seconds, to ensure all machine phases are reconciled to their actual value - // as they are currently reset to `Running` + // En-queue after 30 seconds, to ensure all machine phases are reconciled to their actual value + // as they are currently reset to `Running` c.enqueueMachineAfter(machine, 30*time.Second, "kube-api-servers are up again, so reconcile of machine phase is needed") } diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 6720859b3..c49c35ec4 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -482,7 +482,7 @@ func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1a lastKnownState = createMachineResponse.LastKnownState } - updateErr := c.machineStatusUpdate( + updateRetryRequired, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -498,10 +498,9 @@ func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1a }, lastKnownState, ) - if apierrors.IsConflict(updateErr) { - return machineutils.ConflictRetry, err - } else if updateErr != nil { - return retryRequired, updateErr + + if updateErr != nil { + return updateRetryRequired, updateErr } return retryRequired, err @@ -513,7 +512,7 @@ func (c *controller) machineStatusUpdate( lastOperation v1alpha1.LastOperation, currentStatus v1alpha1.CurrentStatus, lastKnownState string, -) error { +) (machineutils.RetryPeriod, error) { clone := machine.DeepCopy() clone.Status.LastOperation = lastOperation clone.Status.CurrentStatus = currentStatus @@ -521,7 +520,7 @@ func (c *controller) machineStatusUpdate( if isMachineStatusSimilar(clone.Status, machine.Status) { klog.V(3).Infof("Not updating the status of the machine object %q, as the content is similar", clone.Name) - return nil + return machineutils.ShortRetry, nil } _, err := c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(ctx, clone, metav1.UpdateOptions{}) @@ -532,7 +531,11 @@ func (c *controller) machineStatusUpdate( klog.V(2).Infof("Machine/status UPDATE for %q", machine.Name) } - return err + if apierrors.IsConflict(err) { + return machineutils.ConflictRetry, err + } + + return machineutils.ShortRetry, err } // isMachineStatusSimilar checks if the status of 2 machines is similar or not. @@ -966,7 +969,7 @@ func (c *controller) getVMStatus(ctx context.Context, getMachineStatusRequest *d } } - updateErr := c.machineStatusUpdate( + updateRetryRequired, updateErr := c.machineStatusUpdate( ctx, getMachineStatusRequest.Machine, v1alpha1.LastOperation{ @@ -981,10 +984,9 @@ func (c *controller) getVMStatus(ctx context.Context, getMachineStatusRequest *d getMachineStatusRequest.Machine.Status.CurrentStatus, getMachineStatusRequest.Machine.Status.LastKnownState, ) - if apierrors.IsConflict(updateErr) { - return machineutils.ConflictRetry, err - } else if updateErr != nil { - return retry, updateErr + + if updateErr != nil { + return updateRetryRequired, updateErr } return retry, err @@ -1170,7 +1172,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver } } - updateErr := c.machineStatusUpdate( + updateRetryRequired, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1185,10 +1187,9 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver machine.Status.CurrentStatus, machine.Status.LastKnownState, ) - if apierrors.IsConflict(updateErr) { - return machineutils.ConflictRetry, err - } else if updateErr != nil { - return machineutils.ShortRetry, updateErr + + if updateErr != nil { + return updateRetryRequired, updateErr } return machineutils.ShortRetry, err @@ -1239,7 +1240,7 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine } now := metav1.Now() klog.V(4).Infof("(deleteVolumeAttachmentsForNode) For node %q, machine %q, set LastOperation.Description: %q", nodeName, machine.Name, description) - updateErr := c.machineStatusUpdate( + updateRetryRequired, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1251,10 +1252,9 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine machine.Status.CurrentStatus, machine.Status.LastKnownState, ) - if apierrors.IsConflict(updateErr) { - return machineutils.ConflictRetry, err - } else if updateErr != nil { - return retryPeriod, updateErr + + if updateErr != nil { + return updateRetryRequired, updateErr } return retryPeriod, err @@ -1308,7 +1308,7 @@ func (c *controller) deleteVM(ctx context.Context, deleteMachineRequest *driver. lastKnownState = deleteMachineResponse.LastKnownState } - updateErr := c.machineStatusUpdate( + updateRetryRequired, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1323,10 +1323,9 @@ func (c *controller) deleteVM(ctx context.Context, deleteMachineRequest *driver. machine.Status.CurrentStatus, lastKnownState, ) - if apierrors.IsConflict(updateErr) { - return machineutils.ConflictRetry, err - } else if updateErr != nil { - return retryRequired, updateErr + + if updateErr != nil { + return updateRetryRequired, updateErr } return retryRequired, err @@ -1365,7 +1364,7 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac err = fmt.Errorf("Machine deletion in process. No node object found") } - updateErr := c.machineStatusUpdate( + updateRetryRequired, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1380,10 +1379,9 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac machine.Status.CurrentStatus, machine.Status.LastKnownState, ) - if apierrors.IsConflict(updateErr) { - return machineutils.ConflictRetry, err - } else if updateErr != nil { - return machineutils.ShortRetry, updateErr + + if updateErr != nil { + return updateRetryRequired, updateErr } return machineutils.ShortRetry, err From c93ce96d10b5a89363be09fdb4a1439b663f0011 Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Fri, 15 Dec 2023 14:03:35 +0530 Subject: [PATCH 22/22] renamed updateRetryRequired to updateRetryPeriod --- .../provider/machinecontroller/machine.go | 12 +++++----- .../machinecontroller/machine_util.go | 24 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index c85d658c5..e28c9e9b4 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -444,7 +444,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque } // machine obj marked Failed for double security - updateRetryRequired, updateErr := c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -461,7 +461,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque ) if updateErr != nil { - return updateRetryRequired, updateErr + return updateRetryPeriod, updateErr } klog.V(2).Infof("Machine %q marked Failed as VM was referring to a stale node object", machine.Name) @@ -475,7 +475,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque case codes.Unknown, codes.DeadlineExceeded, codes.Aborted, codes.Unavailable: // GetMachineStatus() returned with one of the above error codes. // Retry operation. - updateRetryRequired, updateErr := c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -491,13 +491,13 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque machine.Status.LastKnownState, ) if updateErr != nil { - return updateRetryRequired, updateErr + return updateRetryPeriod, updateErr } return machineutils.ShortRetry, err default: - updateRetryRequired, updateErr := c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -513,7 +513,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque machine.Status.LastKnownState, ) if updateErr != nil { - return updateRetryRequired, updateErr + return updateRetryPeriod, updateErr } return machineutils.MediumRetry, err diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index c49c35ec4..67919c5c8 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -482,7 +482,7 @@ func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1a lastKnownState = createMachineResponse.LastKnownState } - updateRetryRequired, updateErr := c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -500,7 +500,7 @@ func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1a ) if updateErr != nil { - return updateRetryRequired, updateErr + return updateRetryPeriod, updateErr } return retryRequired, err @@ -969,7 +969,7 @@ func (c *controller) getVMStatus(ctx context.Context, getMachineStatusRequest *d } } - updateRetryRequired, updateErr := c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, getMachineStatusRequest.Machine, v1alpha1.LastOperation{ @@ -986,7 +986,7 @@ func (c *controller) getVMStatus(ctx context.Context, getMachineStatusRequest *d ) if updateErr != nil { - return updateRetryRequired, updateErr + return updateRetryPeriod, updateErr } return retry, err @@ -1172,7 +1172,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver } } - updateRetryRequired, updateErr := c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1189,7 +1189,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver ) if updateErr != nil { - return updateRetryRequired, updateErr + return updateRetryPeriod, updateErr } return machineutils.ShortRetry, err @@ -1240,7 +1240,7 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine } now := metav1.Now() klog.V(4).Infof("(deleteVolumeAttachmentsForNode) For node %q, machine %q, set LastOperation.Description: %q", nodeName, machine.Name, description) - updateRetryRequired, updateErr := c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1254,7 +1254,7 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine ) if updateErr != nil { - return updateRetryRequired, updateErr + return updateRetryPeriod, updateErr } return retryPeriod, err @@ -1308,7 +1308,7 @@ func (c *controller) deleteVM(ctx context.Context, deleteMachineRequest *driver. lastKnownState = deleteMachineResponse.LastKnownState } - updateRetryRequired, updateErr := c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1325,7 +1325,7 @@ func (c *controller) deleteVM(ctx context.Context, deleteMachineRequest *driver. ) if updateErr != nil { - return updateRetryRequired, updateErr + return updateRetryPeriod, updateErr } return retryRequired, err @@ -1364,7 +1364,7 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac err = fmt.Errorf("Machine deletion in process. No node object found") } - updateRetryRequired, updateErr := c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1381,7 +1381,7 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac ) if updateErr != nil { - return updateRetryRequired, updateErr + return updateRetryPeriod, updateErr } return machineutils.ShortRetry, err