diff --git a/cmd/machine-controller-manager/app/controllermanager.go b/cmd/machine-controller-manager/app/controllermanager.go
index 2f9aec1f3..3128acd74 100644
--- a/cmd/machine-controller-manager/app/controllermanager.go
+++ b/cmd/machine-controller-manager/app/controllermanager.go
@@ -208,7 +208,7 @@ func StartControllers(s *options.MCMServer,
 	recorder record.EventRecorder,
 	stop <-chan struct{}) error {
 
-	klog.V(5).Info("Getting available resources")
+	klog.V(4).Info("Getting available resources")
 	availableResources, err := getAvailableResources(controlCoreClientBuilder)
 	if err != nil {
 		return err
@@ -229,7 +229,7 @@ func StartControllers(s *options.MCMServer,
 	}
 
 	if availableResources[machineGVR] || availableResources[machineSetGVR] || availableResources[machineDeploymentGVR] {
-		klog.V(5).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod)
+		klog.V(4).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod)
 
 		controlMachineInformerFactory := machineinformers.NewFilteredSharedInformerFactory(
 			controlMachineClientBuilder.ClientOrDie("control-machine-shared-informers"),
@@ -253,7 +253,7 @@ func StartControllers(s *options.MCMServer,
 		// All shared informers are v1alpha1 API level
 		machineSharedInformers := controlMachineInformerFactory.Machine().V1alpha1()
 
-		klog.V(5).Infof("Creating controllers...")
+		klog.V(4).Infof("Creating controllers...")
 		mcmController, err := mcmcontroller.NewController(
 			s.Namespace,
 			controlMachineClient,
@@ -276,7 +276,7 @@ func StartControllers(s *options.MCMServer,
 		controlCoreInformerFactory.Start(stop)
 		targetCoreInformerFactory.Start(stop)
 
-		klog.V(5).Info("Running controller")
+		klog.V(4).Info("Running controller")
 		go mcmController.Run(int(s.ConcurrentNodeSyncs), stop)
 	} else {
diff --git a/docs/documents/apis.md b/docs/documents/apis.md
index 6ea44ddc0..401062f35 100644
--- a/docs/documents/apis.md
+++ b/docs/documents/apis.md
@@ -1555,7 +1555,7 @@ to be.
 CurrentStatus)
 
-MachinePhase is a label for the condition of a machines at the current time.
+MachinePhase is a label for the condition of a machine at the current time.
 
@@ -1992,7 +1992,7 @@ MachineConfiguration
 
 LastOperation)
 
-MachineState is a current state of the machine.
+MachineState is a current state of the operation.
 
diff --git a/kubernetes/crds/machine.sapcloud.io_machines.yaml b/kubernetes/crds/machine.sapcloud.io_machines.yaml
index 1b800af25..744354ac5 100644
--- a/kubernetes/crds/machine.sapcloud.io_machines.yaml
+++ b/kubernetes/crds/machine.sapcloud.io_machines.yaml
@@ -245,7 +245,7 @@ spec:
                 format: date-time
                 type: string
               phase:
-                description: MachinePhase is a label for the condition of a machines
+                description: MachinePhase is a label for the condition of a machine
                   at the current time.
                 type: string
               timeoutActive:
diff --git a/pkg/apis/machine/v1alpha1/machine_types.go b/pkg/apis/machine/v1alpha1/machine_types.go
index b5920e570..d3541743c 100644
--- a/pkg/apis/machine/v1alpha1/machine_types.go
+++ b/pkg/apis/machine/v1alpha1/machine_types.go
@@ -136,10 +136,10 @@ type LastOperation struct {
 	Type MachineOperationType `json:"type,omitempty"`
 }
 
-// MachinePhase is a label for the condition of a machines at the current time.
+// MachinePhase is a label for the condition of a machine at the current time.
 type MachinePhase string
 
-// These are the valid statuses of machines.
+// These are the valid phases of machines.
 const (
 	// MachinePending means that the machine is being created
 	MachinePending MachinePhase = "Pending"
@@ -156,25 +156,25 @@ const (
 	// MachineUnknown indicates that the node is not ready at the movement
 	MachineUnknown MachinePhase = "Unknown"
 
-	// MachineFailed means operation failed leading to machine status failure
+	// MachineFailed means operation timed out
 	MachineFailed MachinePhase = "Failed"
 
-	// MachineCrashLoopBackOff means creation or deletion of the machine is failing. It means that machine object is present but there is no corresponding VM.
+	// MachineCrashLoopBackOff means machine creation is failing. It means that machine object is present but there is no corresponding VM.
 	MachineCrashLoopBackOff MachinePhase = "CrashLoopBackOff"
 )
 
-// MachineState is a current state of the machine.
+// MachineState is a current state of the operation.
 type MachineState string
 
-// These are the valid statuses of machines.
+// These are the valid states of operations performed on the machine.
 const (
-	// MachineStatePending means there are operations pending on this machine state
+	// MachineStateProcessing means operation is not yet complete
 	MachineStateProcessing MachineState = "Processing"
 
-	// MachineStateFailed means operation failed leading to machine status failure
+	// MachineStateFailed means operation failed
 	MachineStateFailed MachineState = "Failed"
 
-	// MachineStateSuccessful indicates that the node is not ready at the moment
+	// MachineStateSuccessful means operation completed successfully
 	MachineStateSuccessful MachineState = "Successful"
 )
 
@@ -189,7 +189,7 @@ const (
 	// MachineOperationUpdate indicates that the operation was an update
 	MachineOperationUpdate MachineOperationType = "Update"
 
-	// MachineOperationHealthCheck indicates that the operation was a create
+	// MachineOperationHealthCheck indicates that the operation was a health check of node object
 	MachineOperationHealthCheck MachineOperationType = "HealthCheck"
 
 	// MachineOperationDelete indicates that the operation was a delete
diff --git a/pkg/util/provider/app/app.go b/pkg/util/provider/app/app.go
index 3b095fce8..c1129b147 100644
--- a/pkg/util/provider/app/app.go
+++ b/pkg/util/provider/app/app.go
@@ -209,7 +209,7 @@ func StartControllers(s *options.MCServer,
 	recorder record.EventRecorder,
 	stop <-chan struct{}) error {
 
-	klog.V(5).Info("Getting available resources")
+	klog.V(4).Info("Getting available resources")
 	availableResources, err := getAvailableResources(controlCoreClientBuilder)
 	if err != nil {
 		return err
@@ -239,7 +239,7 @@ func StartControllers(s *options.MCServer,
 	}
 
 	if availableResources[machineGVR] {
-		klog.V(5).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod)
+		klog.V(4).Infof("Creating shared informers; resync interval: %v", s.MinResyncPeriod)
 
 		controlMachineInformerFactory := machineinformers.NewFilteredSharedInformerFactory(
 			controlMachineClientBuilder.ClientOrDie("control-machine-shared-informers"),
@@ -274,7 +274,7 @@ func StartControllers(s *options.MCServer,
 		// All shared informers are v1alpha1 API level
 		machineSharedInformers := controlMachineInformerFactory.Machine().V1alpha1()
 
-		klog.V(5).Infof("Creating controllers...")
+		klog.V(4).Infof("Creating controllers...")
 		machineController, err := machinecontroller.NewController(
 			s.Namespace,
 			controlMachineClient,
@@ -305,7 +305,7 @@ func StartControllers(s *options.MCServer,
 		controlCoreInformerFactory.Start(stop)
 		targetCoreInformerFactory.Start(stop)
 
-		klog.V(5).Info("Running controller")
+		klog.V(4).Info("Running controller")
 		go machineController.Run(int(s.ConcurrentNodeSyncs), stop)
 	} else {
diff --git a/pkg/util/provider/drain/drain.go b/pkg/util/provider/drain/drain.go
index 0b5a9f6e4..8b607056e 100644
--- a/pkg/util/provider/drain/drain.go
+++ b/pkg/util/provider/drain/drain.go
@@ -934,7 +934,7 @@ func (o *Options) waitForReattach(ctx context.Context, podVolumeInfo PodVolumeIn
 
 		case incomingEvent := <-volumeAttachmentEventCh:
 			persistentVolumeName := *incomingEvent.Spec.Source.PersistentVolumeName
-			klog.V(5).Infof("VolumeAttachment event received for PV: %s", persistentVolumeName)
+			klog.V(4).Infof("VolumeAttachment event received for PV: %s", persistentVolumeName)
 
 			// Checking if event for an PV that is being waited on
 			if _, present := pvsWaitingForReattachments[persistentVolumeName]; present {
diff --git a/pkg/util/provider/drain/volume_attachment.go b/pkg/util/provider/drain/volume_attachment.go
index f3dcc24ec..5c35afa3a 100644
--- a/pkg/util/provider/drain/volume_attachment.go
+++ b/pkg/util/provider/drain/volume_attachment.go
@@ -70,13 +70,13 @@ func (v *VolumeAttachmentHandler) dispatch(obj interface{}) {
 
 // AddVolumeAttachment is the event handler for VolumeAttachment add
 func (v *VolumeAttachmentHandler) AddVolumeAttachment(obj interface{}) {
-	klog.V(5).Infof("Adding volume attachment object")
+	klog.V(4).Infof("Adding volume attachment object")
 	v.dispatch(obj)
 }
 
 // UpdateVolumeAttachment is the event handler for VolumeAttachment update
 func (v *VolumeAttachmentHandler) UpdateVolumeAttachment(oldObj, newObj interface{}) {
-	klog.V(5).Info("Updating volume attachment object")
+	klog.V(4).Info("Updating volume attachment object")
 	v.dispatch(newObj)
 }
diff --git a/pkg/util/provider/machinecontroller/controller.go b/pkg/util/provider/machinecontroller/controller.go
index a9ff3c941..e4b458371 100644
--- a/pkg/util/provider/machinecontroller/controller.go
+++ b/pkg/util/provider/machinecontroller/controller.go
@@ -135,7 +135,6 @@ func NewController(
 	controller.pvcLister = pvcInformer.Lister()
 	controller.pvLister = pvInformer.Lister()
 	controller.secretLister = secretInformer.Lister()
-	// TODO: Need to handle K8s versions below 1.13 differently
 	controller.volumeAttachementLister = volumeAttachmentInformer.Lister()
 	controller.machineClassLister = machineClassInformer.Lister()
 	controller.nodeLister = nodeInformer.Lister()
@@ -158,7 +157,7 @@ func NewController(
 		controller.pdbV1beta1Synced = pdbV1beta1Informer.Informer().HasSynced
 	}
 
-	// Secret Controller Informers
+	// Secret Controller's Informers
 	secretInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc:    controller.secretAdd,
 		DeleteFunc: controller.secretDelete,
@@ -170,10 +169,9 @@ func NewController(
 		DeleteFunc: controller.machineClassToSecretDelete,
 	})
 
-	// Machine Class Controller Informers
+	// Machine Class Controller's Informers
 	machineInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc:    controller.machineToMachineClassAdd,
-		UpdateFunc: controller.machineToMachineClassUpdate,
 		DeleteFunc: controller.machineToMachineClassDelete,
 	})
 
@@ -183,7 +181,7 @@ func NewController(
 		DeleteFunc: controller.machineClassDelete,
 	})
 
-	// Machine Controller Informers
+	// Machine Controller's Informers
 	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc:    controller.addNodeToMachine,
 		UpdateFunc: controller.updateNodeToMachine,
@@ -196,7 +194,7 @@ func NewController(
 		DeleteFunc: controller.deleteMachine,
 	})
 
-	// MachineSafety Controller Informers
+	// MachineSafety Controller's Informers
 	// We follow the kubernetes way of reconciling the safety controller
 	// done by adding empty key objects. We initialize it, to trigger
 	// running of different safety loop on MCM startup.
@@ -209,7 +207,7 @@ func NewController(
 		DeleteFunc: controller.deleteMachineToSafety,
 	})
 
-	// Drain Controller Informers
+	// Drain Controller's Informers
 	if k8sutils.IsResourceSupported(
 		targetCoreClient,
 		schema.GroupResource{
diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go
index dbd95b480..e28c9e9b4 100644
--- a/pkg/util/provider/machinecontroller/machine.go
+++ b/pkg/util/provider/machinecontroller/machine.go
@@ -47,41 +47,50 @@ SECTION
 Machine controller - Machine add, update, delete watches
 */
 
 func (c *controller) addMachine(obj interface{}) {
-	klog.V(5).Infof("Adding machine object")
-	c.enqueueMachine(obj)
+	c.enqueueMachine(obj, "handling machine obj ADD event")
 }
 
 func (c *controller) updateMachine(oldObj, newObj interface{}) {
-	klog.V(5).Info("Updating machine object")
-	c.enqueueMachine(newObj)
+	oldMachine := oldObj.(*v1alpha1.Machine)
+	newMachine := newObj.(*v1alpha1.Machine)
+
+	if oldMachine == nil || newMachine == nil {
+		klog.Errorf("couldn't convert to machine resource from object")
+		return
+	}
+
+	if oldMachine.Generation == newMachine.Generation {
+		klog.V(3).Infof("Skipping non-spec updates for machine %s", oldMachine.Name)
+		return
+	}
+
+	c.enqueueMachine(newObj, "handling machine object UPDATE event")
 }
 
 func (c *controller) deleteMachine(obj interface{}) {
-	klog.V(5).Info("Deleting machine object")
-	c.enqueueMachine(obj)
+	c.enqueueMachine(obj, "handling machine object DELETE event")
 }
 
-// isToBeEnqueued returns true if the key is to be managed by this controller
-func (c *controller) isToBeEnqueued(obj interface{}) (bool, string) {
+// getKeyForObj returns the key for the given object, or false if the key could not be derived
+func (c *controller) getKeyForObj(obj interface{}) (string, bool) {
 	key, err := cache.MetaNamespaceKeyFunc(obj)
 	if err != nil {
 		klog.Errorf("Couldn't get key for object %+v: %v", obj, err)
-		return false, ""
+		return "", false
 	}
-
-	return true, key
+	return key, true
 }
 
-func (c *controller) enqueueMachine(obj interface{}) {
-	if toBeEnqueued, key := c.isToBeEnqueued(obj); toBeEnqueued {
-		klog.V(5).Infof("Adding machine object to the queue %q", key)
+func (c *controller) enqueueMachine(obj interface{}, reason string) {
+	if key, ok := c.getKeyForObj(obj); ok {
+		klog.V(3).Infof("Adding machine object to queue %q, reason: %s", key, reason)
 		c.machineQueue.Add(key)
 	}
 }
 
-func (c *controller) enqueueMachineAfter(obj interface{}, after time.Duration) {
-	if toBeEnqueued, key := c.isToBeEnqueued(obj); toBeEnqueued {
-		klog.V(5).Infof("Adding machine object to the queue %q after %s", key, after)
+func (c *controller) enqueueMachineAfter(obj interface{}, after time.Duration, reason string) {
+	if key, ok := c.getKeyForObj(obj); ok {
+		klog.V(3).Infof("Adding machine object to queue %q after %s, reason: %s", key, after, reason)
 		c.machineQueue.AddAfter(key, after)
 	}
 }
@@ -105,16 +114,19 @@ func (c *controller) reconcileClusterMachineKey(key string) error {
 	}
 
 	retryPeriod, err := c.reconcileClusterMachine(ctx, machine)
-	klog.V(5).Info(err, retryPeriod)
-	c.enqueueMachineAfter(machine, time.Duration(retryPeriod))
+	var reEnqueReason = "periodic reconcile"
+	if err != nil {
+		reEnqueReason = err.Error()
+	}
+	c.enqueueMachineAfter(machine, time.Duration(retryPeriod), reEnqueReason)
 
 	return nil
 }
 
 func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alpha1.Machine) (machineutils.RetryPeriod, error) {
-	klog.V(5).Infof("Start Reconciling machine: %q , nodeName: %q ,providerID: %q", machine.Name, getNodeName(machine), getProviderID(machine))
-	defer klog.V(5).Infof("Stop Reconciling machine %q, nodeName: %q ,providerID: %q", machine.Name, getNodeName(machine), getProviderID(machine))
+	klog.V(2).Infof("reconcileClusterMachine: Start for %q with phase:%q, description:%q", machine.Name, machine.Status.CurrentStatus.Phase, machine.Status.LastOperation.Description)
+	defer klog.V(2).Infof("reconcileClusterMachine: Stop for %q", machine.Name)
 
 	if c.safetyOptions.MachineControllerFrozen && machine.DeletionTimestamp == nil {
 		// If Machine controller is frozen and
@@ -132,7 +144,7 @@ func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alp
 
 	validationerr := validation.ValidateMachine(internalMachine)
 	if validationerr.ToAggregate() != nil && len(validationerr.ToAggregate().Errors()) > 0 {
-		err := fmt.Errorf("Validation of Machine failed %s", validationerr.ToAggregate().Error())
+		err := fmt.Errorf("validation of Machine failed %s", validationerr.ToAggregate().Error())
 		klog.Error(err)
 		return machineutils.LongRetry, err
 	}
@@ -140,7 +152,7 @@ func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alp
 	// Validate MachineClass
 	machineClass, secretData, retry, err := c.ValidateMachineClass(ctx, &machine.Spec.Class)
 	if err != nil {
-		klog.Error(err)
+		klog.Errorf("cannot reconcile machine %s: %s", machine.Name, err)
 		return retry, err
 	}
 
@@ -193,8 +205,8 @@ SECTION
 Machine controller - nodeToMachine
 */
 
 var (
-	errMultipleMachineMatch = errors.New("Multiple machines matching node")
-	errNoMachineMatch       = errors.New("No machines matching node found")
+	errMultipleMachineMatch = errors.New("multiple machines matching node")
+	errNoMachineMatch       = errors.New("no machines matching node found")
 )
 
 func (c *controller) addNodeToMachine(obj interface{}) {
@@ -226,31 +238,34 @@ func (c *controller) addNodeToMachine(obj interface{}) {
 		return
 	}
 
-	if machine.Status.CurrentStatus.Phase != v1alpha1.MachineCrashLoopBackOff && nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) {
-		klog.V(5).Infof("Enqueue machine object %q as conditions of backing node %q have changed", machine.Name, getNodeName(machine))
-		c.enqueueMachine(machine)
+	isMachineCrashLooping := machine.Status.CurrentStatus.Phase == v1alpha1.MachineCrashLoopBackOff
+	isMachineTerminating := machine.Status.CurrentStatus.Phase == v1alpha1.MachineTerminating
+	_, _, nodeConditionsHaveChanged := nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions)
+
+	if !isMachineCrashLooping && !isMachineTerminating && nodeConditionsHaveChanged {
+		c.enqueueMachine(machine, fmt.Sprintf("handling node UPDATE event. conditions of backing node %q have changed", getNodeName(machine)))
 	}
 }
 
 func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) {
+	oldNode := oldObj.(*corev1.Node)
 	node := newObj.(*corev1.Node)
 	if node == nil {
 		klog.Errorf("Couldn't convert to node from object")
 		return
 	}
 
+	machine, err := c.getMachineFromNode(node.Name)
+	if err != nil {
+		klog.Errorf("Unable to handle update event for node %s, couldn't fetch machine, Error: %s", node.Name, err)
+		return
+	}
+
 	// check for the TriggerDeletionByMCM annotation on the node object
 	// if it is present then mark the machine object for deletion
 	if value, ok := node.Annotations[machineutils.TriggerDeletionByMCM]; ok && value == "true" {
-
-		machine, err := c.getMachineFromNode(node.Name)
-		if err != nil {
-			klog.Errorf("Couldn't fetch machine %s, Error: %s", machine.Name, err)
-			return
-		}
-
 		if machine.DeletionTimestamp == nil {
-			klog.Infof("Node %s is annotated to trigger deletion by MCM.", node.Name)
+			klog.Infof("Node %s for machine %s is annotated to trigger deletion by MCM.", node.Name, machine.Name)
 			if err := c.controlMachineClient.Machines(c.namespace).Delete(context.Background(), machine.Name, metav1.DeleteOptions{}); err != nil {
 				klog.Errorf("Machine object %s backing the node %s could not be marked for deletion.", machine.Name, node.Name)
 				return
@@ -259,6 +274,12 @@ func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) {
 		}
 	}
 
+	// reconcile on addition/removal of essential taints in the machine lifecycle, for example the critical component taint
+	if addedOrRemovedEssentialTaints(oldNode, node, machineutils.EssentialTaints) {
+		c.enqueueMachine(machine, fmt.Sprintf("handling node UPDATE event. at least one of the essential taints on backing node %q has changed", getNodeName(machine)))
+		return
+	}
+
 	c.addNodeToMachine(newObj)
 }
 
@@ -275,7 +296,10 @@ func (c *controller) deleteNodeToMachine(obj interface{}) {
 		return
 	}
 
-	c.enqueueMachine(machine)
+	// do not respond if machine obj is already in termination flow
+	if machine.DeletionTimestamp == nil {
+		c.enqueueMachine(machine, fmt.Sprintf("backing node obj %q got deleted", key))
+	}
 }
 
 /*
@@ -302,9 +326,32 @@ func (c *controller) getMachineFromNode(nodeName string) (*v1alpha1.Machine, err
 	return machines[0], nil
 }
 
+func addedOrRemovedEssentialTaints(oldNode, node *corev1.Node, taintKeys []string) bool {
+	mapOldNodeTaintKeys := make(map[string]bool)
+	mapNodeTaintKeys := make(map[string]bool)
+
+	for _, t := range oldNode.Spec.Taints {
+		mapOldNodeTaintKeys[t.Key] = true
+	}
+
+	for _, t := range node.Spec.Taints {
+		mapNodeTaintKeys[t.Key] = true
+	}
+
+	for _, tk := range taintKeys {
+		_, oldNodeHasTaint := mapOldNodeTaintKeys[tk]
+		_, newNodeHasTaint := mapNodeTaintKeys[tk]
+		if oldNodeHasTaint != newNodeHasTaint {
+			klog.V(2).Infof("Taint with key %q has been added/removed from the node %q", tk, node.Name)
+			return true
+		}
+	}
+	return false
+}
+
 /*
 	SECTION
-	Machine operations - Create, Update, Delete
+	Machine operations - Create, Delete
 */
 
 func (c *controller) triggerCreationFlow(ctx context.Context, createMachineRequest *driver.CreateMachineRequest) (machineutils.RetryPeriod, error) {
@@ -397,7 +444,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque
 			}
 
 			// machine obj marked Failed for double security
-			c.machineStatusUpdate(
+			updateRetryPeriod, updateErr := c.machineStatusUpdate(
 				ctx,
 				machine,
 				v1alpha1.LastOperation{
@@ -413,6 +460,10 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque
 				machine.Status.LastKnownState,
 			)
 
+			if updateErr != nil {
+				return updateRetryPeriod, updateErr
+			}
+
 			klog.V(2).Infof("Machine %q marked Failed as VM was referring to a stale node object", machine.Name)
 			return machineutils.ShortRetry, err
 		}
@@ -424,7 +475,7 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque
 		case codes.Unknown, codes.DeadlineExceeded, codes.Aborted, codes.Unavailable:
 			// GetMachineStatus() returned with one of the above error codes.
 			// Retry operation.
-			c.machineStatusUpdate(
+			updateRetryPeriod, updateErr := c.machineStatusUpdate(
 				ctx,
 				machine,
 				v1alpha1.LastOperation{
@@ -439,11 +490,14 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque
 				},
 				machine.Status.LastKnownState,
 			)
+			if updateErr != nil {
+				return updateRetryPeriod, updateErr
+			}
 
 			return machineutils.ShortRetry, err
 
 		default:
-			c.machineStatusUpdate(
+			updateRetryPeriod, updateErr := c.machineStatusUpdate(
 				ctx,
 				machine,
 				v1alpha1.LastOperation{
@@ -458,6 +512,9 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque
 				},
 				machine.Status.LastKnownState,
 			)
+			if updateErr != nil {
+				return updateRetryPeriod, updateErr
+			}
 
 			return machineutils.MediumRetry, err
 		}
@@ -528,44 +585,6 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque
 	return machineutils.LongRetry, nil
 }
 
-func (c *controller) triggerUpdationFlow(ctx context.Context, machine *v1alpha1.Machine, actualProviderID string) (machineutils.RetryPeriod, error) {
-	klog.V(2).Infof("Setting ProviderID of machine %s with backing node %s to %s", machine.Name, getNodeName(machine), actualProviderID)
-
-	for {
-		machine, err := c.controlMachineClient.Machines(machine.Namespace).Get(ctx, machine.Name, metav1.GetOptions{})
-		if err != nil {
-			klog.Warningf("Machine GET failed. Retrying, error: %s", err)
-			continue
-		}
-
-		clone := machine.DeepCopy()
-		clone.Spec.ProviderID = actualProviderID
-		machine, err = c.controlMachineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{})
-		if err != nil {
-			klog.Warningf("Machine UPDATE failed. Retrying, error: %s", err)
-			continue
-		}
-
-		clone = machine.DeepCopy()
-		lastOperation := v1alpha1.LastOperation{
-			Description:    "Updated provider ID",
-			State:          v1alpha1.MachineStateSuccessful,
-			Type:           v1alpha1.MachineOperationUpdate,
-			LastUpdateTime: metav1.Now(),
-		}
-		clone.Status.LastOperation = lastOperation
-		_, err = c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(ctx, clone, metav1.UpdateOptions{})
-		if err != nil {
-			klog.Warningf("Machine/status UPDATE failed. Retrying, error: %s", err)
Retrying, error: %s", err) - continue - } - // Update went through, exit out of infinite loop - break - } - - return machineutils.LongRetry, nil -} - func (c *controller) triggerDeletionFlow(ctx context.Context, deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.RetryPeriod, error) { var ( machine = deleteMachineRequest.Machine diff --git a/pkg/util/provider/machinecontroller/machine_safety.go b/pkg/util/provider/machinecontroller/machine_safety.go index 23ce2e132..44b99d286 100644 --- a/pkg/util/provider/machinecontroller/machine_safety.go +++ b/pkg/util/provider/machinecontroller/machine_safety.go @@ -107,8 +107,9 @@ func (c *controller) reconcileClusterMachineSafetyAPIServer(key string) error { klog.V(2).Infof("SafetyController: Reinitializing machine health check for machine: %q with backing node: %q and providerID: %q", machine.Name, getNodeName(machine), getProviderID(machine)) } - // En-queue after 30 seconds, to ensure all machine states are reconciled - c.enqueueMachineAfter(machine, 30*time.Second) + // En-queue after 30 seconds, to ensure all machine phases are reconciled to their actual value + // as they are currently reset to `Running` + c.enqueueMachineAfter(machine, 30*time.Second, "kube-api-servers are up again, so reconcile of machine phase is needed") } c.safetyOptions.MachineControllerFrozen = false diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 98707bc5d..9bfd4bf81 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -781,7 +781,7 @@ var _ = Describe("machine", func() { ErrorCode: codes.Internal.String(), }, }, nil, nil, nil, true, metav1.Now()), - err: nil, + err: status.Error(codes.Internal, "Provider is returning error on create call"), retry: machineutils.MediumRetry, }, }), @@ -833,7 +833,7 @@ var _ = Describe("machine", func() { ErrorCode: codes.ResourceExhausted.String(), }, }, nil, nil, nil, true, metav1.Now()), - err: nil, + err: status.Error(codes.ResourceExhausted, "Provider does not have capacity to create VM"), retry: machineutils.MediumRetry, }, }), @@ -885,7 +885,7 @@ var _ = Describe("machine", func() { ErrorCode: codes.Internal.String(), }, }, nil, nil, nil, true, metav1.NewTime(metav1.Now().Add(-time.Hour))), - err: nil, + err: status.Error(codes.Internal, "Provider is returning error on create call"), retry: machineutils.MediumRetry, }, }), @@ -2794,7 +2794,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf("Machine deletion in process. Deletion of node object was succesful"), + err: fmt.Errorf("Machine deletion in process. Deletion of node object was successful"), retry: machineutils.ShortRetry, nodeDeleted: true, machine: newMachine( diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index a9b75e750..67919c5c8 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -60,6 +60,10 @@ import ( // emptyMap is a dummy emptyMap to compare with var emptyMap = make(map[string]string) +var ( + errSuccessfulALTsync = errors.New("machine ALTs have been reconciled") + errSuccessfulPhaseUpdate = errors.New("machine creation is successful. 
Machine Phase/Conditions have been UPDATED") +) const ( maxReplacements = 1 @@ -68,43 +72,6 @@ const ( cacheUpdateTimeout = 1 * time.Second ) -// TODO: use client library instead when it starts to support update retries -// -// see https://github.com/kubernetes/kubernetes/issues/21479 -// type updateMachineFunc func(machine *v1alpha1.Machine) error - -/* -// UpdateMachineWithRetries updates a machine with given applyUpdate function. Note that machine not found error is ignored. -// The returned bool value can be used to tell if the machine is actually updated. -func UpdateMachineWithRetries(machineClient v1alpha1client.MachineInterface, machineLister v1alpha1listers.MachineLister, namespace, name string, applyUpdate updateMachineFunc) (*v1alpha1.Machine, error) { - var machine *v1alpha1.Machine - - retryErr := retry.RetryOnConflict(retry.DefaultBackoff, func() error { - var err error - machine, err = machineLister.Machines(namespace).Get(name) - if err != nil { - return err - } - machine = machine.DeepCopy() - // Apply the update, then attempt to push it to the apiserver. - if applyErr := applyUpdate(machine); applyErr != nil { - return applyErr - } - machine, err = machineClient.Update(machine) - return err - }) - - // Ignore the precondition violated error, this machine is already updated - // with the desired label. - if retryErr == errorsutil.ErrPreconditionViolated { - klog.V(4).Infof("Machine %s precondition doesn't hold, skip updating it.", name) - retryErr = nil - } - - return machine, retryErr -} -*/ - // ValidateMachineClass validates the machine class. func (c *controller) ValidateMachineClass(_ context.Context, classSpec *v1alpha1.ClassSpec) (*v1alpha1.MachineClass, map[string][]byte, machineutils.RetryPeriod, error) { var ( @@ -137,7 +104,6 @@ func (c *controller) ValidateMachineClass(_ context.Context, classSpec *v1alpha1 errMessage := fmt.Sprintf("The machine class %s has no finalizers set. 
So not reconciling the machine.", machineClass.Name) err := errors.New(errMessage) - klog.Warning(errMessage) return nil, nil, machineutils.ShortRetry, err } @@ -219,19 +185,38 @@ func (c *controller) getSecret(ref *v1.SecretReference, MachineClassName string) return secretRef, err } -// nodeConditionsHaveChanged compares two node statuses to see if any of the statuses have changed -func nodeConditionsHaveChanged(machineConditions []v1.NodeCondition, nodeConditions []v1.NodeCondition) bool { - if len(machineConditions) != len(nodeConditions) { - return true +// nodeConditionsHaveChanged compares two node status.conditions to see if any of the statuses have changed +func nodeConditionsHaveChanged(oldConditions []v1.NodeCondition, newConditions []v1.NodeCondition) ([]v1.NodeCondition, []v1.NodeCondition, bool) { + var ( + oldConditionsByType = make(map[v1.NodeConditionType]v1.NodeCondition, len(oldConditions)) + newConditionsByType = make(map[v1.NodeConditionType]v1.NodeCondition, len(newConditions)) + addedOrUpdatedConditions = make([]v1.NodeCondition, 0, len(newConditions)) + removedConditions = make([]v1.NodeCondition, 0, len(oldConditions)) + ) + + for _, c := range oldConditions { + oldConditionsByType[c.Type] = c + } + for _, c := range newConditions { + newConditionsByType[c.Type] = c } - for i := range nodeConditions { - if nodeConditions[i].Status != machineConditions[i].Status { - return true + // checking for any added/updated new condition + for _, c := range newConditions { + oldC, exists := oldConditionsByType[c.Type] + if !exists || (oldC.Status != c.Status) { + addedOrUpdatedConditions = append(addedOrUpdatedConditions, c) } } - return false + // checking for any deleted condition + for _, c := range oldConditions { + if _, exists := newConditionsByType[c.Type]; !exists { + removedConditions = append(removedConditions, c) + } + } + + return addedOrUpdatedConditions, removedConditions, len(addedOrUpdatedConditions) != 0 || len(removedConditions) != 0 } func mergeDataMaps(in map[string][]byte, maps ...map[string][]byte) map[string][]byte { @@ -314,10 +299,13 @@ func (c *controller) syncMachineNodeTemplates(ctx context.Context, machine *v1al // Keep retrying until update goes through klog.Errorf("Updated failed for node object of machine %q. 
Retrying, error: %q", machine.Name, err) } else { - // Return error even when machine object is updated - err = fmt.Errorf("Machine ALTs have been reconciled") + // Return error to continue in next reconcile + err = errSuccessfulALTsync } + if apierrors.IsConflict(err) { + return machineutils.ConflictRetry, err + } return machineutils.ShortRetry, err } @@ -494,7 +482,7 @@ func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1a lastKnownState = createMachineResponse.LastKnownState } - c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -511,7 +499,11 @@ func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1a lastKnownState, ) - return retryRequired, nil + if updateErr != nil { + return updateRetryPeriod, updateErr + } + + return retryRequired, err } func (c *controller) machineStatusUpdate( @@ -520,7 +512,7 @@ func (c *controller) machineStatusUpdate( lastOperation v1alpha1.LastOperation, currentStatus v1alpha1.CurrentStatus, lastKnownState string, -) error { +) (machineutils.RetryPeriod, error) { clone := machine.DeepCopy() clone.Status.LastOperation = lastOperation clone.Status.CurrentStatus = currentStatus @@ -528,7 +520,7 @@ func (c *controller) machineStatusUpdate( if isMachineStatusSimilar(clone.Status, machine.Status) { klog.V(3).Infof("Not updating the status of the machine object %q, as the content is similar", clone.Name) - return nil + return machineutils.ShortRetry, nil } _, err := c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(ctx, clone, metav1.UpdateOptions{}) @@ -539,7 +531,11 @@ func (c *controller) machineStatusUpdate( klog.V(2).Infof("Machine/status UPDATE for %q", machine.Name) } - return err + if apierrors.IsConflict(err) { + return machineutils.ConflictRetry, err + } + + return machineutils.ShortRetry, err } // isMachineStatusSimilar checks if the status of 2 machines is similar or not. @@ -574,7 +570,7 @@ func (c *controller) getCreateFailurePhase(machine *v1alpha1.Machine) v1alpha1.M if timeOut > 0 { // Machine creation timeout occured while joining of machine // Machine set controller would replace this machine with a new one as phase is failed. - klog.V(2).Infof("Machine %q , providerID %q and backing node %q couldn't join in creation timeout of %s. Changing phase to failed.", machine.Name, getProviderID(machine), getNodeName(machine), timeOutDuration) + klog.V(2).Infof("Machine %q , providerID %q and backing node %q couldn't join in creation timeout of %s. Changing phase to Failed.", machine.Name, getProviderID(machine), getNodeName(machine), timeOutDuration) return v1alpha1.MachineFailed } @@ -601,14 +597,13 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph // and corresponding node object went missing // and if machine object still reports healthy description = fmt.Sprintf( - "Node object went missing. Machine %s is unhealthy - changing MachineState to Unknown", + "Node object went missing. 
Machine %s is unhealthy - changing MachinePhase to Unknown", machine.Name, ) klog.Warning(description) clone.Status.CurrentStatus = v1alpha1.CurrentStatus{ - Phase: v1alpha1.MachineUnknown, - // TimeoutActive: true, + Phase: v1alpha1.MachineUnknown, LastUpdateTime: metav1.Now(), } clone.Status.LastOperation = v1alpha1.LastOperation{ @@ -625,9 +620,11 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph return machineutils.ShortRetry, err } } else { - if nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) { + populatedConditions, removedConditions, isChanged := nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) + if isChanged { clone.Status.Conditions = node.Status.Conditions - klog.V(3).Infof("Conditions of Machine %q with providerID %q and backing node %q are changing", machine.Name, getProviderID(machine), getNodeName(machine)) + + klog.V(3).Infof("Conditions of node %q backing machine %q with providerID %q have changed.\nAdded/Updated Conditions:\n\n%s\nRemoved Conditions:\n\n%s\n", getNodeName(machine), machine.Name, getProviderID(machine), getFormattedNodeConditions(populatedConditions), getFormattedNodeConditions(removedConditions)) cloneDirty = true } @@ -649,7 +646,7 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph description = fmt.Sprintf("Machine %s successfully re-joined the cluster", clone.Name) lastOperationType = v1alpha1.MachineOperationHealthCheck } - klog.V(2).Info(description) + klog.V(2).Infof("%s with backing node %q and providerID %q", description, getNodeName(clone), getProviderID(clone)) // Machine is ready and has joined/re-joined the cluster clone.Status.LastOperation = v1alpha1.LastOperation{ @@ -667,9 +664,9 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph } } else { if clone.Status.CurrentStatus.Phase == v1alpha1.MachineRunning { - // If machine is not healthy, and current state is running, - // change the machinePhase to unknown and activate health check timeout - description = fmt.Sprintf("Machine %s is unhealthy - changing MachineState to Unknown. Node conditions: %+v", clone.Name, clone.Status.Conditions) + // If machine is not healthy, and current phase is Running, + // change the machinePhase to Unknown and activate health check timeout + description = fmt.Sprintf("Machine %s is unhealthy - changing MachinePhase to Unknown. Node conditions: %+v", clone.Name, clone.Status.Conditions) klog.Warning(description) clone.Status.CurrentStatus = v1alpha1.CurrentStatus{ @@ -735,7 +732,7 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph } else { // Timeout occurred due to machine being unhealthy for too long description = fmt.Sprintf( - "Machine %s is not healthy since %s minutes. Changing status to failed. Node Conditions: %+v", + "Machine %s health checks failing since last %s minutes. Updating machine phase to Failed. 
Node Conditions: %+v", machine.Name, timeOutDuration, machine.Status.Conditions, @@ -749,19 +746,23 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph } else { // If timeout has not occurred, re-enqueue the machine // after a specified sleep time - c.enqueueMachineAfter(machine, sleepTime) + klog.V(4).Infof("Creation/Health Timeout hasn't occured yet , will re-enqueue after %s", time.Duration(sleepTime)) + c.enqueueMachineAfter(machine, sleepTime, "re-check for creation/health timeout") } } if cloneDirty { _, err = c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(ctx, clone, metav1.UpdateOptions{}) if err != nil { - // Keep retrying until update goes through - klog.Errorf("Update failed for machine %q. Retrying, error: %q", machine.Name, err) + // Keep retrying across reconciles until update goes through + klog.Errorf("Update of Phase/Conditions failed for machine %q. Retrying, error: %q", machine.Name, err) + if apierrors.IsConflict(err) { + return machineutils.ConflictRetry, err + } } else { - klog.V(2).Infof("Machine State has been updated for %q with providerID %q and backing node %q", machine.Name, getProviderID(machine), getNodeName(machine)) - // Return error for continuing in next iteration - err = fmt.Errorf("machine creation is successful. Machine State has been UPDATED") + klog.V(2).Infof("Machine Phase/Conditions have been updated for %q with providerID %q and are in sync with backing node %q", machine.Name, getProviderID(machine), getNodeName(machine)) + // Return error to end the reconcile + err = errSuccessfulPhaseUpdate } return machineutils.ShortRetry, err @@ -770,6 +771,18 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph return machineutils.LongRetry, nil } +func getFormattedNodeConditions(conditions []v1.NodeCondition) string { + var result string + if len(conditions) == 0 { + return "\n" + } + + for _, c := range conditions { + result = fmt.Sprintf("%sType: %s | Status: %s | Reason: %s | Message: %s\n", result, c.Type, c.Status, c.Reason, c.Message) + } + return result +} + /* SECTION Manipulate Finalizers @@ -856,7 +869,11 @@ func criticalComponentsNotReadyTaintPresent(node *v1.Node) bool { } func isPendingMachineWithCriticalComponentsNotReadyTaint(clone *v1alpha1.Machine, node *v1.Node) bool { - return clone.Status.CurrentStatus.Phase == v1alpha1.MachinePending && criticalComponentsNotReadyTaintPresent(node) + if clone.Status.CurrentStatus.Phase == v1alpha1.MachinePending && criticalComponentsNotReadyTaintPresent(node) { + klog.V(3).Infof("Critical component taint %q still present on node %q for machine %q", machineutils.TaintNodeCriticalComponentsNotReady, getNodeName(clone), clone.Name) + return true + } + return false } /* @@ -888,6 +905,10 @@ func (c *controller) setMachineTerminationStatus(ctx context.Context, deleteMach // Return error even when machine object is updated to ensure reconcilation is restarted err = fmt.Errorf("Machine deletion in process. 
Phase set to termination") } + + if apierrors.IsConflict(err) { + return machineutils.ConflictRetry, err + } return machineutils.ShortRetry, err } @@ -948,7 +969,7 @@ func (c *controller) getVMStatus(ctx context.Context, getMachineStatusRequest *d } } - c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, getMachineStatusRequest.Machine, v1alpha1.LastOperation{ @@ -964,6 +985,10 @@ func (c *controller) getVMStatus(ctx context.Context, getMachineStatusRequest *d getMachineStatusRequest.Machine.Status.LastKnownState, ) + if updateErr != nil { + return updateRetryPeriod, updateErr + } + return retry, err } @@ -1076,7 +1101,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver ) } - // update node with the machine's state prior to termination + // update node with the machine's phase prior to termination if err = c.UpdateNodeTerminationCondition(ctx, machine); err != nil { if forceDeleteMachine { klog.Warningf("Failed to update node conditions: %v. However, since it's a force deletion shall continue deletion of VM.", err) @@ -1147,7 +1172,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver } } - c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1163,6 +1188,10 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver machine.Status.LastKnownState, ) + if updateErr != nil { + return updateRetryPeriod, updateErr + } + return machineutils.ShortRetry, err } @@ -1211,7 +1240,7 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine } now := metav1.Now() klog.V(4).Infof("(deleteVolumeAttachmentsForNode) For node %q, machine %q, set LastOperation.Description: %q", nodeName, machine.Name, description) - err = c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1223,6 +1252,11 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine machine.Status.CurrentStatus, machine.Status.LastKnownState, ) + + if updateErr != nil { + return updateRetryPeriod, updateErr + } + return retryPeriod, err } @@ -1274,7 +1308,7 @@ func (c *controller) deleteVM(ctx context.Context, deleteMachineRequest *driver. lastKnownState = deleteMachineResponse.LastKnownState } - c.machineStatusUpdate( + updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -1290,6 +1324,10 @@ func (c *controller) deleteVM(ctx context.Context, deleteMachineRequest *driver. lastKnownState, ) + if updateErr != nil { + return updateRetryPeriod, updateErr + } + return retryRequired, err } @@ -1307,14 +1345,14 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac // Delete node object err = c.targetCoreClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{}) if err != nil && !apierrors.IsNotFound(err) { - // If its an error, and anyother error than object not found + // If its an error, and any other error than object not found description = fmt.Sprintf("Deletion of Node Object %q failed due to error: %s. %s", nodeName, err, machineutils.InitiateNodeDeletion) state = v1alpha1.MachineStateFailed } else if err == nil { description = fmt.Sprintf("Deletion of Node Object %q is successful. %s", nodeName, machineutils.InitiateFinalizerRemoval) state = v1alpha1.MachineStateProcessing - err = fmt.Errorf("Machine deletion in process. 
+		err = fmt.Errorf("Machine deletion in process. Deletion of node object was successful")
 	} else {
 		description = fmt.Sprintf("No node object found for %q, continuing deletion flow. %s", nodeName, machineutils.InitiateFinalizerRemoval)
 		state = v1alpha1.MachineStateProcessing
@@ -1326,7 +1364,7 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac
 		err = fmt.Errorf("Machine deletion in process. No node object found")
 	}
 
-	c.machineStatusUpdate(
+	updateRetryPeriod, updateErr := c.machineStatusUpdate(
 		ctx,
 		machine,
 		v1alpha1.LastOperation{
@@ -1342,6 +1380,10 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac
 		machine.Status.LastKnownState,
 	)
 
+	if updateErr != nil {
+		return updateRetryPeriod, updateErr
+	}
+
 	return machineutils.ShortRetry, err
 }
diff --git a/pkg/util/provider/machinecontroller/machine_util_test.go b/pkg/util/provider/machinecontroller/machine_util_test.go
index cadd5b550..2e83bfed2 100644
--- a/pkg/util/provider/machinecontroller/machine_util_test.go
+++ b/pkg/util/provider/machinecontroller/machine_util_test.go
@@ -196,7 +196,7 @@ var _ = Describe("machine_util", func() {
 						},
 					},
 				},
-				err: fmt.Errorf("Machine ALTs have been reconciled"),
+				err: errSuccessfulALTsync,
 			},
 		}),
@@ -276,7 +276,7 @@ var _ = Describe("machine_util", func() {
 						},
 					},
 				},
-				err: fmt.Errorf("Machine ALTs have been reconciled"),
+				err: errSuccessfulALTsync,
 			},
 		}),
@@ -422,7 +422,7 @@ var _ = Describe("machine_util", func() {
 						},
 					},
 				},
-				err: fmt.Errorf("Machine ALTs have been reconciled"),
+				err: errSuccessfulALTsync,
 			},
 		}),
 	)
@@ -2177,7 +2177,7 @@ var _ = Describe("machine_util", func() {
 				},
 				expect: expect{
 					retryPeriod:   machineutils.ShortRetry,
-					err:           errors.New("machine creation is successful. Machine State has been UPDATED"),
+					err:           errSuccessfulPhaseUpdate,
 					expectedPhase: v1alpha1.MachineFailed,
 				},
 			}),
@@ -2196,7 +2196,7 @@ var _ = Describe("machine_util", func() {
 				},
 				expect: expect{
 					retryPeriod:   machineutils.ShortRetry,
-					err:           errors.New("machine creation is successful. Machine State has been UPDATED"),
+					err:           errSuccessfulPhaseUpdate,
 					expectedPhase: v1alpha1.MachineRunning,
 				},
 			}),
@@ -2215,7 +2215,7 @@ var _ = Describe("machine_util", func() {
 				},
 				expect: expect{
 					retryPeriod:   machineutils.ShortRetry,
-					err:           errors.New("machine creation is successful. Machine State has been UPDATED"),
+					err:           errSuccessfulPhaseUpdate,
 					expectedPhase: v1alpha1.MachineUnknown,
 				},
 			}),
@@ -2234,7 +2234,7 @@ var _ = Describe("machine_util", func() {
 				},
 				expect: expect{
 					retryPeriod:   machineutils.ShortRetry,
-					err:           errors.New("machine creation is successful. Machine State has been UPDATED"),
+					err:           errSuccessfulPhaseUpdate,
 					expectedPhase: v1alpha1.MachineUnknown,
 				},
 			}),
@@ -2250,7 +2250,7 @@ var _ = Describe("machine_util", func() {
 				},
 				expect: expect{
 					retryPeriod:   machineutils.ShortRetry,
-					err:           errors.New("machine creation is successful. Machine State has been UPDATED"),
+					err:           errSuccessfulPhaseUpdate,
 					expectedPhase: v1alpha1.MachineUnknown,
 				},
 			}),
@@ -2363,7 +2363,7 @@ var _ = Describe("machine_util", func() {
 				},
 				expect: expect{
 					retryPeriod:   machineutils.ShortRetry,
-					err:           errors.New("machine creation is successful. Machine State has been UPDATED"),
+					err:           errSuccessfulPhaseUpdate,
 					expectedPhase: v1alpha1.MachineRunning,
 				},
 			}),
@@ -2390,7 +2390,7 @@ var _ = Describe("machine_util", func() {
 				},
 				expect: expect{
 					retryPeriod:   machineutils.ShortRetry,
-					err:           errors.New("machine creation is successful. Machine State has been UPDATED"),
+					err:           errSuccessfulPhaseUpdate,
 					expectedPhase: v1alpha1.MachineRunning,
 				},
 			}),
@@ -2424,7 +2424,7 @@ var _ = Describe("machine_util", func() {
 				},
 				expect: expect{
 					retryPeriod:   machineutils.ShortRetry,
-					err:           errors.New("machine creation is successful. Machine State has been UPDATED"),
+					err:           errSuccessfulPhaseUpdate,
 					expectedPhase: v1alpha1.MachineRunning,
 				},
 			}),
diff --git a/pkg/util/provider/machinecontroller/machineclass.go b/pkg/util/provider/machinecontroller/machineclass.go
index 6ba849975..91a1ab460 100644
--- a/pkg/util/provider/machinecontroller/machineclass.go
+++ b/pkg/util/provider/machinecontroller/machineclass.go
@@ -45,36 +45,6 @@ func (c *controller) machineToMachineClassAdd(obj interface{}) {
 	}
 }
 
-func (c *controller) machineToMachineClassUpdate(oldObj, newObj interface{}) {
-	oldMachine, ok := oldObj.(*v1alpha1.Machine)
-	if oldMachine == nil || !ok {
-		klog.Warningf("Couldn't get machine from object: %+v", oldObj)
-		return
-	}
-	newMachine, ok := newObj.(*v1alpha1.Machine)
-	if newMachine == nil || !ok {
-		klog.Warningf("Couldn't get machine from object: %+v", newObj)
-		return
-	}
-
-	if oldMachine.Spec.Class.Kind == newMachine.Spec.Class.Kind {
-		if newMachine.Spec.Class.Kind == machineutils.MachineClassKind {
-			// Both old and new machine refer to the same machineClass object
-			// And the correct kind so enqueuing only one of them.
-			c.machineClassQueue.Add(newMachine.Spec.Class.Name)
-		}
-	} else {
-		// If both are pointing to different machineClasses
-		// we might have to enqueue both.
-		if oldMachine.Spec.Class.Kind == machineutils.MachineClassKind {
-			c.machineClassQueue.Add(oldMachine.Spec.Class.Name)
-		}
-		if newMachine.Spec.Class.Kind == machineutils.MachineClassKind {
-			c.machineClassQueue.Add(newMachine.Spec.Class.Name)
-		}
-	}
-}
-
 func (c *controller) machineToMachineClassDelete(obj interface{}) {
 	c.machineToMachineClassAdd(obj)
 }
@@ -126,12 +96,12 @@ func (c *controller) reconcileClusterMachineClassKey(key string) error {
 
 	err = c.reconcileClusterMachineClass(ctx, class)
 	if err != nil {
-		// Re-enqueue after a 10s window
-		c.enqueueMachineClassAfter(class, 10*time.Second)
+		// Re-enqueue after a ShortRetry window
+		c.enqueueMachineClassAfter(class, time.Duration(machineutils.ShortRetry))
 	} else {
 		// Re-enqueue periodically to avoid missing of events
 		// TODO: Get ride of this logic
-		c.enqueueMachineClassAfter(class, 10*time.Minute)
+		c.enqueueMachineClassAfter(class, time.Duration(machineutils.LongRetry))
 	}
 
 	return nil
@@ -159,7 +129,7 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1
 
 	if finalizers := sets.NewString(class.Finalizers...); !finalizers.Has(MCMFinalizerName) {
 		// Add machineClassFinalizer as if doesn't exist
-		err = c.addMachineClassFinalizers(ctx, class)
+		err = c.addMCMFinalizerToMachineClass(ctx, class)
 		if err != nil {
 			return err
 		}
@@ -167,7 +137,7 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1
 		// Enqueue all machines once finalizer is added to machineClass
 		// This is to allow processing of such machines
 		for _, machine := range machines {
-			c.enqueueMachine(machine)
+			c.enqueueMachine(machine, "finalizer placed on machineClass")
 		}
 	}
 
@@ -187,7 +157,7 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1
 
 	if finalizers := sets.NewString(class.Finalizers...); finalizers.Has(MCMFinalizerName) {
 		// Delete finalizer if exists on machineClass
-		return c.deleteMachineClassFinalizers(ctx, class)
+		return c.deleteMCMFinalizerFromMachineClass(ctx, class)
 	}
 
 	return nil
@@ -198,19 +168,19 @@ func (c *controller) reconcileClusterMachineClass(ctx context.Context, class *v1
 	Manipulate Finalizers
 */
 
-func (c *controller) addMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass) error {
+func (c *controller) addMCMFinalizerToMachineClass(ctx context.Context, class *v1alpha1.MachineClass) error {
 	finalizers := sets.NewString(class.Finalizers...)
 	finalizers.Insert(MCMFinalizerName)
 
-	return c.updateMachineClassFinalizers(ctx, class, finalizers.List())
+	return c.updateMachineClassFinalizers(ctx, class, finalizers.List(), true)
 }
 
-func (c *controller) deleteMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass) error {
+func (c *controller) deleteMCMFinalizerFromMachineClass(ctx context.Context, class *v1alpha1.MachineClass) error {
 	finalizers := sets.NewString(class.Finalizers...)
 	finalizers.Delete(MCMFinalizerName)
 
-	return c.updateMachineClassFinalizers(ctx, class, finalizers.List())
+	return c.updateMachineClassFinalizers(ctx, class, finalizers.List(), false)
 }
 
-func (c *controller) updateMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass, finalizers []string) error {
+func (c *controller) updateMachineClassFinalizers(ctx context.Context, class *v1alpha1.MachineClass, finalizers []string, addFinalizers bool) error {
 	// Get the latest version of the class so that we can avoid conflicts
 	class, err := c.controlMachineClient.MachineClasses(class.Namespace).Get(ctx, class.Name, metav1.GetOptions{})
 	if err != nil {
@@ -224,7 +194,11 @@ func (c *controller) updateMachineClassFinalizers(ctx context.Context, class *v1
 		klog.Warning("Updating machineClass failed, retrying. ", class.Name, err)
 		return err
 	}
-	klog.V(3).Infof("Successfully added/removed finalizer on the machineclass %q", class.Name)
+	if addFinalizers {
+		klog.V(3).Infof("Successfully added finalizer on the machineclass %q", class.Name)
+	} else {
+		klog.V(3).Infof("Successfully removed finalizer from the machineclass %q", class.Name)
+	}
 	return err
 }
diff --git a/pkg/util/provider/machinecontroller/machineclass_util.go b/pkg/util/provider/machinecontroller/machineclass_util.go
index fdd15ea15..7d2250faf 100644
--- a/pkg/util/provider/machinecontroller/machineclass_util.go
+++ b/pkg/util/provider/machinecontroller/machineclass_util.go
@@ -22,37 +22,6 @@ import (
 	"k8s.io/apimachinery/pkg/labels"
 )
 
-/*
-TODO: Move this code to MCM/MachineSet controller as well?
-func (c *controller) findMachineDeploymentsForClass(kind, name string) ([]*v1alpha1.MachineDeployment, error) {
-	machineDeployments, err := c.machineDeploymentLister.List(labels.Everything())
-	if err != nil {
-		return nil, err
-	}
-	var filtered []*v1alpha1.MachineDeployment
-	for _, machineDeployment := range machineDeployments {
-		if machineDeployment.Spec.Template.Spec.Class.Kind == kind && machineDeployment.Spec.Template.Spec.Class.Name == name {
-			filtered = append(filtered, machineDeployment)
-		}
-	}
-	return filtered, nil
-}
-
-func (c *controller) findMachineSetsForClass(kind, name string) ([]*v1alpha1.MachineSet, error) {
-	machineSets, err := c.machineSetLister.List(labels.Everything())
-	if err != nil {
-		return nil, err
-	}
-	var filtered []*v1alpha1.MachineSet
-	for _, machineSet := range machineSets {
-		if machineSet.Spec.Template.Spec.Class.Kind == kind && machineSet.Spec.Template.Spec.Class.Name == name {
-			filtered = append(filtered, machineSet)
-		}
-	}
-	return filtered, nil
-}
-*/
-
 func (c *controller) findMachinesForClass(kind, name string) ([]*v1alpha1.Machine, error) {
 	machines, err := c.machineLister.List(labels.Everything())
 	if err != nil {
diff --git a/pkg/util/provider/machinecontroller/secret.go b/pkg/util/provider/machinecontroller/secret.go
index b620ae0a4..6e072fb54 100644
--- a/pkg/util/provider/machinecontroller/secret.go
+++ b/pkg/util/provider/machinecontroller/secret.go
@@ -61,10 +61,10 @@ func (c *controller) reconcileClusterSecretKey(key string) error {
 
 func (c *controller) reconcileClusterSecret(ctx context.Context, secret *corev1.Secret) error {
 	startTime := time.Now()
-	klog.V(5).Infof("Start syncing %q", secret.Name)
+	klog.V(4).Infof("Start syncing %q", secret.Name)
 	defer func() {
 		c.enqueueSecretAfter(secret, 10*time.Minute)
-		klog.V(5).Infof("Finished syncing %q (%v)", secret.Name, time.Since(startTime))
+		klog.V(4).Infof("Finished syncing %q (%v)", secret.Name, time.Since(startTime))
 	}()
 
 	// Check if machineClasses are referring to this secret
@@ -80,10 +80,6 @@ func (c *controller) reconcileClusterSecret(ctx context.Context, secret *corev1.
 			return err
 		}
 	} else {
-		if finalizers := sets.NewString(secret.Finalizers...); !finalizers.Has(MCFinalizerName) {
-			// Finalizer doesn't exist, simply return nil
-			return nil
-		}
 		err = c.deleteSecretFinalizers(ctx, secret)
 		if err != nil {
 			return err
diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go
index 8bc546a0f..3d91e1a68 100644
--- a/pkg/util/provider/machineutils/utils.go
+++ b/pkg/util/provider/machineutils/utils.go
@@ -54,9 +54,6 @@ const (
 	// MachineClassKind is used to identify the machineClassKind for generic machineClasses
 	MachineClassKind = "MachineClass"
 
-	// MigratedMachineClass annotation helps in identifying machineClasses who have been migrated by migration controller
-	MigratedMachineClass = "machine.sapcloud.io/migrated"
-
 	// NotManagedByMCM annotation helps in identifying the nodes which are not handled by MCM
 	NotManagedByMCM = "node.machine.sapcloud.io/not-managed-by-mcm"
 
@@ -82,10 +79,16 @@ type RetryPeriod time.Duration
 
 // These are the valid values for RetryPeriod
 const (
+	// ConflictRetry tells the controller to retry quickly - 200 milliseconds
+	ConflictRetry RetryPeriod = RetryPeriod(200 * time.Millisecond)
-	// ShortRetry tells the controller to retry after a short duration - 15 seconds
-	ShortRetry RetryPeriod = RetryPeriod(15 * time.Second)
+	// ShortRetry tells the controller to retry after a short duration - 5 seconds
+	ShortRetry RetryPeriod = RetryPeriod(5 * time.Second)
 	// MediumRetry tells the controller to retry after a medium duration - 2 minutes
 	MediumRetry RetryPeriod = RetryPeriod(3 * time.Minute)
 	// LongRetry tells the controller to retry after a long duration - 10 minutes
 	LongRetry RetryPeriod = RetryPeriod(10 * time.Minute)
 )
+
+// EssentialTaints are taints on the node object which, if added/removed, require an immediate reconcile by the machine controller
+// TODO: update this when taints for ALT update and PostCreate operations are introduced.
+var EssentialTaints = []string{TaintNodeCriticalComponentsNotReady}