Merge pull request kubernetes-sigs#11147 from sbueringer/pr-fix-drain…

…-log 🐛 Fix drain log for unreachable Nodes
sbueringer · Sep 9, 2024 · 831e017 · 831e017
2 parents a6109b7 + ac46f9f
commit 831e017
Showing 1 changed file with 9 additions and 1 deletion.
diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go
@@ -686,7 +686,15 @@ func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster,
 		// Override the grace period of pods to reduce the time needed to skip them.
 		drainer.GracePeriodSeconds = 1
 
-		log.V(3).Info("Node is unreachable, draining will use 1s GracePeriodSeconds and will ignore all Pods that have a deletionTimestamp > 1s old. PDBs are still honored.")
+		// Our drain code still respects PDBs when evicting Pods, but that does not mean they are respected
+		// in general by the entire system.
+		// When a Node becomes unreachable the following happens:
+		// * node.kubernetes.io/unreachable:NoExecute taint is set on the Node
+		// * taint manager will evict Pods immediately because of the NoExecute taint (without respecting PDBs)
+		//   * https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/#concepts
+		//     "NoExecute": "Pods that do not tolerate the taint are evicted immediately"
+		// * our drain code will now ignore the Pods (as they quickly have a deletionTimestamp older than 2 seconds)
+		log.V(3).Info("Node is unreachable, draining will use 1s GracePeriodSeconds and will ignore all Pods that have a deletionTimestamp > 1s old")
 	}
 
 	if err := drainer.CordonNode(ctx, node); err != nil {