fix: Spiderpool GC incorrect IP address during sts Pod scale up/down, causing IP conflict

Signed-off-by: ty-dc <tao.yang@daocloud.io>
ty-dc committed Aug 9, 2024
1 parent 2a45620 commit cfe7059
Showing 10 changed files with 622 additions and 140 deletions.
4 changes: 4 additions & 0 deletions docs/usage/install/upgrade-zh_CN.md
@@ -149,6 +149,10 @@ kubectl patch sp ${auto-pool} --type merge --patch '{"metadata": {"labels": {"ip
In version 0.9.0 we added the `txQueueLen` field to the [SpiderCoordinator CRD](./../../reference/crd-spidercoordinator.md); because Helm cannot upgrade or delete CRDs during an upgrade, you must update the CRD manually before upgrading. (We recommend skipping 0.9.0 and upgrading directly to 0.9.1.)
### Notes on upgrading from 0.9.4 or below to a higher version
In versions up to and including 0.9.4, when a statefulSet application scales up and down rapidly, Spiderpool GC may mistakenly reclaim IP addresses from the IPPool, so the same IP can be assigned to multiple Pods in the K8S cluster, causing IP address conflicts. This issue has been fixed (see [fix](https://github.com/spidernet-io/spiderpool/pull/3778)), but after the upgrade the conflicting IP addresses are not corrected automatically by Spiderpool: you need to manually restart the Pods holding the conflicting IPs. With the fix in place, incorrect GC of IPs no longer causes IP conflicts.
### More notes on version upgrades
*TODO.*
4 changes: 4 additions & 0 deletions docs/usage/install/upgrade.md
@@ -149,6 +149,10 @@ In versions below 0.7.3, Spiderpool will enable a set of DaemonSet: `spiderpool-
Due to the addition of the `txQueueLen` field to the [SpiderCoordinator CRD](./../../reference/crd-spidercoordinator.md) in version 0.9.0, you need to manually update the CRD before upgrading, as Helm does not support upgrading or deleting CRDs during the upgrade process. (We suggest skipping version 0.9.0 and upgrading directly to version 0.9.1.)
### Upgrading from 0.9.4 or below to a higher version
In versions up to and including 0.9.4, when a statefulSet scales up or down rapidly, Spiderpool GC may mistakenly reclaim IP addresses from the IPPool, allowing the same IP to be assigned to multiple Pods in the K8S cluster and causing IP address conflicts. This issue has been fixed (see [fix](https://github.com/spidernet-io/spiderpool/pull/3778)), but after the upgrade the conflicting IP addresses are not corrected automatically by Spiderpool: you need to manually restart the Pods holding the conflicting IPs. With the fix in place, incorrect GC of IPs no longer causes IP conflicts.
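For reference, one way to spot and clear such a conflict (illustrative commands only, not part of this commit; the `custom-columns` layout and the placeholder names are assumptions):

```bash
# List every Pod IP in the cluster and print the duplicated ones.
kubectl get pods -A -o custom-columns=NS:.metadata.namespace,POD:.metadata.name,IP:.status.podIP --no-headers \
  | awk '{print $3}' | grep -v '<none>' | sort | uniq -d

# For each duplicated IP, restart one of the Pods holding it so
# Spiderpool allocates it a fresh, non-conflicting address.
kubectl delete pod <pod-name> -n <namespace>
```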
### More notes on version upgrades
*TODO.*
2 changes: 1 addition & 1 deletion go.mod
@@ -77,6 +77,7 @@ require k8s.io/component-base v0.29.4 // indirect

require (
github.com/hashicorp/go-multierror v1.1.1
k8s.io/kubectl v0.26.3
k8s.io/kubelet v0.29.2
tags.cncf.io/container-device-interface v0.6.2
tags.cncf.io/container-device-interface/specs-go v0.6.0
@@ -193,7 +194,6 @@ require (
gopkg.in/ini.v1 v1.67.0 // indirect
k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70 // indirect
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
k8s.io/kubectl v0.26.3 // indirect
kubevirt.io/containerized-data-importer-api v1.57.0-alpha1 // indirect
kubevirt.io/controller-lifecycle-operator-sdk/api v0.0.0-20220329064328-f3cc58c6ed90 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
5 changes: 5 additions & 0 deletions pkg/gcmanager/pod_cache.go
@@ -32,6 +32,7 @@ type PodEntry struct {
PodName string
Namespace string
NodeName string
UID string

EntryUpdateTime time.Time
TracingStartTime time.Time
@@ -169,10 +170,12 @@ func (s *SpiderGC) buildPodEntry(oldPod, currentPod *corev1.Pod, deleted bool) (

// deleted pod
if deleted {

podEntry := &PodEntry{
PodName: currentPod.Name,
Namespace: currentPod.Namespace,
NodeName: currentPod.Spec.NodeName,
UID: string(currentPod.UID),
EntryUpdateTime: metav1.Now().UTC(),
TracingStartTime: metav1.Now().UTC(),
TracingGracefulTime: time.Duration(s.gcConfig.AdditionalGraceDelay) * time.Second,
@@ -244,6 +247,7 @@ func (s *SpiderGC) buildPodEntry(oldPod, currentPod *corev1.Pod, deleted bool) (
Namespace: currentPod.Namespace,
NodeName: currentPod.Spec.NodeName,
EntryUpdateTime: metav1.Now().UTC(),
UID: string(currentPod.UID),
TracingStartTime: currentPod.DeletionTimestamp.Time,
PodTracingReason: podStatus,
}
@@ -263,6 +267,7 @@ func (s *SpiderGC) buildPodEntry(oldPod, currentPod *corev1.Pod, deleted bool) (
PodName: currentPod.Name,
Namespace: currentPod.Namespace,
NodeName: currentPod.Spec.NodeName,
UID: string(currentPod.UID),
EntryUpdateTime: metav1.Now().UTC(),
PodTracingReason: podStatus,
}
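Why record the UID: a StatefulSet replica recreated after a fast scale-down/up keeps its Pod name, so the name alone cannot tell the tracer which instance it is watching. A minimal, self-contained sketch of the comparison this field enables (the names `PodKey` and `sameInstance` are hypothetical, not from the codebase):

```go
package main

import "fmt"

// PodKey mirrors the identifying fields the GC tracker now records per Pod
// (a sketch; the real PodEntry in pkg/gcmanager/pod_cache.go carries more fields).
type PodKey struct {
	Namespace, Name, UID string
}

// sameInstance reports whether a live Pod is the very instance being traced.
// A recreated StatefulSet replica keeps its name but gets a fresh UID, so the
// UID comparison is what stops the GC from releasing the new replica's IP.
func sameInstance(traced, live PodKey) bool {
	return traced == live
}

func main() {
	traced := PodKey{Namespace: "default", Name: "web-0", UID: "uid-aaa"}
	recreated := PodKey{Namespace: "default", Name: "web-0", UID: "uid-bbb"} // same name, new UID
	fmt.Println(sameInstance(traced, recreated)) // false: do not GC based on name alone
}
```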
341 changes: 223 additions & 118 deletions pkg/gcmanager/scanAll_IPPool.go

Large diffs are not rendered by default.

41 changes: 38 additions & 3 deletions pkg/gcmanager/tracePod_worker.go
@@ -51,8 +51,31 @@ func (s *SpiderGC) handlePodEntryForTracingTimeOut(podEntry *PodEntry) {
return
} else {
if time.Now().UTC().After(podEntry.TracingStopTime) {
// If the statefulSet application was scaled down and back up quickly,
// check whether `Status.PodIPs` is empty to confirm that the current Pod in
// Kubernetes has already released its IP normally, so we do not release the wrong IP.
ctx := context.TODO()
currentPodYaml, err := s.podMgr.GetPodByName(ctx, podEntry.Namespace, podEntry.PodName, constant.UseCache)
if err != nil {
tracingReason := fmt.Sprintf("the graceful deletion period of pod '%s/%s' is over; checking its current status in Kubernetes", podEntry.Namespace, podEntry.PodName)
if apierrors.IsNotFound(err) {
logger.With(zap.Any("podEntry tracing-reason", tracingReason)).
Sugar().Debugf("pod '%s/%s' not found", podEntry.Namespace, podEntry.PodName)
} else {
logger.With(zap.Any("podEntry tracing-reason", tracingReason)).
Sugar().Errorf("failed to get pod '%s/%s', error: %v", podEntry.Namespace, podEntry.PodName, err)
// the pod will be handled next time.
return
}
} else {
if len(currentPodYaml.Status.PodIPs) == 0 {
logger.Sugar().Infof("The IP address of the Pod %v that has exceeded the grace period has been released through cmdDel, ignore it.", podEntry.PodName)
s.PodDB.DeletePodEntry(podEntry.Namespace, podEntry.PodName)
return
}
}

logger.With(zap.Any("podEntry tracing-reason", podEntry.PodTracingReason)).
Sugar().Infof("pod '%s/%s' is out of time, begin to gc IP", podEntry.Namespace, podEntry.PodName)
Sugar().Infof("the graceful deletion period of pod '%s/%s' is over, try to release the IP address.", podEntry.Namespace, podEntry.PodName)
} else {
// not time out
return
@@ -62,7 +85,6 @@ func (s *SpiderGC) handlePodEntryForTracingTimeOut(podEntry *PodEntry) {
select {
case s.gcIPPoolIPSignal <- podEntry:
logger.Sugar().Debugf("sending signal to gc pod '%s/%s' IP", podEntry.Namespace, podEntry.PodName)

case <-time.After(time.Duration(s.gcConfig.GCSignalTimeoutDuration) * time.Second):
logger.Sugar().Errorf("failed to gc IP, gcSignal:len=%d, event:'%s/%s' will be dropped", len(s.gcSignal), podEntry.Namespace, podEntry.PodName)
}
@@ -71,7 +93,7 @@ func (s *SpiderGC) handlePodEntryForTracingTimeOut(podEntry *PodEntry) {
// releaseIPPoolIPExecutor receive signals to execute gc IP
func (s *SpiderGC) releaseIPPoolIPExecutor(ctx context.Context, workerIndex int) {
log := logger.With(zap.Any("IPPoolIP_Worker", workerIndex))
log.Info("Starting running 'releaseIPPoolIPExecutor'")
log.Info("Start checking if IPPool.Status.AllocatedIPs and the endpoint need to be garbage collected ")

for {
select {
@@ -89,6 +111,19 @@ func (s *SpiderGC) releaseIPPoolIPExecutor(ctx context.Context, workerIndex int)
return err
}

// The Pod has the same name as the SpiderEndpoint, but their UIDs do not match.
// Such a SpiderEndpoint should be reclaimed, but because the name of the IPPool
// it uses cannot be tracked here, it will be reclaimed later via GC All.
if endpoint.Status.Current.UID != podCache.UID {
log.Sugar().Debugf("Pod name=%s/%s,UID=%s and SpiderEndpoint name=%s/%s,UID=%s share a name but have different UIDs; this stale SpiderEndpoint needs to be reclaimed.",
podCache.Namespace, podCache.PodName, podCache.UID, endpoint.Namespace, endpoint.Name, endpoint.Status.Current.UID)
log.Sugar().Warnf("Pod '%s' (UID=%s): the IPPool used by SpiderEndpoint '%s' (UID=%s) cannot be tracked, leaving it for GC All to process",
podCache.PodName, podCache.UID, endpoint.Name, endpoint.Status.Current.UID)

s.PodDB.DeletePodEntry(podCache.Namespace, podCache.PodName)
return nil
}

// we need to gather the pod corresponding SpiderEndpoint allocation data to get the used history IPs.
podUsedIPs := convert.GroupIPAllocationDetails(endpoint.Status.Current.UID, endpoint.Status.Current.IPs)
tickets := podUsedIPs.Pools()
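The net effect of the changes above: when a tracing entry times out, the GC now re-checks the live Pod before releasing anything. A self-contained sketch of that decision (error handling and retries omitted; `liveStatus` and `shouldReleaseIP` are hypothetical names):

```go
package main

import "fmt"

// liveStatus is a trimmed view of what the timeout guard inspects
// (the real code fetches the Pod through the pod manager, with cache).
type liveStatus struct {
	found  bool     // Pod still exists in the API server
	podIPs []string // Pod.Status.PodIPs
}

// shouldReleaseIP mirrors the guard in handlePodEntryForTracingTimeOut:
// a vanished Pod proceeds to GC, while a Pod whose Status.PodIPs is empty
// is skipped because cmdDel has already returned its address to the pool.
func shouldReleaseIP(p liveStatus) bool {
	if !p.found {
		return true // Pod deleted: the traced IP must be reclaimed
	}
	return len(p.podIPs) > 0 // empty: already released, drop the trace entry
}

func main() {
	fmt.Println(shouldReleaseIP(liveStatus{found: true}))                               // false
	fmt.Println(shouldReleaseIP(liveStatus{found: true, podIPs: []string{"10.0.0.8"}})) // true
	fmt.Println(shouldReleaseIP(liveStatus{found: false}))                              // true
}
```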
3 changes: 3 additions & 0 deletions test/doc/reclaim.md
@@ -11,3 +11,6 @@
| G00007 | A dirty IP record (pod name is right but container ID is wrong) in the IPPool should be auto clean by Spiderpool | p3 | | done | |
| G00008 | The Spiderpool component recovery from repeated reboot, and could correctly reclaim IP | p3 | | done | |
| G00009 | stateless workload IP could be released with node not ready | p3 | | done | |
| G00010 | IP addresses no longer used by a statefulSet can be released correctly by GC All | p3 | | done | |
| G00011 | An IPPool shared by 2 statefulSets whose replicas are scaled up and down: GC works normally and no IP conflict occurs in the statefulSets | p2 | | done | |
| G00012 | Multiple resource types compete for a single IPPool; during creation, scale up/down, and deletion, GC All handles IP addresses correctly | p2 | | done | |
16 changes: 16 additions & 0 deletions test/e2e/common/statefulset.go
@@ -14,6 +14,7 @@ import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
)

func GenerateExampleStatefulSetYaml(stsName, namespace string, replica int32) *appsv1.StatefulSet {
@@ -94,3 +95,18 @@ func ScaleStatefulsetUntilExpectedReplicas(ctx context.Context, frame *e2e.Frame
time.Sleep(ForcedWaitingTime)
}
}

func PatchStatefulSet(frame *e2e.Framework, desiredStatefulSet, originalStatefulSet *appsv1.StatefulSet, opts ...client.PatchOption) error {
if desiredStatefulSet == nil || frame == nil || originalStatefulSet == nil {
return e2e.ErrWrongInput
}

mergePatch := client.MergeFrom(originalStatefulSet)
d, err := mergePatch.Data(desiredStatefulSet)
if err != nil {
return fmt.Errorf("failed to generate patch, err is %v", err)
}
GinkgoWriter.Printf("the patch is: %v. \n", string(d))

return frame.PatchResource(desiredStatefulSet, mergePatch, opts...)
}
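A possible call site for the new helper (a sketch: `scaleViaPatch` is an illustrative name, and the e2e framework import path is assumed rather than confirmed by this diff). The point of the `MergeFrom` pattern is that the original object stays untouched, so the client can compute the patch as the delta against the mutated deep copy:

```go
package common

import (
	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/utils/ptr"

	e2e "github.com/spidernet-io/e2eframework/framework" // assumed import path
)

// scaleViaPatch mutates only a deep copy of the StatefulSet so that
// client.MergeFrom inside PatchStatefulSet can diff it against the original.
func scaleViaPatch(frame *e2e.Framework, original *appsv1.StatefulSet, replicas int32) error {
	desired := original.DeepCopy()
	desired.Spec.Replicas = ptr.To(replicas)
	return PatchStatefulSet(frame, desired, original)
}
```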
2 changes: 1 addition & 1 deletion test/e2e/performance/performance_test.go
@@ -175,6 +175,6 @@ var _ = Describe("performance test case", Serial, Label("performance"), func() {
},
// TODO (tao.yang): with N controller replicas consuming N IPs from the IPPool, complete a closed-loop GC performance test through this template
Entry("time cost for creating, rebuilding, deleting deployment pod in batches",
Label("P00002"), common.OwnerDeployment, int32(40), time.Minute*4),
Label("P00002"), common.OwnerDeployment, int32(10), time.Minute*4),
)
})