fix: Optimize the reliability of restart components e2e
Signed-off-by: tao.yang <tao.yang@daocloud.io>
ty-dc committed Oct 18, 2024
1 parent 6a708dd commit 5d5dae3
Showing 1 changed file with 82 additions and 64 deletions.
146 changes: 82 additions & 64 deletions test/e2e/reliability/reliability_test.go
@@ -16,6 +16,7 @@ import (
coordinationv1 "k8s.io/api/coordination/v1"
corev1 "k8s.io/api/core/v1"
apitypes "k8s.io/apimachinery/pkg/types"
"k8s.io/kubectl/pkg/util/podutils"
)

var _ = Describe("test reliability", Label("reliability"), Serial, func() {
@@ -53,85 +54,102 @@ var _ = Describe("test reliability", Label("reliability"), Serial, func() {

DescribeTable("reliability test table",
func(componentName string, label map[string]string, startupTimeRequired time.Duration) {

// get component pod list
GinkgoWriter.Printf("get %v pod list \n", componentName)
podList, e := frame.GetPodListByLabel(label)
Expect(e).NotTo(HaveOccurred())
Expect(podList.Items).NotTo(HaveLen(0))
expectPodNum := len(podList.Items)
GinkgoWriter.Printf("the %v pod number is: %v \n", componentName, expectPodNum)

// delete component pod
GinkgoWriter.Printf("now time: %s, restart %v %v pod \n", time.Now().Format(time.RFC3339Nano), expectPodNum, componentName)
podList, e = frame.DeletePodListUntilReady(podList, startupTimeRequired)
GinkgoWriter.Printf("pod %v recovery time: %s \n", componentName, time.Now().Format(time.RFC3339Nano))
Expect(e).NotTo(HaveOccurred())
Expect(podList).NotTo(BeNil())

// create pod when component is unstable
GinkgoWriter.Printf("create pod %v/%v when %v is unstable \n", namespace, podName, componentName)
podYaml := common.GenerateExamplePodYaml(podName, namespace)
componentPodList, err := frame.GetPodListByLabel(label)
Expect(err).NotTo(HaveOccurred(), "failed to get %v pod list", componentName)
expectPodNum := len(componentPodList.Items)
GinkgoWriter.Printf("successfully got the %v pod list \n", componentName)

// Create a DaemonSet that runs a Pod on every node to verify that the components on each node can serve the Pods.
dsName := "ds" + tools.RandomName()
dsYaml := common.GenerateExampleDaemonSetYaml(dsName, namespace)
podIppoolAnnoStr := common.GeneratePodIPPoolAnnotations(frame, common.NIC1, globalDefaultV4IppoolList, globalDefaultV6IppoolList)
podYaml.Annotations = map[string]string{constant.AnnoPodIPPool: podIppoolAnnoStr}

GinkgoWriter.Printf("podyaml %v \n", podYaml)
e = frame.CreatePod(podYaml)
Expect(e).NotTo(HaveOccurred())
dsYaml.Spec.Template.Annotations = map[string]string{constant.AnnoPodIPPool: podIppoolAnnoStr}

wg.Add(1)
// Concurrently delete components and create a new pod
wg.Add(2)
go func() {
defer GinkgoRecover()
// delete component pod
startT1 := time.Now()
defer wg.Done()
GinkgoWriter.Printf("now time: %s, restart %v %v pod \n", time.Now().Format(time.RFC3339Nano), expectPodNum, componentName)
podList, e1 := frame.DeletePodListUntilReady(podList, startupTimeRequired)
GinkgoWriter.Printf("pod %v recovery time: %s \n", componentName, time.Now().Format(time.RFC3339Nano))
Expect(e1).NotTo(HaveOccurred())
Expect(podList).NotTo(BeNil())
endT1 := time.Since(startT1)
GinkgoWriter.Printf("component restart until running time cost is:%v\n", endT1)
wg.Done()
err := frame.DeletePodList(componentPodList)
Expect(err).NotTo(HaveOccurred())

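// Wait for the component pods to be recreated and become ready again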
Eventually(func() error {
componentPodList, err := frame.GetPodListByLabel(label)
if err != nil {
return fmt.Errorf("failed to get component %v pod list", componentName)
}
if len(componentPodList.Items) != expectPodNum {
return fmt.Errorf("the number of component %s pods does not equal the expected count %d", componentName, expectPodNum)
}
for _, pod := range componentPodList.Items {
if !podutils.IsPodReady(&pod) {
return fmt.Errorf("the pod %v is not ready", pod.Name)
}
}

// Check the webhook service is ready after restarting the spiderpool-controller, to avoid affecting the creation of IPPools
if componentName == constant.SpiderpoolController {
ctx, cancel := context.WithTimeout(context.Background(), common.PodReStartTimeout)
defer cancel()
Expect(common.WaitWebhookReady(ctx, frame, common.WebhookPort)).NotTo(HaveOccurred())
}
return nil
}).WithTimeout(common.PodReStartTimeout).WithPolling(time.Second * 3).Should(BeNil())
}()

if componentName == constant.SpiderpoolController {
// Check the webhook service is ready after restarting the controller
ctx, cancel := context.WithTimeout(context.Background(), common.PodReStartTimeout)
defer cancel()
Expect(common.WaitWebhookReady(ctx, frame, common.WebhookPort)).NotTo(HaveOccurred())
}
go func() {
defer GinkgoRecover()
defer wg.Done()
GinkgoWriter.Printf("create daemonSet %v/%v when %v is unstable \n", namespace, dsName, componentName)
err := frame.CreateDaemonSet(dsYaml)
Expect(err).NotTo(HaveOccurred())

// Wait for the test Pod to become ready
ctx, cancel := context.WithTimeout(context.Background(), time.Minute*2)
defer cancel()
commandString := fmt.Sprintf("get po -n %v %v -oyaml", namespace, podName)
podYamlInfo, err := frame.ExecKubectl(commandString, ctx)
GinkgoWriter.Printf("pod yaml %v \n", podYamlInfo)
Expect(err).NotTo(HaveOccurred())
pod, e := frame.WaitPodStarted(podName, namespace, ctx)
Expect(e).NotTo(HaveOccurred())
Expect(pod.Status.PodIPs).NotTo(BeEmpty(), "pod failed to assign ip")
GinkgoWriter.Printf("pod: %v/%v, ips: %+v \n", namespace, podName, pod.Status.PodIPs)

// Check the Pod's IP recorded IPPool
ok, _, _, err := common.CheckPodIpRecordInIppool(frame, globalDefaultV4IppoolList, globalDefaultV6IppoolList, &corev1.PodList{Items: []corev1.Pod{*pod}})
Expect(err).NotTo(HaveOccurred())
Expect(ok).To(BeTrue())
wg.Wait()
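// Wait for the DaemonSet pods on every node to become ready, then verify their IPs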
Eventually(func() error {
podList, err := frame.GetPodListByLabel(dsYaml.Spec.Template.Labels)
if err != nil {
return err
}
if len(podList.Items) != len(frame.Info.KindNodeList) {
return fmt.Errorf("the number of DaemonSet pods does not equal the expected count %v", len(frame.Info.KindNodeList))
}
for _, pod := range podList.Items {
if !podutils.IsPodReady(&pod) {
return fmt.Errorf("the pod %v is not ready", pod.Name)
}
}

// try to delete pod
GinkgoWriter.Printf("delete pod %v/%v \n", namespace, podName)
Expect(frame.DeletePod(podName, namespace)).NotTo(HaveOccurred())
// G00008: The Spiderpool component recovers from repeated restarts and can correctly reclaim IPs
if componentName == constant.SpiderpoolAgent || componentName == constant.SpiderpoolController {
Expect(common.WaitIPReclaimedFinish(frame, globalDefaultV4IppoolList, globalDefaultV6IppoolList, &corev1.PodList{Items: []corev1.Pod{*pod}}, 2*common.IPReclaimTimeout)).To(Succeed())
}
// Check the Pod's IP recorded IPPool
ok, _, _, err := common.CheckPodIpRecordInIppool(frame, globalDefaultV4IppoolList, globalDefaultV6IppoolList, podList)
Expect(err).NotTo(HaveOccurred())
Expect(ok).To(BeTrue())

// try to delete the DaemonSet
Expect(frame.DeleteDaemonSet(dsName, namespace)).NotTo(HaveOccurred())
// G00008: The Spiderpool component recovers from repeated restarts and can correctly reclaim IPs
if componentName == constant.SpiderpoolAgent || componentName == constant.SpiderpoolController {
Expect(common.WaitIPReclaimedFinish(frame, globalDefaultV4IppoolList, globalDefaultV6IppoolList, podList, common.IPReclaimTimeout)).To(Succeed())
}

if frame.Info.IpV4Enabled {
Expect(common.CheckIppoolSanity(frame, common.SpiderPoolIPv4PoolDefault)).NotTo(HaveOccurred(), "error %v", err)
GinkgoWriter.Printf("successfully checked sanity of spiderpool %v \n", common.SpiderPoolIPv4PoolDefault)
}
if frame.Info.IpV6Enabled {
Expect(common.CheckIppoolSanity(frame, common.SpiderPoolIPv6PoolDefault)).NotTo(HaveOccurred(), "error %v", err)
GinkgoWriter.Printf("successfully checked sanity of spiderpool %v \n", common.SpiderPoolIPv6PoolDefault)
}

return nil
}).WithTimeout(common.PodStartTimeout).WithPolling(time.Second * 5).Should(BeNil())
}()
wg.Wait()
},
Entry("Successfully run a pod during the ETCD is restarting",
Label("R00002"), "etcd", map[string]string{"component": "etcd"}, common.PodStartTimeout),
Entry("Successfully run a pod during the API-server is restarting",
Label("R00003"), "apiserver", map[string]string{"component": "kube-apiserver"}, common.PodStartTimeout),
// https://github.com/spidernet-io/spiderpool/issues/1916
Entry("Successfully run a pod during the coreDns is restarting",
Label("R00005"), "coredns", map[string]string{"k8s-app": "kube-dns"}, common.PodStartTimeout),
Entry("Successfully run a pod during the Spiderpool agent is restarting",