Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Optimize kdoctor e2e #4176

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions test/e2e/common/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ const (
BatchCreateTimeout = time.Minute * 5
KdoctorCheckTime = time.Minute * 10
SpiderSyncMultusTime = time.Minute * 2
KDoctorRunTimeout = time.Minute * 10
)

var ForcedWaitingTime = time.Second
Expand Down
34 changes: 34 additions & 0 deletions test/e2e/common/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"fmt"
"os/exec"

"github.com/hashicorp/go-multierror"
. "github.com/onsi/ginkgo/v2"
e2e "github.com/spidernet-io/e2eframework/framework"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -55,3 +56,36 @@ func RestartNodeUntilClusterReady(ctx context.Context, frame *e2e.Framework, nod
GinkgoWriter.Println("Check that the status of all Pods in the cluster is running")
return nil
}

// GetNodeNetworkInfo runs a fixed set of `ip` commands on every node in
// nodeList through frame.DockerExecCommand and writes the output of each
// command to GinkgoWriter so that it ends up in the test log.
//
// A failing command does not abort the collection: its error, together with
// whatever output it produced, is accumulated and the remaining commands and
// nodes are still processed. The accumulated errors are returned as a single
// error; nil is returned when every command succeeded.
func GetNodeNetworkInfo(ctx context.Context, frame *e2e.Framework, nodeList []string) error {
	// The command list is the same for every node, so build it only once.
	commands := []string{
		"ip a",
		"ip link show",
		"ip n",
		"ip -6 n",
		"ip rule",
		"ip -6 rule",
		"ip route",
		"ip route show table 100",
		"ip route show table 101",
		"ip route show table 500",
		"ip -6 route",
		"ip -6 route show table 100",
		"ip -6 route show table 101",
		"ip -6 route show table 500",
	}

	var jobResult *multierror.Error
	for _, node := range nodeList {
		GinkgoWriter.Printf("=============== Check the network information of the node %v ============== \n", node)

		for _, command := range commands {
			GinkgoWriter.Printf("--------------- execute %v in node: %v ------------ \n", command, node)
			out, err := frame.DockerExecCommand(ctx, node, command)
			if err != nil {
				jobResult = multierror.Append(jobResult, fmt.Errorf("node %v: command '%v' failed with error: %w, output: %s", node, command, err, out))
				continue
			}
			// Emit the collected information; otherwise the output of a
			// successful command is dropped and only the headers are logged.
			GinkgoWriter.Printf("%s\n", out)
		}
	}

	return jobResult.ErrorOrNil()
}
32 changes: 32 additions & 0 deletions test/e2e/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/spidernet-io/spiderpool/pkg/constant"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/hashicorp/go-multierror"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
e2e "github.com/spidernet-io/e2eframework/framework"
Expand Down Expand Up @@ -143,3 +144,34 @@ func ValidatePodIPConflict(podList *corev1.PodList) error {
}
return nil
}

// GetPodNetworkInfo runs a fixed set of `ip` commands inside every Pod of
// podList through frame.ExecCommandInPod and writes the output of each command
// to GinkgoWriter so that it ends up in the test log.
//
// A failing command does not abort the collection: its error, together with
// whatever output it produced, is accumulated and the remaining commands and
// Pods are still processed. The accumulated errors are returned as a single
// error; nil is returned when every command succeeded. A nil podList is
// treated like an empty one.
func GetPodNetworkInfo(ctx context.Context, frame *e2e.Framework, podList *corev1.PodList) error {
	// Guard against a nil list so that podList.Items is never dereferenced on nil.
	if podList == nil {
		return nil
	}

	// The command list is the same for every Pod, so build it only once.
	commands := []string{
		"ip a",
		"ip link show",
		"ip n",
		"ip -6 n",
		"ip rule",
		"ip -6 rule",
		"ip route",
		"ip route show table 100",
		"ip route show table 101",
		"ip -6 route",
		"ip -6 route show table 100",
		"ip -6 route show table 101",
	}

	var jobResult *multierror.Error
	for _, pod := range podList.Items {
		GinkgoWriter.Printf("=============== Check the network information of the pod %v/%v ============== \n", pod.Namespace, pod.Name)

		for _, command := range commands {
			GinkgoWriter.Printf("--------------- execute %v in pod: %v/%v on node: %v ------------ \n", command, pod.Namespace, pod.Name, pod.Spec.NodeName)
			out, err := frame.ExecCommandInPod(pod.Name, pod.Namespace, command, ctx)
			if err != nil {
				jobResult = multierror.Append(jobResult, fmt.Errorf("pod %v/%v: command '%v' failed with error: %w, output: %s", pod.Namespace, pod.Name, command, err, out))
				continue
			}
			// Emit the collected information; otherwise the output of a
			// successful command is dropped and only the headers are logged.
			GinkgoWriter.Printf("%s\n", out)
		}
	}

	return jobResult.ErrorOrNil()
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
apitypes "k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"

"github.com/spidernet-io/spiderpool/pkg/constant"
pkgconstant "github.com/spidernet-io/spiderpool/pkg/constant"
"github.com/spidernet-io/spiderpool/pkg/ip"
spiderpoolv2beta1 "github.com/spidernet-io/spiderpool/pkg/k8s/apis/spiderpool.spidernet.io/v2beta1"
Expand Down Expand Up @@ -68,7 +67,10 @@ var _ = Describe("MacvlanUnderlayOne", Serial, Label("underlay", "one-interface"
// Schedule
crontab := "1 1"
schedule.Schedule = &crontab
schedule.RoundNumber = 1
// Attempts to reproduce the sporadic kdoctor test failures were unsuccessful.
// Use kdoctor's multi-round testing instead: if a failure occurs in the first round,
// check whether it also occurs in the following rounds.
schedule.RoundNumber = 3
schedule.RoundTimeoutMinute = 1
task.Spec.Schedule = schedule

Expand All @@ -85,7 +87,7 @@ var _ = Describe("MacvlanUnderlayOne", Serial, Label("underlay", "one-interface"
task.Spec.Target = targetAgent

// request
request.DurationInSecond = 5
request.DurationInSecond = 10
request.QPS = 1
request.PerRequestTimeoutInMS = 7000
task.Spec.Request = request
Expand All @@ -94,15 +96,12 @@ var _ = Describe("MacvlanUnderlayOne", Serial, Label("underlay", "one-interface"
condition.SuccessRate = &successRate
condition.MeanAccessDelayInMs = &delayMs
task.Spec.SuccessCondition = condition
taskCopy := task

GinkgoWriter.Printf("kdoctor task: %+v \n", task)
err := frame.CreateResource(task)
Expect(err).NotTo(HaveOccurred(), " kdoctor nethttp crd create failed")

err = frame.GetResource(apitypes.NamespacedName{Name: name}, taskCopy)
Expect(err).NotTo(HaveOccurred(), " kdoctor nethttp crd get failed")
Expect(err).NotTo(HaveOccurred(), "failed to create kdoctor task")
GinkgoWriter.Printf("succeeded to create kdoctor task: %+v \n", task)

// update the kdoctor service to use corev1.ServiceExternalTrafficPolicyLocal
if frame.Info.IpV4Enabled {
kdoctorIPv4ServiceName := fmt.Sprintf("%s-%s-ipv4", "kdoctor-netreach", task.Name)
var kdoctorIPv4Service *corev1.Service
Expand Down Expand Up @@ -138,52 +137,50 @@ var _ = Describe("MacvlanUnderlayOne", Serial, Label("underlay", "one-interface"
Expect(frame.UpdateResource(kdoctorIPv6Service)).NotTo(HaveOccurred())
}

ctx, cancel := context.WithTimeout(context.Background(), time.Second*60*5)
// waiting for kdoctor task to finish
ctx, cancel := context.WithTimeout(context.Background(), common.KDoctorRunTimeout)
defer cancel()
var err1 = errors.New("error has occurred")
for run {
for {
select {
case <-ctx.Done():
run = false
Expect(errors.New("wait nethttp test timeout")).NotTo(HaveOccurred(), " running kdoctor task timeout")
Expect(errors.New("timeout waiting for kdoctor task to finish")).NotTo(HaveOccurred())
default:
taskCopy := task
err = frame.GetResource(apitypes.NamespacedName{Name: name}, taskCopy)
Expect(err).NotTo(HaveOccurred(), " kdoctor nethttp crd get failed")

if taskCopy.Status.Finish == true {
command := fmt.Sprintf("get netreaches.kdoctor.io %s -oyaml", taskCopy.Name)
netreachesLog, _ := frame.ExecKubectl(command, ctx)
GinkgoWriter.Printf("kdoctor's netreaches execution result %+v \n", string(netreachesLog))

for _, v := range taskCopy.Status.History {
if v.Status == "succeed" {
err1 = nil
Expect(err).NotTo(HaveOccurred(), "Failed to get kdoctor task")
if taskCopy.Status.Finish {
roundFailed := false
for _, t := range taskCopy.Status.History {
// No configuration has been changed between rounds, so a failure in the first round alone is not considered a task failure.
if t.RoundNumber != 1 && t.Status == "failed" {
roundFailed = true
break
}
}
run = false

ctx1, cancel1 := context.WithTimeout(context.Background(), time.Second*30)
defer cancel1()
for {
select {
case <-ctx1.Done():
Expect(errors.New("wait kdoctorreport timeout")).NotTo(HaveOccurred(), "failed to run kdoctor task and wait kdoctorreport timeout")
default:
command = fmt.Sprintf("get kdoctorreport %s -oyaml", taskCopy.Name)
kdoctorreportLog, err := frame.ExecKubectl(command, ctx)
if err != nil {
time.Sleep(common.ForcedWaitingTime)
continue
}
GinkgoWriter.Printf("kdoctor's kdoctorreport execution result %+v \n", string(kdoctorreportLog))
}
break
if roundFailed {
Fail("kdoctor task is not successful")
}
return
}
for _, t := range taskCopy.Status.History {
// If the check is successful, exit directly.
if t.RoundNumber == 1 && t.Status == "succeed" {
GinkgoWriter.Println("succeed to run kdoctor task")
return
}
// If a round fails, collect the Pod and node network information as soon as possible.
// If the first round failed but a later round succeeded, collect the network
// information as well, so that the rounds can be compared for any differences.
if t.Status == "failed" || (t.RoundNumber != 1 && t.Status == "succeed") {
GinkgoLogr.Error(fmt.Errorf("Failed to run kdoctor task, round %d, at time %s", t.RoundNumber, time.Now()), "Failed")
podList, err := frame.GetPodListByLabel(map[string]string{"app.kubernetes.io/name": taskCopy.Name})
Expect(err).NotTo(HaveOccurred(), "Failed to get pod list by label")
Expect(common.GetPodNetworkInfo(ctx, frame, podList)).NotTo(HaveOccurred(), "Failed to get pod network info")
Expect(common.GetNodeNetworkInfo(ctx, frame, frame.Info.KindNodeList)).NotTo(HaveOccurred(), "Failed to get node network info")
}
}
time.Sleep(time.Second * 5)
}
}
Expect(err1).NotTo(HaveOccurred())
})
})

Expand Down Expand Up @@ -232,7 +229,7 @@ var _ = Describe("MacvlanUnderlayOne", Serial, Label("underlay", "one-interface"
Namespace: namespace,
},
Spec: spiderpoolv2beta1.MultusCNIConfigSpec{
CniType: ptr.To(constant.MacvlanCNI),
CniType: ptr.To(pkgconstant.MacvlanCNI),
MacvlanConfig: &spiderpoolv2beta1.SpiderMacvlanCniConfig{
Master: []string{common.NIC1},
VlanID: ptr.To(int32(100)),
Expand Down Expand Up @@ -283,7 +280,7 @@ var _ = Describe("MacvlanUnderlayOne", Serial, Label("underlay", "one-interface"
Expect(err).NotTo(HaveOccurred())
var annotations = make(map[string]string)
annotations[common.MultusNetworks] = fmt.Sprintf("%s/%s", namespace, multusNadName)
annotations[constant.AnnoPodIPPools] = string(podAnnoMarshal)
annotations[pkgconstant.AnnoPodIPPools] = string(podAnnoMarshal)
deployObject := common.GenerateExampleDeploymentYaml(depName, namespace, int32(1))
deployObject.Spec.Template.Annotations = annotations
Expect(frame.CreateDeployment(deployObject)).NotTo(HaveOccurred())
Expand Down
7 changes: 7 additions & 0 deletions test/scripts/debugEnv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,13 @@ elif [ "$TYPE"x == "detail"x ] ; then
echo "--------- kubectl logs ${POD} -n ${NAMESPACE} --previous"
kubectl logs ${POD} -n ${NAMESPACE} --kubeconfig ${E2E_KUBECONFIG} --previous
done
if [ -n "$KDOCTOR_POD_LIST" ]; then
echo "Fetching kdoctor reports..."
echo "--------- kubectl get kdoctorreport -A -ojson --------- "
kubectl get kdoctorreport -A -ojson --kubeconfig ${E2E_KUBECONFIG}
echo "--------- kubectl get kdoctorreport -A -oyaml --------- "
kubectl get kdoctorreport -A -oyaml --kubeconfig ${E2E_KUBECONFIG}
fi

echo ""
echo "=============== open kruise logs ============== "
Expand Down
2 changes: 1 addition & 1 deletion test/scripts/install-kdoctor.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ echo "$CURRENT_FILENAME : KDOCTOR_REPORT_PATH $KDOCTOR_REPORT_PATH "
[ ! -f "$E2E_KUBECONFIG" ] && echo "error, could not find file $E2E_KUBECONFIG " && exit 1
echo "$CURRENT_FILENAME : E2E_KUBECONFIG $E2E_KUBECONFIG "

KDOCTOR_VERSION=${KDOCTOR_VERSION:-0.2.0}
KDOCTOR_VERSION=${KDOCTOR_VERSION:-0.2.2}
E2E_KDOCTOR_IMAGE_REPO=${E2E_KDOCTOR_IMAGE_REPO:-"ghcr.io"}

INSTALL_TIME_OUT=300s
Expand Down
Loading