From c8fafc36a83a2dee6692f4a7393dc312aa3aeafa Mon Sep 17 00:00:00 2001
From: Yecheng Fu
Date: Wed, 18 Mar 2020 14:02:22 +0800
Subject: [PATCH] add a serial test for stable scheduling

---
 tests/actions.go                     |  64 +++--------------
 tests/cmd/stability/main.go          |   2 -
 tests/e2e/tidbcluster/serial.go      | 102 +++++++++++++++++++++++++++
 tests/e2e/tidbcluster/tidbcluster.go |   2 -
 4 files changed, 111 insertions(+), 59 deletions(-)

diff --git a/tests/actions.go b/tests/actions.go
index 947c0820aed..6bd33923dab 100644
--- a/tests/actions.go
+++ b/tests/actions.go
@@ -230,10 +230,6 @@ type OperatorActions interface {
     LabelNodesOrDie()
     CheckDisasterTolerance(info *TidbClusterConfig) error
     CheckDisasterToleranceOrDie(info *TidbClusterConfig)
-    GetTidbMemberAssignedNodes(info *TidbClusterConfig) (map[string]string, error)
-    GetTidbMemberAssignedNodesOrDie(info *TidbClusterConfig) map[string]string
-    CheckTidbMemberAssignedNodes(info *TidbClusterConfig, oldAssignedNodes map[string]string) error
-    CheckTidbMemberAssignedNodesOrDie(info *TidbClusterConfig, oldAssignedNodes map[string]string)
     CheckUpgradeComplete(info *TidbClusterConfig) error
     CheckUpgradeCompleteOrDie(info *TidbClusterConfig)
     CheckInitSQL(info *TidbClusterConfig) error
@@ -410,10 +406,7 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string {
     set := map[string]string{
         "operatorImage": oi.Image,
         "controllerManager.autoFailover": "true",
-        "scheduler.kubeSchedulerImageName": oi.SchedulerImage,
-        "controllerManager.logLevel": oi.LogLevel,
         "scheduler.logLevel": "4",
-        "imagePullPolicy": string(oi.ImagePullPolicy),
         "testMode": strconv.FormatBool(oi.TestMode),
         "admissionWebhook.cabundle": oi.Cabundle,
         "admissionWebhook.create": strconv.FormatBool(oi.WebhookEnabled),
@@ -422,6 +415,15 @@
         "admissionWebhook.mutation.pingcapResources": strconv.FormatBool(oi.DefaultingEnabled),
         "admissionWebhook.validation.pingcapResources": strconv.FormatBool(oi.ValidatingEnabled),
     }
+    if oi.LogLevel != "" {
+        set["controllerManager.logLevel"] = oi.LogLevel
+    }
+    if oi.SchedulerImage != "" {
+        set["scheduler.kubeSchedulerImageName"] = oi.SchedulerImage
+    }
+    if string(oi.ImagePullPolicy) != "" {
+        set["imagePullPolicy"] = string(oi.ImagePullPolicy)
+    }
     if oi.ControllerManagerReplicas != nil {
         set["controllerManager.replicas"] = strconv.Itoa(*oi.ControllerManagerReplicas)
     }
@@ -893,54 +895,6 @@ func (oa *operatorActions) CleanTidbClusterOrDie(info *TidbClusterConfig) {
     }
 }
 
-func (oa *operatorActions) GetTidbMemberAssignedNodes(info *TidbClusterConfig) (map[string]string, error) {
-    assignedNodes := make(map[string]string)
-    ns := info.Namespace
-    tcName := info.ClusterName
-    listOptions := metav1.ListOptions{
-        LabelSelector: labels.SelectorFromSet(
-            label.New().Instance(tcName).Component(label.TiDBLabelVal).Labels()).String(),
-    }
-    podList, err := oa.kubeCli.CoreV1().Pods(ns).List(listOptions)
-    if err != nil {
-        klog.Errorf("failed to get tidb pods: %s/%s, %v", ns, tcName, err)
-        return nil, err
-    }
-    for _, pod := range podList.Items {
-        assignedNodes[pod.Name] = pod.Spec.NodeName
-    }
-    return assignedNodes, nil
-}
-
-func (oa *operatorActions) GetTidbMemberAssignedNodesOrDie(info *TidbClusterConfig) map[string]string {
-    result, err := oa.GetTidbMemberAssignedNodes(info)
-    if err != nil {
-        slack.NotifyAndPanic(err)
-    }
-    return result
-}
-
-func (oa *operatorActions) CheckTidbMemberAssignedNodes(info *TidbClusterConfig, oldAssignedNodes map[string]string) error {
klog.Infof("checking tidb member [%s/%s] assigned nodes", info.Namespace, info.ClusterName) - assignedNodes, err := oa.GetTidbMemberAssignedNodes(info) - if err != nil { - return err - } - for member, node := range oldAssignedNodes { - newNode, ok := assignedNodes[member] - if !ok || newNode != node { - return fmt.Errorf("tidb member %s is not scheduled to %s, new node: %s", member, node, newNode) - } - } - return nil -} - -func (oa *operatorActions) CheckTidbMemberAssignedNodesOrDie(info *TidbClusterConfig, oldAssignedNodes map[string]string) { - if err := oa.CheckTidbMemberAssignedNodes(info, oldAssignedNodes); err != nil { - slack.NotifyAndPanic(err) - } -} - func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error { klog.Infof("checking tidb cluster [%s/%s] status", info.Namespace, info.ClusterName) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index fca200e173d..87857df030e 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -174,12 +174,10 @@ func run() { oa.RegisterWebHookAndServiceOrDie(ocfg.WebhookConfigName, namespace, ocfg.WebhookServiceName, certCtx) ctx, cancel := context.WithCancel(context.Background()) for _, cluster := range clusters { - assignedNodes := oa.GetTidbMemberAssignedNodesOrDie(cluster) cluster.UpgradeAll(upgradeVersion) oa.UpgradeTidbClusterOrDie(cluster) oa.CheckUpgradeOrDie(ctx, cluster) oa.CheckTidbClusterStatusOrDie(cluster) - oa.CheckTidbMemberAssignedNodesOrDie(cluster, assignedNodes) } // configuration change diff --git a/tests/e2e/tidbcluster/serial.go b/tests/e2e/tidbcluster/serial.go index 74c9a6da183..0809d079471 100644 --- a/tests/e2e/tidbcluster/serial.go +++ b/tests/e2e/tidbcluster/serial.go @@ -38,6 +38,7 @@ import ( "github.com/pingcap/tidb-operator/tests/pkg/fixture" v1 "k8s.io/api/core/v1" apiextensionsclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/sets" @@ -106,6 +107,107 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { } }) + ginkgo.Context("tidb-operator with default values", func() { + var ocfg *tests.OperatorConfig + var oa tests.OperatorActions + var genericCli client.Client + + ginkgo.BeforeEach(func() { + ocfg = &tests.OperatorConfig{ + Namespace: "pingcap", + ReleaseName: "operator", + Image: cfg.OperatorImage, + Tag: cfg.OperatorTag, + } + oa = tests.NewOperatorActions(cli, c, asCli, aggrCli, apiExtCli, tests.DefaultPollInterval, ocfg, e2econfig.TestConfig, nil, fw, f) + ginkgo.By("Installing CRDs") + oa.CleanCRDOrDie() + oa.InstallCRDOrDie(ocfg) + ginkgo.By("Installing tidb-operator") + oa.CleanOperatorOrDie(ocfg) + oa.DeployOperatorOrDie(ocfg) + var err error + genericCli, err = client.New(config, client.Options{Scheme: scheme.Scheme}) + framework.ExpectNoError(err, "failed to create clientset") + }) + + ginkgo.AfterEach(func() { + ginkgo.By("Uninstall tidb-operator") + oa.CleanOperatorOrDie(ocfg) + ginkgo.By("Uninstalling CRDs") + oa.CleanCRDOrDie() + }) + + // There is no guarantee but tidb pods should be assigned back to + // previous nodes if no other pods to occupy the positions. 
+        // See docs/design-proposals/tidb-stable-scheduling.md
+        ginkgo.It("[Feature: StableScheduling] TiDB pods should be scheduled to previous nodes", func() {
+            clusterName := "tidb-scheduling"
+            tc := fixture.GetTidbCluster(ns, clusterName, utilimage.TiDBV3Version)
+            tc.Spec.PD.Replicas = 1
+            tc.Spec.TiKV.Replicas = 1
+            tc.Spec.TiDB.Replicas = 3
+            err := genericCli.Create(context.TODO(), tc)
+            framework.ExpectNoError(err)
+            err = oa.WaitForTidbClusterReady(tc, 30*time.Minute, 15*time.Second)
+            framework.ExpectNoError(err)
+
+            listOptions := metav1.ListOptions{
+                LabelSelector: labels.SelectorFromSet(
+                    label.New().Instance(clusterName).Component(label.TiDBLabelVal).Labels()).String(),
+            }
+            oldPodList, err := c.CoreV1().Pods(ns).List(listOptions)
+            framework.ExpectNoError(err)
+
+            ginkgo.By("Update tidb configuration")
+            err = controller.GuaranteedUpdate(genericCli, tc, func() error {
+                tc.Spec.TiDB.Config.TokenLimit = func(i uint) *uint {
+                    return &i
+                }(2000)
+                return nil
+            })
+            framework.ExpectNoError(err)
+
+            ginkgo.By("Waiting for all tidb pods to be recreated and assigned to the same nodes")
+            getOldPodByName := func(pod *v1.Pod) *v1.Pod {
+                for _, oldPod := range oldPodList.Items {
+                    if oldPod.Name == pod.Name {
+                        return &oldPod
+                    }
+                }
+                return nil
+            }
+            err = wait.PollImmediate(time.Second*5, time.Minute*15, func() (bool, error) {
+                newPodList, err := c.CoreV1().Pods(ns).List(listOptions)
+                if err != nil && !apierrors.IsNotFound(err) {
+                    return false, err
+                }
+                if apierrors.IsNotFound(err) {
+                    return false, nil
+                }
+                if len(newPodList.Items) != len(oldPodList.Items) {
+                    return false, nil
+                }
+                for _, newPod := range newPodList.Items {
+                    oldPod := getOldPodByName(&newPod)
+                    if oldPod == nil {
+                        return false, fmt.Errorf("found an unexpected pod: %q", newPod.Name)
+                    }
+                    if oldPod.UID == newPod.UID {
+                        // not recreated yet
+                        return false, nil
+                    }
+                    if oldPod.Spec.NodeName != newPod.Spec.NodeName {
+                        // recreated but assigned to another node
+                        return false, fmt.Errorf("pod %q recreated but not assigned to previous node %q, got %q", oldPod.Name, oldPod.Spec.NodeName, newPod.Spec.NodeName)
+                    }
+                }
+                return true, nil
+            })
+            framework.ExpectNoError(err)
+        })
+    })
+
     // tidb-operator with AdvancedStatefulSet feature enabled
     ginkgo.Context("[Feature: AdvancedStatefulSet][Feature: Webhook]", func() {
         var ocfg *tests.OperatorConfig

diff --git a/tests/e2e/tidbcluster/tidbcluster.go b/tests/e2e/tidbcluster/tidbcluster.go
index b5d2ec7b47b..b53e99cd383 100644
--- a/tests/e2e/tidbcluster/tidbcluster.go
+++ b/tests/e2e/tidbcluster/tidbcluster.go
@@ -244,12 +244,10 @@ var _ = ginkgo.Describe("[tidb-operator] TiDBCluster", func() {
     upgradeVersions := cfg.GetUpgradeTidbVersionsOrDie()
     ginkgo.By(fmt.Sprintf("Upgrading tidb cluster from %s to %s", cluster.ClusterVersion, upgradeVersions[0]))
     ctx, cancel := context.WithCancel(context.Background())
-    assignedNodes := oa.GetTidbMemberAssignedNodesOrDie(&cluster)
     cluster.UpgradeAll(upgradeVersions[0])
     oa.UpgradeTidbClusterOrDie(&cluster)
     oa.CheckUpgradeOrDie(ctx, &cluster)
     oa.CheckTidbClusterStatusOrDie(&cluster)
-    oa.CheckTidbMemberAssignedNodesOrDie(&cluster, assignedNodes)
     cancel()
 
     ginkgo.By("Check webhook is still running")
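
Note on the verification pattern in serial.go: a pod that keeps its name but changes its UID has been recreated, and stable scheduling holds if its spec.nodeName is unchanged. The sketch below restates that check as a standalone helper outside ginkgo. It is a minimal sketch, not part of this patch: the package name, the helper podsRecreatedOnSameNodes, and the parameters c, ns, sel, and old are all hypothetical; the List call mirrors the pre-context-aware client-go signature this patch itself uses.

package verify

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// podsRecreatedOnSameNodes reports whether every pod matching the label
// selector sel has been recreated (its UID changed) and assigned back to
// its previous node. old maps pod name to a snapshot taken before the
// rolling update. Hypothetical helper; names are illustrative only.
func podsRecreatedOnSameNodes(c kubernetes.Interface, ns, sel string, old map[string]*v1.Pod) (bool, error) {
	pods, err := c.CoreV1().Pods(ns).List(metav1.ListOptions{LabelSelector: sel})
	if err != nil {
		return false, err
	}
	if len(pods.Items) != len(old) {
		// still rolling: pod count does not match the snapshot yet
		return false, nil
	}
	for i := range pods.Items {
		p := &pods.Items[i]
		prev, ok := old[p.Name]
		if !ok {
			return false, fmt.Errorf("unexpected pod %q", p.Name)
		}
		if prev.UID == p.UID {
			// not recreated yet; caller should keep polling
			return false, nil
		}
		if prev.Spec.NodeName != p.Spec.NodeName {
			return false, fmt.Errorf("pod %q moved from node %q to node %q", p.Name, prev.Spec.NodeName, p.Spec.NodeName)
		}
	}
	return true, nil
}

In the test itself this predicate is driven by wait.PollImmediate with a 5-second interval and a 15-minute timeout, which tolerates the transient NotFound and short-list states while the StatefulSet rolls.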