From 5eb5a2af9082cb12f566fdad9ddf819ea120fc7b Mon Sep 17 00:00:00 2001 From: cyclinder Date: Thu, 17 Oct 2024 19:01:14 +0800 Subject: [PATCH] Add a pod mutating webhook to auto inject the pod network resources Signed-off-by: cyclinder --- charts/spiderpool/templates/deployment.yaml | 4 + charts/spiderpool/values.yaml | 3 + cmd/spiderpool-controller/cmd/config.go | 21 +- cmd/spiderpool-controller/cmd/daemon.go | 14 ++ docs/reference/spiderpool-controller.md | 1 + .../install/ai/get-started-macvlan-zh_CN.md | 72 +++--- docs/usage/install/ai/get-started-macvlan.md | 72 +++--- .../install/ai/get-started-sriov-zh_CN.md | 101 ++++---- docs/usage/install/ai/get-started-sriov.md | 100 ++++---- pkg/constant/k8s.go | 7 +- pkg/multuscniconfig/utils.go | 26 +++ pkg/podmanager/pod_manager.go | 6 +- pkg/podmanager/pod_webhook.go | 77 +++++++ pkg/podmanager/utils.go | 217 ++++++++++++++++++ pkg/podmanager/utils_test.go | 104 +++++++++ test/Makefile | 1 + test/doc/podwebhook.md | 5 + test/e2e/podwebhook/podwebhook_suite_test.go | 26 +++ test/e2e/podwebhook/podwebhook_test.go | 107 +++++++++ 19 files changed, 789 insertions(+), 175 deletions(-) create mode 100644 pkg/podmanager/pod_webhook.go create mode 100644 test/doc/podwebhook.md create mode 100644 test/e2e/podwebhook/podwebhook_suite_test.go create mode 100644 test/e2e/podwebhook/podwebhook_test.go diff --git a/charts/spiderpool/templates/deployment.yaml b/charts/spiderpool/templates/deployment.yaml index d78ce1b080..350381dba7 100644 --- a/charts/spiderpool/templates/deployment.yaml +++ b/charts/spiderpool/templates/deployment.yaml @@ -187,6 +187,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: SPIDERPOOL_DEPLOYMENT_NAME + value: {{ .Values.spiderpoolController.name | quote }} + - name: SPIDERPOOL_ENABLE_POD_NETWORK_RESOURCE_INJECT + value: {{ .Values.spiderpoolController.enablePodNetworkResourceInject | quote }} {{- with .Values.spiderpoolController.extraEnv }} {{- toYaml . 
| nindent 8 }} {{- end }} diff --git a/charts/spiderpool/values.yaml b/charts/spiderpool/values.yaml index a51d96b488..10f3734f6e 100644 --- a/charts/spiderpool/values.yaml +++ b/charts/spiderpool/values.yaml @@ -669,6 +669,9 @@ spiderpoolController: ## @param spiderpoolController.webhookPort the http port for spiderpoolController webhook webhookPort: 5722 + ## @param spiderpoolController.enablePodNetworkResourceInject inject network resource to pod + enablePodNetworkResourceInject: false + prometheus: ## @param spiderpoolController.prometheus.enabled enable spiderpool Controller to collect metrics enabled: false diff --git a/cmd/spiderpool-controller/cmd/config.go b/cmd/spiderpool-controller/cmd/config.go index 276c5cebe0..6d76438a93 100644 --- a/cmd/spiderpool-controller/cmd/config.go +++ b/cmd/spiderpool-controller/cmd/config.go @@ -99,6 +99,7 @@ var envInfo = []envConf{ {"SPIDERPOOL_MULTUS_CONFIG_INFORMER_RESYNC_PERIOD", "60", false, nil, nil, &controllerContext.Cfg.MultusConfigInformerResyncPeriod}, {"SPIDERPOOL_CILIUM_CONFIGMAP_NAMESPACE_NAME", "kube-system/cilium-config", false, &controllerContext.Cfg.CiliumConfigName, nil, nil}, + {"SPIDERPOOL_ENABLE_POD_NETWORK_RESOURCE_INJECT", "false", false, nil, &controllerContext.Cfg.InjectPodNetworkResource, nil}, {"SPIDERPOOL_IPPOOL_INFORMER_RESYNC_PERIOD", "300", false, nil, nil, &controllerContext.Cfg.IPPoolInformerResyncPeriod}, {"SPIDERPOOL_IPPOOL_INFORMER_WORKERS", "3", true, nil, nil, &controllerContext.Cfg.IPPoolInformerWorkers}, {"SPIDERPOOL_AUTO_IPPOOL_HANDLER_MAX_WORKQUEUE_LENGTH", "10000", true, nil, nil, &controllerContext.Cfg.IPPoolInformerMaxWorkQueueLength}, @@ -128,16 +129,20 @@ type Config struct { GopsListenPort string PyroscopeAddress string DefaultCniConfDir string - // CiliumConfigName is formatted by namespace and name,default is kube-system/cilium-config + // CiliumConfigName is formatted by namespace and name + // default is kube-system/cilium-config CiliumConfigName string - 
ControllerPodNamespace string - ControllerPodName string - DefaultCoordinatorName string - LeaseDuration int - LeaseRenewDeadline int - LeaseRetryPeriod int - LeaseRetryGap int + InjectPodNetworkResource bool + + ControllerDeploymentName string + ControllerPodNamespace string + ControllerPodName string + DefaultCoordinatorName string + LeaseDuration int + LeaseRenewDeadline int + LeaseRetryPeriod int + LeaseRetryGap int IPPoolMaxAllocatedIPs int diff --git a/cmd/spiderpool-controller/cmd/daemon.go b/cmd/spiderpool-controller/cmd/daemon.go index 88023c0565..b42aa944e9 100644 --- a/cmd/spiderpool-controller/cmd/daemon.go +++ b/cmd/spiderpool-controller/cmd/daemon.go @@ -268,6 +268,20 @@ func initControllerServiceManagers(ctx context.Context) { } controllerContext.PodManager = podManager + if controllerContext.Cfg.InjectPodNetworkResource { + logger.Debug("Begin to init Pod MutatingWebhook") + if err := podmanager.InitPodWebhook(controllerContext.CRDManager.GetClient(), + controllerContext.CRDManager, controllerContext.Cfg.ControllerDeploymentName); err != nil { + logger.Fatal(err.Error()) + } + } else { + logger.Debug("InjectPodNetworkResource is disabled, try to remove the pod part in the MutatingWebhook") + if err := podmanager.RemovePodMutatingWebhook(controllerContext.CRDManager.GetClient(), + controllerContext.Cfg.ControllerDeploymentName); err != nil { + logger.Fatal(err.Error()) + } + } + logger.Info("Begin to initialize StatefulSet manager") statefulSetManager, err := statefulsetmanager.NewStatefulSetManager( controllerContext.CRDManager.GetClient(), diff --git a/docs/reference/spiderpool-controller.md b/docs/reference/spiderpool-controller.md index 847ff3f572..60ef921d52 100644 --- a/docs/reference/spiderpool-controller.md +++ b/docs/reference/spiderpool-controller.md @@ -32,6 +32,7 @@ Run the spiderpool controller daemon. | SPIDERPOOL_CNI_CONFIG_DIR | /etc/cni/net.d | The host path of the cni config directory. 
| | SPIDERPOOL_CILIUM_CONFIGMAP_NAMESPACE_NAME | kube-system/cilium-config. | The cilium's configMap, default is kube-system/cilium-config. | | SPIDERPOOL_COORDINATOR_DEFAULT_NAME | default | the name of default spidercoordinator CR | +| SPIDERPOOL_ENABLE_POD_NETWORK_RESOURCE_INJECT | false | Enable/disable inject network resources for pod. | ## spiderpool-controller shutdown diff --git a/docs/usage/install/ai/get-started-macvlan-zh_CN.md b/docs/usage/install/ai/get-started-macvlan-zh_CN.md index 426be7471a..78bfae3ebd 100644 --- a/docs/usage/install/ai/get-started-macvlan-zh_CN.md +++ b/docs/usage/install/ai/get-started-macvlan-zh_CN.md @@ -214,7 +214,9 @@ 3. 创建 CNI 配置和对应的 ippool 资源 - 对于 Ethernet 网络,请为所有的 GPU 亲和的 macvlan 网卡配置,并创建对应的 IP 地址池。如下例子,配置了 GPU1 亲和的网卡和 IP 地址池。 + 对于 Ethernet 网络,请为所有的 GPU 亲和的 macvlan 网卡配置,并创建对应的 IP 地址池。Spiderpool 为了简化 AI 应用配置多网卡的复杂度,支持通过 labels 对一组网卡配置分类,用户只需要为 Pod 注入特定的标签,这样 Spiderpool 会通过 webhook 自动为 Pod 注入对应的网卡和网络资源。 + + 如下例子,配置了 GPU1 亲和的网卡和 IP 地址池: ```shell $ cat <-- + Path: ptr.To("/mutate--v1-pod"), + } + } + return wb +} + +// addPodMutatingWebhook updates the MutatingWebhookConfiguration for pods. +// It retrieves the existing configuration, adds a new webhook for pods, +// and updates the configuration in the Kubernetes API server. 
+func addPodMutatingWebhook(client client.Client, mutatingWebhookName string) error {
+	var mwc admissionregistrationv1.MutatingWebhookConfiguration
+	err := client.Get(context.TODO(), types.NamespacedName{Name: mutatingWebhookName}, &mwc)
+	if err != nil {
+		return fmt.Errorf("failed to get MutatingWebhookConfiguration: %w", err) // %w keeps the cause inspectable via errors.Is/As
+	}
+
+	if len(mwc.Webhooks) == 0 { // Webhooks[0] is required below as the template entry
+		return fmt.Errorf("no any mutating webhook found in MutatingWebhookConfiguration %s", mutatingWebhookName)
+	}
+
+	podWebhook := initPodMutatingWebhook(*mwc.Webhooks[0].DeepCopy()) // build the pod entry from a copy of the first webhook
+	mwc.Webhooks = append(mwc.Webhooks, podWebhook)
+
+	err = client.Update(context.TODO(), &mwc)
+	if err != nil {
+		return fmt.Errorf("failed to update MutatingWebhookConfiguration %s: %w", mutatingWebhookName, err)
+	}
+
+	return nil
+}
+
+// RemovePodMutatingWebhook drops every webhook entry named constant.PodMutatingWebhookName
+// from the MutatingWebhookConfiguration called mutatingWebhookName. If no such entry exists
+// the object is not updated; Get and Update failures are returned wrapped with %w.
+func RemovePodMutatingWebhook(client client.Client, mutatingWebhookName string) error {
+	var mwc admissionregistrationv1.MutatingWebhookConfiguration
+	err := client.Get(context.TODO(), types.NamespacedName{Name: mutatingWebhookName}, &mwc)
+	if err != nil {
+		return fmt.Errorf("failed to get MutatingWebhookConfiguration: %w", err)
+	}
+
+	var newWebhooks []admissionregistrationv1.MutatingWebhook
+	for _, wb := range mwc.Webhooks {
+		if wb.Name != constant.PodMutatingWebhookName { // keep everything except the pod webhook
+			newWebhooks = append(newWebhooks, wb)
+		}
+	}
+
+	if len(newWebhooks) == len(mwc.Webhooks) { // nothing was filtered out, so skip the write
+		return nil
+	}
+
+	mwc.Webhooks = newWebhooks
+	err = client.Update(context.TODO(), &mwc)
+	if err != nil {
+		return fmt.Errorf("failed to update MutatingWebhookConfiguration %s: %w", mutatingWebhookName, err)
+	}
+	return nil
+}
diff --git a/pkg/podmanager/utils_test.go b/pkg/podmanager/utils_test.go
index b91c7f8c50..1e7b2b833d 100644
--- a/pkg/podmanager/utils_test.go
+++ b/pkg/podmanager/utils_test.go
@@ 
-6,6 +6,8 @@ package podmanager_test
 import (
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+	"github.com/spidernet-io/spiderpool/pkg/constant"
+	"github.com/spidernet-io/spiderpool/pkg/k8s/apis/spiderpool.spidernet.io/v2beta1"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/utils/ptr"
@@ -75,4 +77,106 @@ var _ = Describe("PodManager utils", Label("pod_manager_utils_test"), func() {
 			Expect(isAlive).To(BeTrue())
 		})
 	})
+
+	Describe("Test injectPodNetwork", func() {
+		var pod *corev1.Pod
+		var multusConfigs v2beta1.SpiderMultusConfigList
+
+		BeforeEach(func() {
+			pod = &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:        "test-pod",
+					Namespace:   "default",
+					Annotations: make(map[string]string),
+				},
+				Spec: corev1.PodSpec{
+					Containers: []corev1.Container{
+						{
+							Name: "test-container",
+							Resources: corev1.ResourceRequirements{
+								Requests: corev1.ResourceList{},
+								Limits:   corev1.ResourceList{},
+							},
+						},
+					},
+				},
+			}
+		})
+
+		It("should successfully inject network configuration", func() {
+			multusConfigs = v2beta1.SpiderMultusConfigList{
+				Items: []v2beta1.SpiderMultusConfig{
+					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:      "config1",
+							Namespace: "default",
+						},
+						Spec: v2beta1.MultusCNIConfigSpec{
+							CniType: ptr.To("macvlan"),
+							MacvlanConfig: &v2beta1.SpiderMacvlanCniConfig{
+								EnableRdma:       true,
+								RdmaResourceName: "spidernet.io/rdma-resource1",
+							},
+						},
+					},
+					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:      "config2",
+							Namespace: "default",
+						},
+						Spec: v2beta1.MultusCNIConfigSpec{
+							CniType: ptr.To("macvlan"),
+							MacvlanConfig: &v2beta1.SpiderMacvlanCniConfig{
+								EnableRdma:       true,
+								RdmaResourceName: "spidernet.io/rdma-resource2",
+							},
+						},
+					},
+				},
+			}
+			err := podmanager.InjectPodNetwork(pod, multusConfigs)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(pod.Annotations[constant.MultusNetworkAttachmentAnnot]).To(Equal("default/config1,default/config2"))
+
+			Expect(pod.Spec.Containers[0].Resources.Requests).To(HaveKey(corev1.ResourceName("spidernet.io/rdma-resource1")))
+			Expect(pod.Spec.Containers[0].Resources.Requests).To(HaveKey(corev1.ResourceName("spidernet.io/rdma-resource2")))
+		})
+
+		It("should return an error when CNI types are inconsistent", func() {
+			multusConfigs = v2beta1.SpiderMultusConfigList{
+				Items: []v2beta1.SpiderMultusConfig{
+					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:      "config1",
+							Namespace: "default",
+						},
+						Spec: v2beta1.MultusCNIConfigSpec{
+							CniType: ptr.To("macvlan"),
+							MacvlanConfig: &v2beta1.SpiderMacvlanCniConfig{
+								EnableRdma:       true,
+								RdmaResourceName: "spidernet.io/rdma-resource1",
+							},
+						},
+					},
+					{
+						ObjectMeta: metav1.ObjectMeta{
+							Name:      "config2",
+							Namespace: "default",
+						},
+						Spec: v2beta1.MultusCNIConfigSpec{
+							CniType: ptr.To("ipvlan"),
+							IPVlanConfig: &v2beta1.SpiderIPvlanCniConfig{
+								EnableRdma:       true,
+								RdmaResourceName: "spidernet.io/rdma-resource2",
+							},
+						},
+					},
+				},
+			}
+
+			err := podmanager.InjectPodNetwork(pod, multusConfigs)
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("cniType ipvlan is not consistent with macvlan"))
+		})
+	})
 })
diff --git a/test/Makefile b/test/Makefile
index ae3323cdb7..e8eee205db 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -338,6 +338,7 @@ setup_spiderpool:
 			HELM_OPTION+=" --set clusterDefaultPool.ipv4IPRanges={$${ipv4_ip_range}} --set clusterDefaultPool.ipv6IPRanges={$${ipv6_ip_range}}" ; \
 			HELM_OPTION+=" --set ipam.enableIPv4=true --set ipam.enableIPv6=true" ; \
 		fi ; \
+		HELM_OPTION+=" --set spiderpoolController.enablePodNetworkResourceInject=true " ; \
 		HELM_OPTION+=" --set spiderpoolAgent.prometheus.enabled=true --set spiderpoolController.prometheus.enabled=true " ; \
 		HELM_OPTION+=" --set spiderpoolAgent.prometheus.enabledDebugMetric=true --set spiderpoolController.prometheus.enabledDebugMetric=true " ; \
 		if [ -n "$(PYROSCOPE_LOCAL_PORT)" ] ; then \
diff --git a/test/doc/podwebhook.md b/test/doc/podwebhook.md
new
file mode 100644
index 0000000000..e3c9ed63e9
--- /dev/null
+++ b/test/doc/podwebhook.md
@@ -0,0 +1,5 @@
+# E2E Cases for Pod Webhook
+
+| Case ID | Title | Priority | Smoke | Status | Other |
+| ------- | --------------------------------------------------------------------------------- | -------- | ----- | ------ | ----- |
+| H00001 | test pod webhook auto inject resource to pod | p1 | true | done | |
diff --git a/test/e2e/podwebhook/podwebhook_suite_test.go b/test/e2e/podwebhook/podwebhook_suite_test.go
new file mode 100644
index 0000000000..66cd0363b6
--- /dev/null
+++ b/test/e2e/podwebhook/podwebhook_suite_test.go
@@ -0,0 +1,26 @@
+package podwebhook_test
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	e2e "github.com/spidernet-io/e2eframework/framework"
+	spiderpool "github.com/spidernet-io/spiderpool/pkg/k8s/apis/spiderpool.spidernet.io/v2beta1"
+	"k8s.io/apimachinery/pkg/runtime"
+)
+
+func TestPodwebhook(t *testing.T) { // go test entry point: hands the Ginkgo specs of this package to the runner
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Podwebhook Suite")
+}
+
+var frame *e2e.Framework // shared e2e framework handle, assigned in BeforeSuite
+
+var _ = BeforeSuite(func() {
+	defer GinkgoRecover()
+	var e error
+	frame, e = e2e.NewFramework(GinkgoT(), []func(*runtime.Scheme) error{spiderpool.AddToScheme})
+	Expect(e).NotTo(HaveOccurred())
+})
diff --git a/test/e2e/podwebhook/podwebhook_test.go b/test/e2e/podwebhook/podwebhook_test.go
new file mode 100644
index 0000000000..b0153ca247
--- /dev/null
+++ b/test/e2e/podwebhook/podwebhook_test.go
@@ -0,0 +1,107 @@
+package podwebhook_test
+
+import (
+	"fmt"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/spidernet-io/spiderpool/pkg/constant"
+	"github.com/spidernet-io/spiderpool/pkg/k8s/apis/spiderpool.spidernet.io/v2beta1"
+	"github.com/spidernet-io/spiderpool/test/e2e/common"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/ptr"
+)
+
+var _ = Describe("Podwebhook", func() {
+	var namespace string
+
+	BeforeEach(func() {
+		// every spec runs in its own randomly named namespace
+		namespace = "ns-" + common.GenerateString(10, true)
+		err := frame.CreateNamespaceUntilDefaultServiceAccountReady(namespace, common.ServiceAccountReadyTimeout)
+		Expect(err).NotTo(HaveOccurred())
+
+		DeferCleanup(func() {
+			if CurrentSpecReport().Failed() {
+				GinkgoWriter.Println("If the use case fails, the cleanup step will be skipped")
+				return
+			}
+
+			err := frame.DeleteNamespace(namespace)
+			Expect(err).NotTo(HaveOccurred(), "Failed to delete namespace %v", namespace)
+		})
+	})
+
+	Context("Test inject pod network resources", func() {
+		It("Test inject pod network resources", Label("H00001"), func() {
+
+			// createNad builds a macvlan SpiderMultusConfig carrying the auto-inject label and a per-name RDMA resource
+			createNad := func(name string) *v2beta1.SpiderMultusConfig {
+				return &v2beta1.SpiderMultusConfig{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      name,
+						Namespace: namespace,
+						Labels: map[string]string{
+							"multus.spidernet.io/auto-inject1": "",
+						},
+					},
+					Spec: v2beta1.MultusCNIConfigSpec{
+						CniType: ptr.To(constant.MacvlanCNI),
+						MacvlanConfig: &v2beta1.SpiderMacvlanCniConfig{
+							Master:           []string{common.NIC1},
+							EnableRdma:       true,
+							RdmaResourceName: "spidernet.io/rdma_resource" + "_" + name,
+						},
+					},
+				}
+			}
+
+			By("Create spiderMultusConfig: nad1 for testing")
+			Expect(frame.CreateSpiderMultusInstance(createNad("nad1"))).NotTo(HaveOccurred())
+			By("Create spiderMultusConfig: nad2 for testing")
+			Expect(frame.CreateSpiderMultusInstance(createNad("nad2"))).NotTo(HaveOccurred())
+
+			pod := &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-pod",
+					Namespace: namespace,
+					Labels: map[string]string{
+						constant.LabelMutatingPodWebhook: "multus.spidernet.io/auto-inject1",
+					},
+				},
+				Spec: corev1.PodSpec{
+					Containers: []corev1.Container{
+						{
+							Name:            "samplepod",
+							Image:           "alpine",
+							ImagePullPolicy: "IfNotPresent",
+							Command:         []string{"/bin/ash", "-c", "while true; do echo 'HTTP/1.1 200 OK Hello, World!' | nc -l -p 80; done"},
+							Ports: []corev1.ContainerPort{
+								{
+									Name:          "samplepod",
+									ContainerPort: 80,
+								},
+							},
+						},
+					},
+				},
+			}
+
+			By("Create Pod for testing network resources inject")
+			err := frame.CreatePod(pod)
+			Expect(err).NotTo(HaveOccurred())
+
+			By("Check pod network annotations and resources")
+			p, err := frame.GetPod(pod.Name, pod.Namespace)
+			Expect(err).NotTo(HaveOccurred(), "failed to get pod: %v", err)
+
+			GinkgoWriter.Printf("Pod annotations: %v\n", p.Annotations)
+			GinkgoWriter.Printf("Pod resources: %v\n", p.Spec.Containers[0].Resources.Requests)
+			Expect(p.Annotations[constant.MultusNetworkAttachmentAnnot]).To(Equal(fmt.Sprintf("%s/%s,%s/%s", namespace, "nad1", namespace, "nad2")))
+			Expect(p.Spec.Containers[0].Resources.Requests).To(HaveKey(corev1.ResourceName("spidernet.io/rdma_resource_nad1")))
+			Expect(p.Spec.Containers[0].Resources.Requests).To(HaveKey(corev1.ResourceName("spidernet.io/rdma_resource_nad2")))
+		})
+	})
+})