From 2033fcd436a59cd429a761e5d62071dd16ca4e7d Mon Sep 17 00:00:00 2001 From: Jacek Kaniuk Date: Wed, 29 Apr 2020 22:59:19 +0200 Subject: [PATCH] Snapshot Prometheus disk after tearing Prometheus down --- .../pkg/framework/client/objects.go | 8 +++ clusterloader2/pkg/prometheus/experimental.go | 60 ++++++++++++++++--- .../manifests/0ssd-storage-class.yaml | 3 + clusterloader2/pkg/prometheus/prometheus.go | 18 +++++- 4 files changed, 79 insertions(+), 10 deletions(-) diff --git a/clusterloader2/pkg/framework/client/objects.go b/clusterloader2/pkg/framework/client/objects.go index cb0440e48..b530e23d0 100644 --- a/clusterloader2/pkg/framework/client/objects.go +++ b/clusterloader2/pkg/framework/client/objects.go @@ -250,6 +250,14 @@ func ListEvents(c clientset.Interface, namespace string, name string, options .. return obj, nil } +// DeleteStorageClass deletes storage class with given name. +func DeleteStorageClass(c clientset.Interface, name string) error { + deleteFunc := func() error { + return c.StorageV1().StorageClasses().Delete(name, nil) + } + return RetryWithExponentialBackOff(RetryFunction(deleteFunc, Allow(apierrs.IsNotFound))) +} + // CreateObject creates object based on given object description. func CreateObject(dynamicClient dynamic.Interface, namespace string, name string, obj *unstructured.Unstructured, options ...*ApiCallOptions) error { gvk := obj.GroupVersionKind() diff --git a/clusterloader2/pkg/prometheus/experimental.go b/clusterloader2/pkg/prometheus/experimental.go index a2e1b55c1..188ac93cc 100644 --- a/clusterloader2/pkg/prometheus/experimental.go +++ b/clusterloader2/pkg/prometheus/experimental.go @@ -23,6 +23,7 @@ import ( "time" "github.com/spf13/pflag" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog" @@ -33,6 +34,12 @@ type prometheusDiskMetadata struct { zone string } +const ( + gcloudRetryInterval = 20 * time.Second + snapshotRetryTimeout = 10 * time.Minute + deleteRetryTimeout = 2 * time.Minute +) + var ( shouldSnapshotPrometheusDisk = pflag.Bool("experimental-gcp-snapshot-prometheus-disk", false, "(experimental, provider=gce|gke only) whether to snapshot Prometheus disk before Prometheus stack is torn down") prometheusDiskSnapshotName = pflag.String("experimental-prometheus-disk-snapshot-name", "", "Name of the prometheus disk snapshot that will be created if snapshots are enabled. If not set, the prometheus disk name will be used.") @@ -73,6 +80,9 @@ func (pc *PrometheusController) tryRetrievePrometheusDiskMetadata() (bool, error if pv.Spec.ClaimRef.Name != "prometheus-k8s-db-prometheus-k8s-0" { continue } + if pv.Status.Phase != corev1.VolumeBound { + continue + } klog.Infof("Found Prometheus' PV with name: %s", pv.Name) pdName = pv.Spec.GCEPersistentDisk.PDName zone = pv.ObjectMeta.Labels["failure-domain.beta.kubernetes.io/zone"] @@ -98,15 +108,10 @@ func (pc *PrometheusController) snapshotPrometheusDiskIfEnabled() error { if enabled, err := pc.isEnabled(); !enabled { return err } - // Update cache of Prometheus disk metadata - err := wait.Poll( - 10*time.Second, - 2*time.Minute, - pc.tryRetrievePrometheusDiskMetadata) if pc.diskMetadata.name == "" || pc.diskMetadata.zone == "" { klog.Errorf("Missing zone or PD name, aborting snapshot") klog.Infof("PD name=%s, zone=%s", pc.diskMetadata.name, pc.diskMetadata.zone) - return err + return fmt.Errorf("missing zone or PD name, aborting snapshot") } // Select snapshot name snapshotName := pc.diskMetadata.name @@ -119,8 +124,8 @@ func (pc *PrometheusController) snapshotPrometheusDiskIfEnabled() error { } // Snapshot Prometheus disk return wait.Poll( - 20*time.Second, - 10*time.Minute, + gcloudRetryInterval, + snapshotRetryTimeout, func() (bool, error) { err := pc.trySnapshotPrometheusDisk(pc.diskMetadata.name, snapshotName, pc.diskMetadata.zone) // Poll() stops on error so returning nil @@ -146,3 +151,42 @@ func (pc *PrometheusController) trySnapshotPrometheusDisk(pdName, snapshotName, } return err } + +func (pc *PrometheusController) deletePrometheusDiskIfEnabled() error { + if enabled, err := pc.isEnabled(); !enabled { + return err + } + if pc.diskMetadata.name == "" || pc.diskMetadata.zone == "" { + klog.Errorf("Missing zone or PD name, aborting deletion") + klog.Infof("PD name=%s, zone=%s", pc.diskMetadata.name, pc.diskMetadata.zone) + return fmt.Errorf("missing zone or PD name, aborting deletion") + } + // Delete Prometheus disk + return wait.Poll( + gcloudRetryInterval, + deleteRetryTimeout, + func() (bool, error) { + err := pc.tryDeletePrometheusDisk(pc.diskMetadata.name, pc.diskMetadata.zone) + // Poll() stops on error so returning nil + return err == nil, nil + }) +} + +func (pc *PrometheusController) tryDeletePrometheusDisk(pdName, zone string) error { + klog.Info("Trying to delete Prometheus' persistent disk...") + project := pc.clusterLoaderConfig.PrometheusConfig.SnapshotProject + if project == "" { + // This should never happen when run from kubetest with a GCE/GKE Kubernetes + // provider - kubetest always propagates PROJECT env var in such situations. + return fmt.Errorf("unknown project - please set --experimental-snapshot-project flag") + } + klog.Infof("Deleting PD %q in project %q in zone %q", pdName, project, zone) + cmd := exec.Command("gcloud", "compute", "disks", "delete", pdName, "--project", project, "--zone", zone) + output, err := cmd.CombinedOutput() + if err != nil { + klog.Errorf("Deleting disk failed: %v\nCommand output: %q", err, string(output)) + } else { + klog.Infof("Deleting disk finished with: %q", string(output)) + } + return err +} diff --git a/clusterloader2/pkg/prometheus/manifests/0ssd-storage-class.yaml b/clusterloader2/pkg/prometheus/manifests/0ssd-storage-class.yaml index a87cca26a..b780c0bf5 100644 --- a/clusterloader2/pkg/prometheus/manifests/0ssd-storage-class.yaml +++ b/clusterloader2/pkg/prometheus/manifests/0ssd-storage-class.yaml @@ -5,3 +5,6 @@ metadata: provisioner: kubernetes.io/gce-pd parameters: type: pd-ssd +{{if .RetainPD}} +reclaimPolicy: Retain +{{end}} diff --git a/clusterloader2/pkg/prometheus/prometheus.go b/clusterloader2/pkg/prometheus/prometheus.go index de9d70bb8..fd406b09d 100644 --- a/clusterloader2/pkg/prometheus/prometheus.go +++ b/clusterloader2/pkg/prometheus/prometheus.go @@ -40,6 +40,7 @@ import ( const ( namespace = "monitoring" + storageClass = "ssd" coreManifests = "$GOPATH/src/k8s.io/perf-tests/clusterloader2/pkg/prometheus/manifests/*.yaml" defaultServiceMonitors = "$GOPATH/src/k8s.io/perf-tests/clusterloader2/pkg/prometheus/manifests/default/*.yaml" masterIPServiceMonitors = "$GOPATH/src/k8s.io/perf-tests/clusterloader2/pkg/prometheus/manifests/master-ip/*.yaml" @@ -120,6 +121,8 @@ func NewPrometheusController(clusterLoaderConfig *config.ClusterLoaderConfig) (p clusterLoaderConfig.PrometheusConfig.ScrapeKubeProxy = mapping["PROMETHEUS_SCRAPE_KUBE_PROXY"].(bool) } mapping["PROMETHEUS_SCRAPE_KUBELETS"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubelets + snapshotEnabled, _ := pc.isEnabled() + mapping["RetainPD"] = snapshotEnabled pc.templateMapping = mapping @@ -144,6 +147,10 @@ func (pc *PrometheusController) SetUpPrometheusStack() error { return err } } + // Removing Storage Class as Reclaim Policy cannot be changed + if err := client.DeleteStorageClass(k8sClient, storageClass); err != nil { + return err + } if err := pc.applyManifests(coreManifests); err != nil { return err } @@ -179,8 +186,9 @@ func (pc *PrometheusController) SetUpPrometheusStack() error { // TearDownPrometheusStack tears down prometheus stack, releasing all prometheus resources. func (pc *PrometheusController) TearDownPrometheusStack() error { - if err := pc.snapshotPrometheusDiskIfEnabled(); err != nil { - klog.Warningf("Error while snapshotting prometheus disk: %v", err) + // Get disk metadata again to be sure + if err := pc.cachePrometheusDiskMetadataIfEnabled(); err != nil { + klog.Warningf("Error while caching prometheus disk metadata: %v", err) } klog.Info("Tearing down prometheus stack") k8sClient := pc.framework.GetClientSets().GetClient() @@ -190,6 +198,12 @@ func (pc *PrometheusController) TearDownPrometheusStack() error { if err := client.WaitForDeleteNamespace(k8sClient, namespace); err != nil { return err } + if err := pc.snapshotPrometheusDiskIfEnabled(); err != nil { + klog.Warningf("Error while snapshotting prometheus disk: %v", err) + } + if err := pc.deletePrometheusDiskIfEnabled(); err != nil { + klog.Warningf("Error while deleting prometheus disk: %v", err) + } return nil }