Merge pull request kubernetes#1199 from jkaniuk/snapshot-after-teardown
Snapshot Prometheus disk after tearing Prometheus down
k8s-ci-robot committed Apr 30, 2020
2 parents 6c02284 + 2033fcd commit d25aba2
Showing 4 changed files with 79 additions and 10 deletions.
8 changes: 8 additions & 0 deletions clusterloader2/pkg/framework/client/objects.go
@@ -250,6 +250,14 @@ func ListEvents(c clientset.Interface, namespace string, name string, options ..
 	return obj, nil
 }
 
+// DeleteStorageClass deletes storage class with given name.
+func DeleteStorageClass(c clientset.Interface, name string) error {
+	deleteFunc := func() error {
+		return c.StorageV1().StorageClasses().Delete(name, nil)
+	}
+	return RetryWithExponentialBackOff(RetryFunction(deleteFunc, Allow(apierrs.IsNotFound)))
+}
+
 // CreateObject creates object based on given object description.
 func CreateObject(dynamicClient dynamic.Interface, namespace string, name string, obj *unstructured.Unstructured, options ...*ApiCallOptions) error {
 	gvk := obj.GroupVersionKind()
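The new helper is consumed later in this PR by SetUpPrometheusStack. Below is a minimal usage sketch (hypothetical caller, import paths assumed from the repository layout); because DeleteStorageClass tolerates IsNotFound errors via Allow(apierrs.IsNotFound), it is safe to call even when the class does not exist.

// Hypothetical usage sketch - not part of this diff.
package example

import (
	clientset "k8s.io/client-go/kubernetes"

	"k8s.io/perf-tests/clusterloader2/pkg/framework/client"
)

// cleanUpStorageClass removes the "ssd" StorageClass so it can be recreated
// with a different reclaim policy; a missing class is treated as success.
func cleanUpStorageClass(c clientset.Interface) error {
	return client.DeleteStorageClass(c, "ssd")
}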
60 changes: 52 additions & 8 deletions clusterloader2/pkg/prometheus/experimental.go
@@ -23,6 +23,7 @@ import (
 	"time"
 
 	"github.com/spf13/pflag"
+	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/klog"
@@ -33,6 +34,12 @@ type prometheusDiskMetadata struct {
 	zone string
 }
 
+const (
+	gcloudRetryInterval = 20 * time.Second
+	snapshotRetryTimeout = 10 * time.Minute
+	deleteRetryTimeout = 2 * time.Minute
+)
+
 var (
 	shouldSnapshotPrometheusDisk = pflag.Bool("experimental-gcp-snapshot-prometheus-disk", false, "(experimental, provider=gce|gke only) whether to snapshot Prometheus disk before Prometheus stack is torn down")
 	prometheusDiskSnapshotName = pflag.String("experimental-prometheus-disk-snapshot-name", "", "Name of the prometheus disk snapshot that will be created if snapshots are enabled. If not set, the prometheus disk name will be used.")
@@ -73,6 +80,9 @@ func (pc *PrometheusController) tryRetrievePrometheusDiskMetadata() (bool, error
 		if pv.Spec.ClaimRef.Name != "prometheus-k8s-db-prometheus-k8s-0" {
 			continue
 		}
+		if pv.Status.Phase != corev1.VolumeBound {
+			continue
+		}
 		klog.Infof("Found Prometheus' PV with name: %s", pv.Name)
 		pdName = pv.Spec.GCEPersistentDisk.PDName
 		zone = pv.ObjectMeta.Labels["failure-domain.beta.kubernetes.io/zone"]
@@ -98,15 +108,10 @@ func (pc *PrometheusController) snapshotPrometheusDiskIfEnabled() error {
 	if enabled, err := pc.isEnabled(); !enabled {
 		return err
 	}
-	// Update cache of Prometheus disk metadata
-	err := wait.Poll(
-		10*time.Second,
-		2*time.Minute,
-		pc.tryRetrievePrometheusDiskMetadata)
 	if pc.diskMetadata.name == "" || pc.diskMetadata.zone == "" {
 		klog.Errorf("Missing zone or PD name, aborting snapshot")
 		klog.Infof("PD name=%s, zone=%s", pc.diskMetadata.name, pc.diskMetadata.zone)
-		return err
+		return fmt.Errorf("missing zone or PD name, aborting snapshot")
 	}
 	// Select snapshot name
 	snapshotName := pc.diskMetadata.name
@@ -119,8 +124,8 @@ func (pc *PrometheusController) snapshotPrometheusDiskIfEnabled() error {
 	}
 	// Snapshot Prometheus disk
 	return wait.Poll(
-		20*time.Second,
-		10*time.Minute,
+		gcloudRetryInterval,
+		snapshotRetryTimeout,
 		func() (bool, error) {
 			err := pc.trySnapshotPrometheusDisk(pc.diskMetadata.name, snapshotName, pc.diskMetadata.zone)
 			// Poll() stops on error so returning nil
@@ -146,3 +151,42 @@ func (pc *PrometheusController) trySnapshotPrometheusDisk(pdName, snapshotName,
 	}
 	return err
 }
+
+func (pc *PrometheusController) deletePrometheusDiskIfEnabled() error {
+	if enabled, err := pc.isEnabled(); !enabled {
+		return err
+	}
+	if pc.diskMetadata.name == "" || pc.diskMetadata.zone == "" {
+		klog.Errorf("Missing zone or PD name, aborting deletion")
+		klog.Infof("PD name=%s, zone=%s", pc.diskMetadata.name, pc.diskMetadata.zone)
+		return fmt.Errorf("missing zone or PD name, aborting deletion")
+	}
+	// Delete Prometheus disk
+	return wait.Poll(
+		gcloudRetryInterval,
+		deleteRetryTimeout,
+		func() (bool, error) {
+			err := pc.tryDeletePrometheusDisk(pc.diskMetadata.name, pc.diskMetadata.zone)
+			// Poll() stops on error so returning nil
+			return err == nil, nil
+		})
+}
+
+func (pc *PrometheusController) tryDeletePrometheusDisk(pdName, zone string) error {
+	klog.Info("Trying to delete Prometheus' persistent disk...")
+	project := pc.clusterLoaderConfig.PrometheusConfig.SnapshotProject
+	if project == "" {
+		// This should never happen when run from kubetest with a GCE/GKE Kubernetes
+		// provider - kubetest always propagates PROJECT env var in such situations.
+		return fmt.Errorf("unknown project - please set --experimental-snapshot-project flag")
+	}
+	klog.Infof("Deleting PD %q in project %q in zone %q", pdName, project, zone)
+	cmd := exec.Command("gcloud", "compute", "disks", "delete", pdName, "--project", project, "--zone", zone)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		klog.Errorf("Deleting disk failed: %v\nCommand output: %q", err, string(output))
+	} else {
+		klog.Infof("Deleting disk finished with: %q", string(output))
+	}
+	return err
+}
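The body of trySnapshotPrometheusDisk is collapsed in this view. For orientation only, here is a hypothetical sketch of the kind of call it presumably wraps, mirroring tryDeletePrometheusDisk above and assuming the same os/exec and klog imports as the surrounding file; the real function's exact flags and logging may differ.

// Hypothetical sketch - not the collapsed code from this diff.
func snapshotDisk(pdName, snapshotName, project, zone string) error {
	cmd := exec.Command("gcloud", "compute", "disks", "snapshot", pdName,
		"--project", project, "--zone", zone, "--snapshot-names", snapshotName)
	output, err := cmd.CombinedOutput()
	if err != nil {
		klog.Errorf("Snapshotting disk failed: %v\nCommand output: %q", err, string(output))
	}
	return err
}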
@@ -5,3 +5,6 @@ metadata:
 provisioner: kubernetes.io/gce-pd
 parameters:
   type: pd-ssd
+{{if .RetainPD}}
+reclaimPolicy: Retain
+{{end}}
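This hunk belongs to the StorageClass manifest for Prometheus' persistent disk (the file path is not shown in this view). When RetainPD is true, i.e. when disk snapshotting is enabled, the template renders roughly as in the sketch below, so the underlying GCE PD outlives the PVC and can still be snapshotted and deleted after the namespace is gone. The apiVersion, kind and name lines are assumptions; prometheus.go elsewhere in this PR only refers to the class as "ssd".

# Rendered sketch (header fields assumed; only the last three lines come from this diff)
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: ssd
provisioner: kubernetes.io/gce-pd
parameters:
  type: pd-ssd
reclaimPolicy: Retain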
18 changes: 16 additions & 2 deletions clusterloader2/pkg/prometheus/prometheus.go
@@ -40,6 +40,7 @@ import (
 
 const (
 	namespace = "monitoring"
+	storageClass = "ssd"
 	coreManifests = "$GOPATH/src/k8s.io/perf-tests/clusterloader2/pkg/prometheus/manifests/*.yaml"
 	defaultServiceMonitors = "$GOPATH/src/k8s.io/perf-tests/clusterloader2/pkg/prometheus/manifests/default/*.yaml"
 	masterIPServiceMonitors = "$GOPATH/src/k8s.io/perf-tests/clusterloader2/pkg/prometheus/manifests/master-ip/*.yaml"
@@ -120,6 +121,8 @@ func NewPrometheusController(clusterLoaderConfig *config.ClusterLoaderConfig) (p
 		clusterLoaderConfig.PrometheusConfig.ScrapeKubeProxy = mapping["PROMETHEUS_SCRAPE_KUBE_PROXY"].(bool)
 	}
 	mapping["PROMETHEUS_SCRAPE_KUBELETS"] = clusterLoaderConfig.PrometheusConfig.ScrapeKubelets
+	snapshotEnabled, _ := pc.isEnabled()
+	mapping["RetainPD"] = snapshotEnabled
 
 	pc.templateMapping = mapping
 
@@ -144,6 +147,10 @@ func (pc *PrometheusController) SetUpPrometheusStack() error {
 			return err
 		}
 	}
+	// Removing Storage Class as Reclaim Policy cannot be changed
+	if err := client.DeleteStorageClass(k8sClient, storageClass); err != nil {
+		return err
+	}
 	if err := pc.applyManifests(coreManifests); err != nil {
 		return err
 	}
@@ -179,8 +186,9 @@ func (pc *PrometheusController) SetUpPrometheusStack() error {

 // TearDownPrometheusStack tears down prometheus stack, releasing all prometheus resources.
 func (pc *PrometheusController) TearDownPrometheusStack() error {
-	if err := pc.snapshotPrometheusDiskIfEnabled(); err != nil {
-		klog.Warningf("Error while snapshotting prometheus disk: %v", err)
+	// Get disk metadata again to be sure
+	if err := pc.cachePrometheusDiskMetadataIfEnabled(); err != nil {
+		klog.Warningf("Error while caching prometheus disk metadata: %v", err)
 	}
 	klog.Info("Tearing down prometheus stack")
 	k8sClient := pc.framework.GetClientSets().GetClient()
@@ -190,6 +198,12 @@ func (pc *PrometheusController) TearDownPrometheusStack() error {
 	if err := client.WaitForDeleteNamespace(k8sClient, namespace); err != nil {
 		return err
 	}
+	if err := pc.snapshotPrometheusDiskIfEnabled(); err != nil {
+		klog.Warningf("Error while snapshotting prometheus disk: %v", err)
+	}
+	if err := pc.deletePrometheusDiskIfEnabled(); err != nil {
+		klog.Warningf("Error while deleting prometheus disk: %v", err)
+	}
 	return nil
 }
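With this change, TearDownPrometheusStack refreshes the cached disk metadata while the PV still exists, deletes the monitoring namespace, and only then snapshots and deletes the released disk. The cachePrometheusDiskMetadataIfEnabled helper it calls is not part of this diff; a plausible sketch, reusing the wait.Poll loop that was removed from snapshotPrometheusDiskIfEnabled (an assumption, the real helper may differ):

// Hypothetical sketch - the actual helper is not shown in this diff.
func (pc *PrometheusController) cachePrometheusDiskMetadataIfEnabled() error {
	if enabled, err := pc.isEnabled(); !enabled {
		return err
	}
	// Refresh pc.diskMetadata before the PV is deleted along with the namespace.
	return wait.Poll(10*time.Second, 2*time.Minute, pc.tryRetrievePrometheusDiskMetadata)
}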
