From 3b2d411651636baa3e9b4de8d9e9dba83b20e7f8 Mon Sep 17 00:00:00 2001 From: Martin Chodur Date: Sat, 10 Aug 2019 17:25:18 +0200 Subject: [PATCH 1/7] feat sidecar: added readiness prober Signed-off-by: Martin Chodur --- CHANGELOG.md | 1 + cmd/thanos/main.go | 2 +- cmd/thanos/sidecar.go | 29 ++++++++++++++----- .../manifests/prometheus-ha-sidecar-lts.yaml | 8 +++++ .../manifests/prometheus-ha-sidecar.yaml | 8 +++++ 5 files changed, 39 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71834004b8..cb77bd0971 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel - [#1358](https://github.com/thanos-io/thanos/pull/1358) Added `part_size` configuration option for HTTP multipart requests minimum part size for S3 storage type - [#1363](https://github.com/thanos-io/thanos/pull/1363) Thanos Receive now exposes `thanos_receive_hashring_nodes` and `thanos_receive_hashring_tenants` metrics to monitor status of hash-rings +- [#1395](https://github.com/thanos-io/thanos/pull/1395) Added `/-/ready` and `/-/healthy` endpoints to Thanos sidecar. 
### Changed diff --git a/cmd/thanos/main.go b/cmd/thanos/main.go index a44ac43624..df233e6563 100644 --- a/cmd/thanos/main.go +++ b/cmd/thanos/main.go @@ -70,7 +70,7 @@ func main() { tracingConfig := regCommonTracingFlags(app) cmds := map[string]setupFunc{} - registerSidecar(cmds, app, "sidecar") + registerSidecar(cmds, app) registerStore(cmds, app, "store") registerQuery(cmds, app, "query") registerRule(cmds, app, "rule") diff --git a/cmd/thanos/sidecar.go b/cmd/thanos/sidecar.go index 1cd95bc44c..3f7222c33d 100644 --- a/cmd/thanos/sidecar.go +++ b/cmd/thanos/sidecar.go @@ -8,10 +8,12 @@ import ( "sync" "time" + "github.com/thanos-io/thanos/pkg/prober" + "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" "github.com/oklog/run" - opentracing "github.com/opentracing/opentracing-go" + "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" @@ -26,13 +28,14 @@ import ( "github.com/thanos-io/thanos/pkg/store" "github.com/thanos-io/thanos/pkg/store/storepb" "google.golang.org/grpc" - kingpin "gopkg.in/alecthomas/kingpin.v2" + "gopkg.in/alecthomas/kingpin.v2" ) const waitForExternalLabelsTimeout = 10 * time.Minute -func registerSidecar(m map[string]setupFunc, app *kingpin.Application, name string) { - cmd := app.Command(name, "sidecar for Prometheus server") +func registerSidecar(m map[string]setupFunc, app *kingpin.Application) { + comp := component.Sidecar + cmd := app.Command(comp.String(), "sidecar for Prometheus server") grpcBindAddr, httpBindAddr, cert, key, clientCA := regCommonServerFlags(cmd) @@ -54,7 +57,7 @@ func registerSidecar(m map[string]setupFunc, app *kingpin.Application, name stri uploadCompacted := cmd.Flag("shipper.upload-compacted", "[Experimental] If true sidecar will try to upload compacted blocks as well. Useful for migration purposes. 
Works only if compaction is disabled on Prometheus.").Default("false").Hidden().Bool() - m[name] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ bool) error { + m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ bool) error { rl := reloader.New( log.With(logger, "component", "reloader"), reloader.ReloadURLFromBase(*promURL), @@ -77,6 +80,7 @@ func registerSidecar(m map[string]setupFunc, app *kingpin.Application, name stri objStoreConfig, rl, *uploadCompacted, + comp, ) } } @@ -96,6 +100,7 @@ func runSidecar( objStoreConfig *pathOrContent, reloader *reloader.Reloader, uploadCompacted bool, + comp component.Component, ) error { var m = &promMetadata{ promURL: promURL, @@ -117,6 +122,12 @@ func runSidecar( uploads = false } + readinessProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) + // Initiate default HTTP listener providing metrics endpoint and readiness/liveness probes. + if err := defaultHTTPListener(g, logger, reg, httpBindAddr, readinessProber); err != nil { + return errors.Wrap(err, "create readiness prober") + } + // Setup all the concurrent groups. 
{ promUp := prometheus.NewGauge(prometheus.GaugeOpts{ @@ -148,6 +159,7 @@ func runSidecar( "err", err, ) promUp.Set(0) + readinessProber.SetNotReady(err) return err } @@ -156,6 +168,7 @@ func runSidecar( "external_labels", m.Labels().String(), ) promUp.Set(1) + readinessProber.SetReady() lastHeartbeat.Set(float64(time.Now().UnixNano()) / 1e9) return nil }) @@ -176,8 +189,10 @@ func runSidecar( if err := m.UpdateLabels(iterCtx, logger); err != nil { level.Warn(logger).Log("msg", "heartbeat failed", "err", err) promUp.Set(0) + readinessProber.SetNotReady(err) } else { promUp.Set(1) + readinessProber.SetReady() lastHeartbeat.Set(float64(time.Now().UnixNano()) / 1e9) } @@ -195,9 +210,7 @@ func runSidecar( cancel() }) } - if err := metricHTTPListenGroup(g, logger, reg, httpBindAddr); err != nil { - return err - } + { l, err := net.Listen("tcp", grpcBindAddr) if err != nil { diff --git a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml index 02d37d2dc5..a4362035e2 100644 --- a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml +++ b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml @@ -145,6 +145,14 @@ spec: containerPort: 10902 - name: grpc containerPort: 10901 + livenessProbe: + httpGet: + port: 10902 + path: /-/healthy + readinessProbe: + httpGet: + port: 10902 + path: /-/ready volumeMounts: - name: prometheus mountPath: /var/prometheus diff --git a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml index 9e30120ade..11b4830360 100644 --- a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml +++ b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml @@ -129,6 +129,14 @@ spec: containerPort: 10902 - name: grpc containerPort: 10901 + livenessProbe: + httpGet: + port: 10902 + path: /-/healthy + readinessProbe: + httpGet: + port: 10902 + path: /-/ready 
volumeMounts: - name: prometheus mountPath: /var/prometheus From 9193da38ecc38778654c25e65f5d87a92d908610 Mon Sep 17 00:00:00 2001 From: Martin Chodur Date: Wed, 14 Aug 2019 06:19:06 +0200 Subject: [PATCH 2/7] cr: inline the component usage Signed-off-by: Martin Chodur --- cmd/thanos/compact.go | 7 +++---- cmd/thanos/sidecar.go | 10 ++++------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index b7e3f01acb..1a90d5a7c0 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -70,8 +70,7 @@ func (cs compactionSet) maxLevel() int { } func registerCompact(m map[string]setupFunc, app *kingpin.Application) { - comp := component.Compact - cmd := app.Command(comp.String(), "continuously compacts blocks in an object store bucket") + cmd := app.Command(component.Compact.String(), "continuously compacts blocks in an object store bucket") haltOnError := cmd.Flag("debug.halt-on-error", "Halt the process if a critical compaction error is detected."). Hidden().Default("true").Bool() @@ -112,7 +111,7 @@ func registerCompact(m map[string]setupFunc, app *kingpin.Application) { compactionConcurrency := cmd.Flag("compact.concurrency", "Number of goroutines to use when compacting groups."). 
Default("1").Int() - m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ bool) error { + m[component.Compact.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ bool) error { return runCompact(g, logger, reg, *httpAddr, *dataDir, @@ -127,7 +126,7 @@ func registerCompact(m map[string]setupFunc, app *kingpin.Application) { compact.ResolutionLevel5m: time.Duration(*retention5m), compact.ResolutionLevel1h: time.Duration(*retention1h), }, - comp, + component.Compact, *disableDownsampling, *maxCompactionLevel, *blockSyncConcurrency, diff --git a/cmd/thanos/sidecar.go b/cmd/thanos/sidecar.go index 3f7222c33d..315273b1a2 100644 --- a/cmd/thanos/sidecar.go +++ b/cmd/thanos/sidecar.go @@ -8,8 +8,6 @@ import ( "sync" "time" - "github.com/thanos-io/thanos/pkg/prober" - "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" "github.com/oklog/run" @@ -21,6 +19,7 @@ import ( "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/objstore/client" + "github.com/thanos-io/thanos/pkg/prober" "github.com/thanos-io/thanos/pkg/promclient" "github.com/thanos-io/thanos/pkg/reloader" "github.com/thanos-io/thanos/pkg/runutil" @@ -34,8 +33,7 @@ import ( const waitForExternalLabelsTimeout = 10 * time.Minute func registerSidecar(m map[string]setupFunc, app *kingpin.Application) { - comp := component.Sidecar - cmd := app.Command(comp.String(), "sidecar for Prometheus server") + cmd := app.Command(component.Sidecar.String(), "sidecar for Prometheus server") grpcBindAddr, httpBindAddr, cert, key, clientCA := regCommonServerFlags(cmd) @@ -57,7 +55,7 @@ func registerSidecar(m map[string]setupFunc, app *kingpin.Application) { uploadCompacted := cmd.Flag("shipper.upload-compacted", "[Experimental] If true sidecar will try to upload compacted blocks as well. Useful for migration purposes. 
Works only if compaction is disabled on Prometheus.").Default("false").Hidden().Bool() - m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ bool) error { + m[component.Sidecar.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ bool) error { rl := reloader.New( log.With(logger, "component", "reloader"), reloader.ReloadURLFromBase(*promURL), @@ -80,7 +78,7 @@ func registerSidecar(m map[string]setupFunc, app *kingpin.Application) { objStoreConfig, rl, *uploadCompacted, - comp, + component.Sidecar, ) } } From cdc30f425dc8efdce1bffca99b973aaffd99c6bb Mon Sep 17 00:00:00 2001 From: Martin Chodur Date: Wed, 14 Aug 2019 06:41:01 +0200 Subject: [PATCH 3/7] cr: refactor sidecar prober logic Signed-off-by: Martin Chodur --- cmd/thanos/compact.go | 6 +++--- cmd/thanos/sidecar.go | 27 +++++++++++++++++++++------ pkg/prober/prober.go | 13 +++++++++++++ 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index 1a90d5a7c0..08bbba23ea 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -169,9 +169,9 @@ func runCompact( downsampleMetrics := newDownsampleMetrics(reg) - readinessProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) + statusProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) // Initiate default HTTP listener providing metrics endpoint and readiness/liveness probes. 
- if err := defaultHTTPListener(g, logger, reg, httpBindAddr, readinessProber); err != nil { + if err := defaultHTTPListener(g, logger, reg, httpBindAddr, statusProber); err != nil { return errors.Wrap(err, "create readiness prober") } @@ -326,7 +326,7 @@ func runCompact( }) level.Info(logger).Log("msg", "starting compact node") - readinessProber.SetReady() + statusProber.SetReady() return nil } diff --git a/cmd/thanos/sidecar.go b/cmd/thanos/sidecar.go index 315273b1a2..3771cca156 100644 --- a/cmd/thanos/sidecar.go +++ b/cmd/thanos/sidecar.go @@ -120,9 +120,9 @@ func runSidecar( uploads = false } - readinessProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) + statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) // Initiate default HTTP listener providing metrics endpoint and readiness/liveness probes. - if err := defaultHTTPListener(g, logger, reg, httpBindAddr, readinessProber); err != nil { + if err := defaultHTTPListener(g, logger, reg, httpBindAddr, statusProber); err != nil { return errors.Wrap(err, "create readiness prober") } @@ -148,6 +148,12 @@ func runSidecar( } } + // When the heartbeat to Prometheus fails, the sidecar is marked as not ready. + // But after `sinceLastSuccessfulHeartbeatLimit` duration of consequential fails it's marked also not healthy, + // so the orchestrator (if any) can try restarting it if it would help. + sinceLastSuccessfulHeartbeat := 0 * time.Minute + sinceLastSuccessfulHeartbeatLimit := 3 * time.Minute + // Blocking query of external labels before joining as a Source Peer into gossip. // We retry infinitely until we reach and fetch labels from our Prometheus. 
err := runutil.Retry(2*time.Second, ctx.Done(), func() error { @@ -157,7 +163,10 @@ func runSidecar( "err", err, ) promUp.Set(0) - readinessProber.SetNotReady(err) + statusProber.SetNotReady(err) + if sinceLastSuccessfulHeartbeat >= sinceLastSuccessfulHeartbeatLimit { + statusProber.SetNotHealthy(err) + } return err } @@ -166,7 +175,8 @@ func runSidecar( "external_labels", m.Labels().String(), ) promUp.Set(1) - readinessProber.SetReady() + statusProber.SetReady() + sinceLastSuccessfulHeartbeat = 0 lastHeartbeat.Set(float64(time.Now().UnixNano()) / 1e9) return nil }) @@ -187,10 +197,15 @@ func runSidecar( if err := m.UpdateLabels(iterCtx, logger); err != nil { level.Warn(logger).Log("msg", "heartbeat failed", "err", err) promUp.Set(0) - readinessProber.SetNotReady(err) + statusProber.SetNotReady(err) + if sinceLastSuccessfulHeartbeat >= sinceLastSuccessfulHeartbeatLimit { + statusProber.SetNotHealthy(err) + } + sinceLastSuccessfulHeartbeat = 0 } else { promUp.Set(1) - readinessProber.SetReady() + statusProber.SetReady() + sinceLastSuccessfulHeartbeat = 0 lastHeartbeat.Set(float64(time.Now().UnixNano()) / 1e9) } diff --git a/pkg/prober/prober.go b/pkg/prober/prober.go index 1bf6c32f40..b00ee1fef6 100644 --- a/pkg/prober/prober.go +++ b/pkg/prober/prober.go @@ -37,6 +37,19 @@ type Prober struct { func NewProber(component component.Component, logger log.Logger, reg prometheus.Registerer) *Prober { initialErr := fmt.Errorf(initialErrorFmt, component) + // From Kubernetes documentation https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/ : + // + // liveness: Many applications running for long periods of time eventually transition to broken states, + // (healthy) and cannot recover except by being restarted. + // Kubernetes provides liveness probes to detect and remedy such situations. + // + // readiness: Sometimes, applications are temporarily unable to serve traffic. 
+ // (ready) For example, an application might need to load large data or configuration files during startup, + // or depend on external services after startup. In such cases, you don’t want to kill the application, + // but you don’t want to send it requests either. Kubernetes provides readiness probes to detect + // and mitigate these situations. A pod with containers reporting that they are not ready + // does not receive traffic through Kubernetes Services. + p := &Prober{ component: component, logger: logger, From 153cd3f54544a1e3cd2137091b6a0a565af5d5b0 Mon Sep 17 00:00:00 2001 From: Martin Chodur Date: Mon, 19 Aug 2019 09:37:41 +0200 Subject: [PATCH 4/7] CR: revert to original PR logic without touching sidecar liveness Signed-off-by: Martin Chodur --- cmd/thanos/sidecar.go | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/cmd/thanos/sidecar.go b/cmd/thanos/sidecar.go index 3771cca156..316576a7c3 100644 --- a/cmd/thanos/sidecar.go +++ b/cmd/thanos/sidecar.go @@ -148,12 +148,6 @@ func runSidecar( } } - // When the heartbeat to Prometheus fails, the sidecar is marked as not ready. - // But after `sinceLastSuccessfulHeartbeatLimit` duration of consequential fails it's marked also not healthy, - // so the orchestrator (if any) can try restarting it if it would help. - sinceLastSuccessfulHeartbeat := 0 * time.Minute - sinceLastSuccessfulHeartbeatLimit := 3 * time.Minute - // Blocking query of external labels before joining as a Source Peer into gossip. // We retry infinitely until we reach and fetch labels from our Prometheus. 
err := runutil.Retry(2*time.Second, ctx.Done(), func() error { @@ -164,9 +158,6 @@ func runSidecar( ) promUp.Set(0) statusProber.SetNotReady(err) - if sinceLastSuccessfulHeartbeat >= sinceLastSuccessfulHeartbeatLimit { - statusProber.SetNotHealthy(err) - } return err } @@ -176,7 +167,6 @@ func runSidecar( ) promUp.Set(1) statusProber.SetReady() - sinceLastSuccessfulHeartbeat = 0 lastHeartbeat.Set(float64(time.Now().UnixNano()) / 1e9) return nil }) @@ -198,14 +188,9 @@ func runSidecar( level.Warn(logger).Log("msg", "heartbeat failed", "err", err) promUp.Set(0) statusProber.SetNotReady(err) - if sinceLastSuccessfulHeartbeat >= sinceLastSuccessfulHeartbeatLimit { - statusProber.SetNotHealthy(err) - } - sinceLastSuccessfulHeartbeat = 0 } else { promUp.Set(1) statusProber.SetReady() - sinceLastSuccessfulHeartbeat = 0 lastHeartbeat.Set(float64(time.Now().UnixNano()) / 1e9) } From d351d03eabea190cf65c36971a3be49bea5595a6 Mon Sep 17 00:00:00 2001 From: Martin Chodur Date: Tue, 20 Aug 2019 15:19:58 +0200 Subject: [PATCH 5/7] CR: removed readiness handling from periodical prometheus health checks Signed-off-by: Martin Chodur --- cmd/thanos/sidecar.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/cmd/thanos/sidecar.go b/cmd/thanos/sidecar.go index 316576a7c3..2cc8476e69 100644 --- a/cmd/thanos/sidecar.go +++ b/cmd/thanos/sidecar.go @@ -187,10 +187,8 @@ func runSidecar( if err := m.UpdateLabels(iterCtx, logger); err != nil { level.Warn(logger).Log("msg", "heartbeat failed", "err", err) promUp.Set(0) - statusProber.SetNotReady(err) } else { promUp.Set(1) - statusProber.SetReady() lastHeartbeat.Set(float64(time.Now().UnixNano()) / 1e9) } From 60897a8ee0008520a529e5abe8b47efe457a17c6 Mon Sep 17 00:00:00 2001 From: Martin Chodur Date: Tue, 27 Aug 2019 19:15:46 +0200 Subject: [PATCH 6/7] fix prober: fixed false error logging Signed-off-by: Martin Chodur --- pkg/prober/prober.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/prober/prober.go 
b/pkg/prober/prober.go index b00ee1fef6..96be8ab8a9 100644 --- a/pkg/prober/prober.go +++ b/pkg/prober/prober.go @@ -89,7 +89,7 @@ func (p *Prober) writeResponse(w http.ResponseWriter, probeFn func() error, prob http.Error(w, fmt.Sprintf("thanos %v is not %v. Reason: %v", p.component, probeType, err), probeErrorHTTPStatus) return } - if _, err := io.WriteString(w, fmt.Sprintf("thanos %v is %v", p.component, probeType)); err == nil { + if _, err := io.WriteString(w, fmt.Sprintf("thanos %v is %v", p.component, probeType)); err != nil { level.Error(p.logger).Log("msg", "failed to write probe response", "probe type", probeType, "err", err) } } From 210075aef4c4bad9b0538e07cab4f04d58d4d524 Mon Sep 17 00:00:00 2001 From: Martin Chodur Date: Wed, 28 Aug 2019 06:07:01 +0200 Subject: [PATCH 7/7] CR: fix prober comment to work with godoc Signed-off-by: Martin Chodur --- pkg/prober/prober.go | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/pkg/prober/prober.go b/pkg/prober/prober.go index 96be8ab8a9..36bea35b88 100644 --- a/pkg/prober/prober.go +++ b/pkg/prober/prober.go @@ -22,6 +22,19 @@ const ( ) // Prober represents health and readiness status of given component. +// +// From Kubernetes documentation https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/ : +// +// liveness: Many applications running for long periods of time eventually transition to broken states, +// (healthy) and cannot recover except by being restarted. +// Kubernetes provides liveness probes to detect and remedy such situations. +// +// readiness: Sometimes, applications are temporarily unable to serve traffic. +// (ready) For example, an application might need to load large data or configuration files during startup, +// or depend on external services after startup. In such cases, you don’t want to kill the application, +// but you don’t want to send it requests either. 
Kubernetes provides readiness probes to detect +// and mitigate these situations. A pod with containers reporting that they are not ready +// does not receive traffic through Kubernetes Services. type Prober struct { logger log.Logger component component.Component @@ -36,20 +49,6 @@ type Prober struct { // NewProber returns Prober representing readiness and healthiness of given component. func NewProber(component component.Component, logger log.Logger, reg prometheus.Registerer) *Prober { initialErr := fmt.Errorf(initialErrorFmt, component) - - // From Kubernetes documentation https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/ : - // - // liveness: Many applications running for long periods of time eventually transition to broken states, - // (healthy) and cannot recover except by being restarted. - // Kubernetes provides liveness probes to detect and remedy such situations. - // - // readiness: Sometimes, applications are temporarily unable to serve traffic. - // (ready) For example, an application might need to load large data or configuration files during startup, - // or depend on external services after startup. In such cases, you don’t want to kill the application, - // but you don’t want to send it requests either. Kubernetes provides readiness probes to detect - // and mitigate these situations. A pod with containers reporting that they are not ready - // does not receive traffic through Kubernetes Services. - p := &Prober{ component: component, logger: logger,