diff --git a/CHANGELOG.md b/CHANGELOG.md index 71834004b8..cb77bd0971 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel - [#1358](https://github.com/thanos-io/thanos/pull/1358) Added `part_size` configuration option for HTTP multipart requests minimum part size for S3 storage type - [#1363](https://github.com/thanos-io/thanos/pull/1363) Thanos Receive now exposes `thanos_receive_hashring_nodes` and `thanos_receive_hashring_tenants` metrics to monitor status of hash-rings +- [#1395](https://github.com/thanos-io/thanos/pull/1395) Added `/-/ready` and `/-/healthy` endpoints to Thanos sidecar. ### Changed diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index b7e3f01acb..08bbba23ea 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -70,8 +70,7 @@ func (cs compactionSet) maxLevel() int { } func registerCompact(m map[string]setupFunc, app *kingpin.Application) { - comp := component.Compact - cmd := app.Command(comp.String(), "continuously compacts blocks in an object store bucket") + cmd := app.Command(component.Compact.String(), "continuously compacts blocks in an object store bucket") haltOnError := cmd.Flag("debug.halt-on-error", "Halt the process if a critical compaction error is detected."). Hidden().Default("true").Bool() @@ -112,7 +111,7 @@ func registerCompact(m map[string]setupFunc, app *kingpin.Application) { compactionConcurrency := cmd.Flag("compact.concurrency", "Number of goroutines to use when compacting groups."). Default("1").Int() - m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ bool) error { + m[component.Compact.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ bool) error { return runCompact(g, logger, reg, *httpAddr, *dataDir, @@ -127,7 +126,7 @@ func registerCompact(m map[string]setupFunc, app *kingpin.Application) { compact.ResolutionLevel5m: time.Duration(*retention5m), compact.ResolutionLevel1h: time.Duration(*retention1h), }, - comp, + component.Compact, *disableDownsampling, *maxCompactionLevel, *blockSyncConcurrency, @@ -170,9 +169,9 @@ func runCompact( downsampleMetrics := newDownsampleMetrics(reg) - readinessProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) + statusProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) // Initiate default HTTP listener providing metrics endpoint and readiness/liveness probes. - if err := defaultHTTPListener(g, logger, reg, httpBindAddr, readinessProber); err != nil { + if err := defaultHTTPListener(g, logger, reg, httpBindAddr, statusProber); err != nil { return errors.Wrap(err, "create readiness prober") } @@ -327,7 +326,7 @@ func runCompact( }) level.Info(logger).Log("msg", "starting compact node") - readinessProber.SetReady() + statusProber.SetReady() return nil } diff --git a/cmd/thanos/main.go b/cmd/thanos/main.go index a44ac43624..df233e6563 100644 --- a/cmd/thanos/main.go +++ b/cmd/thanos/main.go @@ -70,7 +70,7 @@ func main() { tracingConfig := regCommonTracingFlags(app) cmds := map[string]setupFunc{} - registerSidecar(cmds, app, "sidecar") + registerSidecar(cmds, app) registerStore(cmds, app, "store") registerQuery(cmds, app, "query") registerRule(cmds, app, "rule") diff --git a/cmd/thanos/sidecar.go b/cmd/thanos/sidecar.go index 1cd95bc44c..2cc8476e69 100644 --- a/cmd/thanos/sidecar.go +++ b/cmd/thanos/sidecar.go @@ -11,7 +11,7 @@ import ( "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" "github.com/oklog/run" - opentracing "github.com/opentracing/opentracing-go" + "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" @@ -19,6 +19,7 @@ import ( "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/objstore/client" + "github.com/thanos-io/thanos/pkg/prober" "github.com/thanos-io/thanos/pkg/promclient" "github.com/thanos-io/thanos/pkg/reloader" "github.com/thanos-io/thanos/pkg/runutil" @@ -26,13 +27,13 @@ import ( "github.com/thanos-io/thanos/pkg/store" "github.com/thanos-io/thanos/pkg/store/storepb" "google.golang.org/grpc" - kingpin "gopkg.in/alecthomas/kingpin.v2" + "gopkg.in/alecthomas/kingpin.v2" ) const waitForExternalLabelsTimeout = 10 * time.Minute -func registerSidecar(m map[string]setupFunc, app *kingpin.Application, name string) { - cmd := app.Command(name, "sidecar for Prometheus server") +func registerSidecar(m map[string]setupFunc, app *kingpin.Application) { + cmd := app.Command(component.Sidecar.String(), "sidecar for Prometheus server") grpcBindAddr, httpBindAddr, cert, key, clientCA := regCommonServerFlags(cmd) @@ -54,7 +55,7 @@ func registerSidecar(m map[string]setupFunc, app *kingpin.Application, name stri uploadCompacted := cmd.Flag("shipper.upload-compacted", "[Experimental] If true sidecar will try to upload compacted blocks as well. Useful for migration purposes. Works only if compaction is disabled on Prometheus.").Default("false").Hidden().Bool() - m[name] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ bool) error { + m[component.Sidecar.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ bool) error { rl := reloader.New( log.With(logger, "component", "reloader"), reloader.ReloadURLFromBase(*promURL), @@ -77,6 +78,7 @@ func registerSidecar(m map[string]setupFunc, app *kingpin.Application, name stri objStoreConfig, rl, *uploadCompacted, + component.Sidecar, ) } } @@ -96,6 +98,7 @@ func runSidecar( objStoreConfig *pathOrContent, reloader *reloader.Reloader, uploadCompacted bool, + comp component.Component, ) error { var m = &promMetadata{ promURL: promURL, @@ -117,6 +120,12 @@ func runSidecar( uploads = false } + statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) + // Initiate default HTTP listener providing metrics endpoint and readiness/liveness probes. + if err := defaultHTTPListener(g, logger, reg, httpBindAddr, statusProber); err != nil { + return errors.Wrap(err, "create readiness prober") + } + // Setup all the concurrent groups. { promUp := prometheus.NewGauge(prometheus.GaugeOpts{ @@ -148,6 +157,7 @@ func runSidecar( "err", err, ) promUp.Set(0) + statusProber.SetNotReady(err) return err } @@ -156,6 +166,7 @@ func runSidecar( "external_labels", m.Labels().String(), ) promUp.Set(1) + statusProber.SetReady() lastHeartbeat.Set(float64(time.Now().UnixNano()) / 1e9) return nil }) @@ -195,9 +206,7 @@ func runSidecar( cancel() }) } - if err := metricHTTPListenGroup(g, logger, reg, httpBindAddr); err != nil { - return err - } + { l, err := net.Listen("tcp", grpcBindAddr) if err != nil { diff --git a/pkg/prober/prober.go b/pkg/prober/prober.go index 1bf6c32f40..36bea35b88 100644 --- a/pkg/prober/prober.go +++ b/pkg/prober/prober.go @@ -22,6 +22,19 @@ const ( ) // Prober represents health and readiness status of given component. +// +// From Kubernetes documentation https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/ : +// +// liveness: Many applications running for long periods of time eventually transition to broken states, +// (healthy) and cannot recover except by being restarted. +// Kubernetes provides liveness probes to detect and remedy such situations. +// +// readiness: Sometimes, applications are temporarily unable to serve traffic. +// (ready) For example, an application might need to load large data or configuration files during startup, +// or depend on external services after startup. In such cases, you don’t want to kill the application, +// but you don’t want to send it requests either. Kubernetes provides readiness probes to detect +// and mitigate these situations. A pod with containers reporting that they are not ready +// does not receive traffic through Kubernetes Services. type Prober struct { logger log.Logger component component.Component @@ -36,7 +49,6 @@ type Prober struct { // NewProber returns Prober representing readiness and healthiness of given component. func NewProber(component component.Component, logger log.Logger, reg prometheus.Registerer) *Prober { initialErr := fmt.Errorf(initialErrorFmt, component) - p := &Prober{ component: component, logger: logger, @@ -76,7 +88,7 @@ func (p *Prober) writeResponse(w http.ResponseWriter, probeFn func() error, prob http.Error(w, fmt.Sprintf("thanos %v is not %v. Reason: %v", p.component, probeType, err), probeErrorHTTPStatus) return } - if _, err := io.WriteString(w, fmt.Sprintf("thanos %v is %v", p.component, probeType)); err == nil { + if _, err := io.WriteString(w, fmt.Sprintf("thanos %v is %v", p.component, probeType)); err != nil { level.Error(p.logger).Log("msg", "failed to write probe response", "probe type", probeType, "err", err) } } diff --git a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml index 02d37d2dc5..a4362035e2 100644 --- a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml +++ b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar-lts.yaml @@ -145,6 +145,14 @@ spec: containerPort: 10902 - name: grpc containerPort: 10901 + livenessProbe: + httpGet: + port: 10902 + path: /-/healthy + readinessProbe: + httpGet: + port: 10902 + path: /-/ready volumeMounts: - name: prometheus mountPath: /var/prometheus diff --git a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml index 9e30120ade..11b4830360 100644 --- a/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml +++ b/tutorials/kubernetes-demo/manifests/prometheus-ha-sidecar.yaml @@ -129,6 +129,14 @@ spec: containerPort: 10902 - name: grpc containerPort: 10901 + livenessProbe: + httpGet: + port: 10902 + path: /-/healthy + readinessProbe: + httpGet: + port: 10902 + path: /-/ready volumeMounts: - name: prometheus mountPath: /var/prometheus