From b57c4268de86503d8cdf6a2909ad24190b531e43 Mon Sep 17 00:00:00 2001 From: Xiang Dai <764524258@qq.com> Date: Wed, 19 Feb 2020 15:50:44 +0800 Subject: [PATCH] update mixin Signed-off-by: Xiang Dai <764524258@qq.com> --- examples/alerts/alerts.md | 42 +++++++++++++++++++++ examples/alerts/alerts.yaml | 36 ++++++++++++++++++ examples/alerts/rules.yaml | 2 + mixin/thanos/README.md | 5 +++ mixin/thanos/alerts/alerts.libsonnet | 3 +- mixin/thanos/alerts/replicate.libsonnet | 8 ++-- mixin/thanos/dashboards/replicate.libsonnet | 26 ++++++------- mixin/thanos/defaults.libsonnet | 5 +++ mixin/thanos/rules/replicate.libsonnet | 2 +- mixin/thanos/rules/rules.libsonnet | 3 +- 10 files changed, 112 insertions(+), 20 deletions(-) diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 3ae95fce3a5..01720d3aa21 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -428,6 +428,48 @@ rules: severity: warning ``` +## Replicate + +[embedmd]:# (../tmp/thanos-replicate.rules.yaml yaml) +```yaml +name: thanos-replicate.rules +rules: +- alert: ThanosReplicateIsDown + annotations: + message: Thanos Replicate has disappeared from Prometheus target discovery. + expr: | + absent(up{job=~"thanos-replicate.*"}) + for: 5m + labels: + severity: critical +- alert: ThanosReplicateErrorRate + annotations: + message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts + failed. + expr: | + ( + sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-replicate.*"}[5m])) + / on (namespace) group_left + sum(rate(thanos_replicate_replication_runs_total{job=~"thanos-replicate.*"}[5m])) + ) * 100 >= 10 + for: 5m + labels: + severity: critical +- alert: ThanosReplicateRunLatency + annotations: + message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for the replicate operations. + expr: | + ( + histogram_quantile(0.9, sum by (job, le) (thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-replicate.*"})) > 120 + and + sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-replicate.*"}[5m])) > 0 + ) + for: 5m + labels: + severity: critical +``` + ## Extras ### Absent Rules diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 979231ae9ef..5f09216c6d7 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -439,3 +439,39 @@ groups: for: 5m labels: severity: critical +- name: thanos-replicate.rules + rules: + - alert: ThanosReplicateIsDown + annotations: + message: Thanos Replicate has disappeared from Prometheus target discovery. + expr: | + absent(up{job=~"thanos-replicate.*"}) + for: 5m + labels: + severity: critical + - alert: ThanosReplicateErrorRate + annotations: + message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts + failed. + expr: | + ( + sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-replicate.*"}[5m])) + / on (namespace) group_left + sum(rate(thanos_replicate_replication_runs_total{job=~"thanos-replicate.*"}[5m])) + ) * 100 >= 10 + for: 5m + labels: + severity: critical + - alert: ThanosReplicateRunLatency + annotations: + message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for the replicate operations. + expr: | + ( + histogram_quantile(0.9, sum by (job, le) (thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-replicate.*"})) > 120 + and + sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-replicate.*"}[5m])) > 0 + ) + for: 5m + labels: + severity: critical diff --git a/examples/alerts/rules.yaml b/examples/alerts/rules.yaml index 90492621217..8df51d77324 100644 --- a/examples/alerts/rules.yaml +++ b/examples/alerts/rules.yaml @@ -121,3 +121,5 @@ groups: labels: quantile: "0.99" record: :thanos_objstore_bucket_operation_duration_seconds:histogram_quantile +- name: thanos-replicate.rules + rules: [] diff --git a/mixin/thanos/README.md b/mixin/thanos/README.md index 946e7a916da..521e1b91e5a 100644 --- a/mixin/thanos/README.md +++ b/mixin/thanos/README.md @@ -89,6 +89,11 @@ This project is intended to be used as a library. You can extend and customize d selector: 'job=~"%s.*"' % self.jobPrefix, title: '%(prefix)sSidecar' % $.dashboard.prefix, }, + replicate+:: { + jobPrefix: 'thanos-replicate', + selector: 'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sReplicate' % $.dashboard.prefix, + }, overview+:: { title: '%(prefix)sOverview' % $.dashboard.prefix, }, diff --git a/mixin/thanos/alerts/alerts.libsonnet b/mixin/thanos/alerts/alerts.libsonnet index fa0a9d3d36f..7a0fd72b28b 100644 --- a/mixin/thanos/alerts/alerts.libsonnet +++ b/mixin/thanos/alerts/alerts.libsonnet @@ -4,4 +4,5 @@ (import 'sidecar.libsonnet') + (import 'store.libsonnet') + (import 'rule.libsonnet') + -(import 'absent.libsonnet') +(import 'absent.libsonnet') + +(import 'replicate.libsonnet') diff --git a/mixin/thanos/alerts/replicate.libsonnet b/mixin/thanos/alerts/replicate.libsonnet index 5373276a762..0184698cea3 100644 --- a/mixin/thanos/alerts/replicate.libsonnet +++ b/mixin/thanos/alerts/replicate.libsonnet @@ -1,6 +1,6 @@ { local thanos = self, - replicator+:: { + replicate+:: { jobPrefix: error 'must provide job prefix for Thanos Replicate dashboard', selector: error 'must provide selector for Thanos Replicate dashboard', }, @@ -13,7 +13,7 @@ alert: 'ThanosReplicateIsDown', expr: ||| absent(up{%(selector)s}) - ||| % thanos.replicator, + ||| % thanos.replicate, 'for': '5m', labels: { severity: 'critical', @@ -33,7 +33,7 @@ / on (namespace) group_left sum(rate(thanos_replicate_replication_runs_total{%(selector)s}[5m])) ) * 100 >= 10 - ||| % thanos.replicator, + ||| % thanos.replicate, 'for': '5m', labels: { severity: 'critical', @@ -50,7 +50,7 @@ and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m])) > 0 ) - ||| % thanos.replicator, + ||| % thanos.replicate, 'for': '5m', labels: { severity: 'critical', diff --git a/mixin/thanos/dashboards/replicate.libsonnet b/mixin/thanos/dashboards/replicate.libsonnet index d1bc2e2d0c9..47c0de211f9 100644 --- a/mixin/thanos/dashboards/replicate.libsonnet +++ b/mixin/thanos/dashboards/replicate.libsonnet @@ -2,27 +2,27 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; { local thanos = self, - replicator+:: { + replicate+:: { jobPrefix: error 'must provide job prefix for Thanos Replicate dashboard', selector: error 'must provide selector for Thanos Replicate dashboard', title: error 'must provide title for Thanos Replicate dashboard', }, grafanaDashboards+:: { 'replicate.json': - g.dashboard(thanos.replicator.title) + g.dashboard(thanos.replicate.title) .addRow( g.row('Replicate Runs') .addPanel( g.panel('Rate') + g.qpsErrTotalPanel( - 'thanos_replicate_replication_runs_total{result="error", namespace="$namespace",%(selector)s}' % thanos.replicator, - 'thanos_replicate_replication_runs_total{namespace="$namespace",%(selector)s}' % thanos.replicator, + 'thanos_replicate_replication_runs_total{result="error", namespace="$namespace",%(selector)s}' % thanos.replicate, + 'thanos_replicate_replication_runs_total{namespace="$namespace",%(selector)s}' % thanos.replicate, ) ) .addPanel( g.panel('Errors', 'Shows rate of errors.') + g.queryPanel( - 'sum(rate(thanos_replicate_replication_runs_total{result="error", namespace="$namespace",%(selector)s}[$interval])) by (result)' % thanos.replicator, + 'sum(rate(thanos_replicate_replication_runs_total{result="error", namespace="$namespace",%(selector)s}[$interval])) by (result)' % thanos.replicate, '{{result}}' ) + { yaxes: g.yaxes('percentunit') } + @@ -30,7 +30,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; ) .addPanel( g.panel('Duration', 'Shows how long has it taken to run a replication cycle.') + - g.latencyPanel('thanos_replicate_replication_run_duration_seconds', 'result="success", namespace="$namespace",%(selector)s' % thanos.replicator) + g.latencyPanel('thanos_replicate_replication_run_duration_seconds', 'result="success", namespace="$namespace",%(selector)s' % thanos.replicate) ) ) .addRow( @@ -39,12 +39,12 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; g.panel('Metrics') + g.queryPanel( [ - 'sum(rate(thanos_replicate_origin_iterations_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicator, - 'sum(rate(thanos_replicate_origin_meta_loads_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicator, - 'sum(rate(thanos_replicate_origin_partial_meta_reads_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicator, - 'sum(rate(thanos_replicate_blocks_already_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicator, - 'sum(rate(thanos_replicate_blocks_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicator, - 'sum(rate(thanos_replicate_objects_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicator, + 'sum(rate(thanos_replicate_origin_iterations_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicate, + 'sum(rate(thanos_replicate_origin_meta_loads_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicate, + 'sum(rate(thanos_replicate_origin_partial_meta_reads_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicate, + 'sum(rate(thanos_replicate_blocks_already_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicate, + 'sum(rate(thanos_replicate_blocks_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicate, + 'sum(rate(thanos_replicate_objects_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicate, ], ['iterations', 'meta loads', 'partial meta reads', 'already replicated blocks', 'replicated blocks', 'replicated objects'] ) @@ -52,7 +52,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; ) + g.template('namespace', 'kube_pod_info') + - g.template('job', 'up', 'namespace="$namespace",%(selector)s' % thanos.replicator, true, '%(jobPrefix)s.*' % thanos.replicator), + g.template('job', 'up', 'namespace="$namespace",%(selector)s' % thanos.replicate, true, '%(jobPrefix)s.*' % thanos.replicate), }, } + (import 'defaults.libsonnet') diff --git a/mixin/thanos/defaults.libsonnet b/mixin/thanos/defaults.libsonnet index b2a0c9d76a1..190196b8f74 100644 --- a/mixin/thanos/defaults.libsonnet +++ b/mixin/thanos/defaults.libsonnet @@ -29,6 +29,11 @@ selector: 'job=~"%s.*"' % self.jobPrefix, title: '%(prefix)sSidecar' % $.dashboard.prefix, }, + replicate+:: { + jobPrefix: 'thanos-replicate', + selector: 'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sReplicate' % $.dashboard.prefix, + }, overview+:: { title: '%(prefix)sOverview' % $.dashboard.prefix, }, diff --git a/mixin/thanos/rules/replicate.libsonnet b/mixin/thanos/rules/replicate.libsonnet index adb35879b0d..f578da98763 100644 --- a/mixin/thanos/rules/replicate.libsonnet +++ b/mixin/thanos/rules/replicate.libsonnet @@ -1,6 +1,6 @@ { local thanos = self, - replicator+:: { + replicate+:: { selector: error 'must provide selector for Thanos Replicate dashboard', }, prometheusRules+:: { diff --git a/mixin/thanos/rules/rules.libsonnet b/mixin/thanos/rules/rules.libsonnet index c74492d4d44..449076dcb95 100644 --- a/mixin/thanos/rules/rules.libsonnet +++ b/mixin/thanos/rules/rules.libsonnet @@ -1,3 +1,4 @@ (import 'query.libsonnet') + (import 'receive.libsonnet') + -(import 'store.libsonnet') +(import 'store.libsonnet') + +(import 'replicate.libsonnet')