diff --git a/CHANGELOG.md b/CHANGELOG.md index d8fd8a9096..f70b31b079 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,10 +15,13 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re ### Added ### Fixed + - [#3204](https://github.com/thanos-io/thanos/pull/3204) Mixin: Use sidecar's metric timestamp for healthcheck. ### Changed +- [#3856](https://github.com/thanos-io/thanos/pull/3856) Mixin: _breaking :warning:_ Introduce flexible multi-cluster/namespace mode for alerts and dashboards. Removes jobPrefix config option. Removes `namespace` by default. + ### Removed ## [v0.19.0 - ](https://github.com/thanos-io/thanos/tree/release-0.19) @@ -32,7 +35,6 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re - [#3792](https://github.com/thanos-io/thanos/pull/3792) Receiver: Added `--tsdb.allow-overlapping-blocks` flag to allow overlapping tsdb blocks and enable vertical compaction - [#3031](https://github.com/thanos-io/thanos/pull/3031) Compact/Sidecar/other writers: added `--hash-func`. If some function has been specified, writers calculate hashes using that function of each file in a block before uploading them. If those hashes exist in the `meta.json` file then Compact does not download the files if they already exist on disk and with the same hash. This also means that the data directory passed to Thanos Compact is only *cleared once at boot* or *if everything succeeds*. So, if you, for example, use persistent volumes on k8s and your Thanos Compact crashes or fails to make an iteration properly then the last downloaded files are not wiped from the disk. The directories that were created the last time are only wiped again after a successful iteration or if the previously picked up blocks have disappeared. - [#3686](https://github.com/thanos-io/thanos/pull/3686) Query: Added federated metric metadata support. - ### Fixed - [#3773](https://github.com/thanos-io/thanos/pull/3773) Compact: Pad compaction planner size check diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index be63d66b02..1225e9fd14 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -10,17 +10,17 @@ name: thanos-compact rules: - alert: ThanosCompactMultipleRunning annotations: - description: No more than one Thanos Compact instance should be running at once. - There are {{ $value }} + description: 'No more than one Thanos Compact instance should be running at once. + There are {{$value}} ' runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactmultiplerunning summary: Thanos Compact has multiple instances running. - expr: sum(up{job=~"thanos-compact.*"}) > 1 + expr: sum by (job) (up{job=~"thanos-compact.*"}) > 1 for: 5m labels: severity: warning - alert: ThanosCompactHalted annotations: - description: Thanos Compact {{$labels.job}} has failed to run and now is halted. + description: Thanos Compact {{$labels.job}} has failed to run and now is halted. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthalted summary: Thanos Compact has failed to run ans is now halted. expr: thanos_compact_halted{job=~"thanos-compact.*"} == 1 @@ -29,8 +29,8 @@ rules: severity: warning - alert: ThanosCompactHighCompactionFailures annotations: - description: Thanos Compact {{$labels.job}} is failing to execute {{ $value | - humanize }}% of compactions. + description: Thanos Compact {{$labels.job}} , is failing to execute {{$value | + humanize}}% of compactions. 
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthighcompactionfailures summary: Thanos Compact is failing to execute compactions. expr: | @@ -45,8 +45,8 @@ rules: severity: warning - alert: ThanosCompactBucketHighOperationFailures annotations: - description: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value - | humanize }}% of operations. + description: Thanos Compact {{$labels.job}} , Bucket is failing to execute {{$value + | humanize}}% of operations. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactbuckethighoperationfailures summary: Thanos Compact Bucket is having a high number of operation failures. expr: | @@ -61,10 +61,11 @@ rules: severity: warning - alert: ThanosCompactHasNotRun annotations: - description: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours. + description: Thanos Compact {{$labels.job}} has not uploaded anything for 24 + hours. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthasnotrun summary: Thanos Compact has not uploaded anything for last 24 hours. - expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) + expr: (time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24 labels: severity: warning @@ -80,34 +81,34 @@ name: thanos-rule rules: - alert: ThanosRuleQueueIsDroppingAlerts annotations: - description: Thanos Rule {{$labels.job}} is failing to queue alerts. + description: Thanos Rule {{$labels.instance}} is failing to queue alerts. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeueisdroppingalerts summary: Thanos Rule is failing to queue alerts. expr: | - sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 + sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 for: 5m labels: severity: critical - alert: ThanosRuleSenderIsFailingAlerts annotations: - description: Thanos Rule {{$labels.job}} is failing to send alerts to alertmanager. + description: Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulesenderisfailingalerts summary: Thanos Rule is failing to send alerts to alertmanager. expr: | - sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 + sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 for: 5m labels: severity: critical - alert: ThanosRuleHighRuleEvaluationFailures annotations: - description: Thanos Rule {{$labels.job}} is failing to evaluate rules. + description: Thanos Rule {{$labels.instance}} is failing to evaluate rules. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationfailures summary: Thanos Rule is failing to evaluate rules. 
expr: | ( - sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m])) / - sum by (job) (rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[5m])) * 100 > 5 ) for: 5m @@ -115,18 +116,18 @@ rules: severity: critical - alert: ThanosRuleHighRuleEvaluationWarnings annotations: - description: Thanos Rule {{$labels.job}} has high number of evaluation warnings. + description: Thanos Rule {{$labels.instance}} has high number of evaluation warnings. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationwarnings summary: Thanos Rule has high number of evaluation warnings. expr: | - sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0 + sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0 for: 15m labels: severity: info - alert: ThanosRuleRuleEvaluationLatencyHigh annotations: - description: Thanos Rule {{$labels.job}}/{{$labels.instance}} has higher evaluation - latency than interval for {{$labels.rule_group}}. + description: Thanos Rule {{$labels.instance}} has higher evaluation latency than + interval for {{$labels.rule_group}}. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleruleevaluationlatencyhigh summary: Thanos Rule has high rule evaluation latency. expr: | @@ -140,15 +141,15 @@ rules: severity: warning - alert: ThanosRuleGrpcErrorRate annotations: - description: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize - }}% of requests. + description: Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% + of requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulegrpcerrorrate summary: Thanos Rule is failing to handle grpc requests. expr: | ( - sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m])) / - sum by (job) (rate(grpc_server_started_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(grpc_server_started_total{job=~"thanos-rule.*"}[5m])) * 100 > 5 ) for: 5m @@ -159,22 +160,22 @@ rules: description: Thanos Rule {{$labels.job}} has not been able to reload its configuration. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleconfigreloadfailure summary: Thanos Rule has not been able to reload configuration. - expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) + expr: avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) != 1 for: 5m labels: severity: info - alert: ThanosRuleQueryHighDNSFailures annotations: - description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing + description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeryhighdnsfailures summary: Thanos Rule is having high number of DNS failures. 
expr: | ( - sum by (job) (rate(thanos_rule_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m])) / - sum by (job) (rate(thanos_rule_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m])) * 100 > 1 ) for: 15m @@ -182,15 +183,15 @@ rules: severity: warning - alert: ThanosRuleAlertmanagerHighDNSFailures annotations: - description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing + description: Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulealertmanagerhighdnsfailures summary: Thanos Rule is having high number of DNS failures. expr: | ( - sum by (job) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m])) / - sum by (job) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m])) * 100 > 1 ) for: 15m @@ -198,27 +199,27 @@ rules: severity: warning - alert: ThanosRuleNoEvaluationFor10Intervals annotations: - description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups + description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% rule groups that did not evaluate for at least 10x of their expected interval. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulenoevaluationfor10intervals summary: Thanos Rule has rule groups that did not evaluate for 10 intervals. expr: | - time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"}) + time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"}) > - 10 * max by (job, group) (prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"}) + 10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"}) for: 5m labels: severity: info - alert: ThanosNoRuleEvaluations annotations: - description: Thanos Rule {{$labels.job}} did not perform any rule evaluations + description: Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 2 minutes. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosnoruleevaluations summary: Thanos Rule did not perform any rule evaluations. expr: | - sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0 + sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0 and - sum(thanos_rule_loaded_rules{job=~"thanos-rule.*"}) > 0 + sum by (job, instance) (thanos_rule_loaded_rules{job=~"thanos-rule.*"}) > 0 for: 3m labels: severity: critical @@ -232,8 +233,8 @@ name: thanos-store rules: - alert: ThanosStoreGrpcErrorRate annotations: - description: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize - }}% of requests. + description: Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% + of requests. 
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate summary: Thanos Store is failing to handle qrpcd requests. expr: | @@ -248,8 +249,8 @@ rules: severity: warning - alert: ThanosStoreSeriesGateLatencyHigh annotations: - description: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ - $value }} seconds for store series gate requests. + description: Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} + seconds for store series gate requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh summary: Thanos Store has high latency for store series gate requests. expr: | @@ -263,8 +264,8 @@ rules: severity: warning - alert: ThanosStoreBucketHighOperationFailures annotations: - description: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value - | humanize }}% of operations. + description: Thanos Store {{$labels.job}} Bucket is failing to execute {{$value + | humanize}}% of operations. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures summary: Thanos Store Bucket is failing to execute operations. expr: | @@ -280,7 +281,7 @@ rules: - alert: ThanosStoreObjstoreOperationLatencyHigh annotations: description: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency - of {{ $value }} seconds for the bucket operations. + of {{$value}} seconds for the bucket operations. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh summary: Thanos Store is having high latency for bucket operations. expr: | @@ -302,34 +303,31 @@ name: thanos-sidecar rules: - alert: ThanosSidecarPrometheusDown annotations: - description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} cannot connect - to Prometheus. + description: Thanos Sidecar {{$labels.instance}} cannot connect to Prometheus. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown summary: Thanos Sidecar cannot connect to Prometheus expr: | - sum by (job, instance) (thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0) + thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0 for: 5m labels: severity: critical - alert: ThanosSidecarBucketOperationsFailed annotations: - description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} bucket operations - are failing + description: Thanos Sidecar {{$labels.instance}} bucket operations are failing runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed summary: Thanos Sidecar bucket operations are failing expr: | - rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-sidecar.*"}[5m]) > 0 + sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-sidecar.*"}[5m])) > 0 for: 5m labels: severity: critical - alert: ThanosSidecarUnhealthy annotations: - description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} is unhealthy - for more than {{$value}} seconds. + description: Thanos Sidecar {{$labels.instance}} is unhealthy for {{$value}} seconds. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy summary: Thanos Sidecar is unhealthy. 
expr: | - time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"})) >= 240 + time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) >= 600 labels: severity: critical ``` @@ -342,38 +340,38 @@ name: thanos-query rules: - alert: ThanosQueryHttpRequestQueryErrorRateHigh annotations: - description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize - }}% of "query" requests. + description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% + of "query" requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryerrorratehigh summary: Thanos Query is failing to handle requests. expr: | ( - sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m])) / - sum(rate(http_requests_total{job=~"thanos-query.*", handler="query"}[5m])) + sum by (job) (rate(http_requests_total{job=~"thanos-query.*", handler="query"}[5m])) ) * 100 > 5 for: 5m labels: severity: critical - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh annotations: - description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize - }}% of "query_range" requests. + description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% + of "query_range" requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryrangeerrorratehigh summary: Thanos Query is failing to handle requests. expr: | ( - sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m])) / - sum(rate(http_requests_total{job=~"thanos-query.*", handler="query_range"}[5m])) + sum by (job) (rate(http_requests_total{job=~"thanos-query.*", handler="query_range"}[5m])) ) * 100 > 5 for: 5m labels: severity: critical - alert: ThanosQueryGrpcServerErrorRate annotations: - description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize - }}% of requests. + description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% + of requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcservererrorrate summary: Thanos Query is failing to handle requests. expr: | @@ -388,8 +386,8 @@ rules: severity: warning - alert: ThanosQueryGrpcClientErrorRate annotations: - description: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize - }}% of requests. + description: Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% + of requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcclienterrorrate summary: Thanos Query is failing to send requests. expr: | @@ -403,7 +401,7 @@ rules: severity: warning - alert: ThanosQueryHighDNSFailures annotations: - description: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing + description: Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures summary: Thanos Query is having high number of DNS failures. 
@@ -418,8 +416,8 @@ rules: severity: warning - alert: ThanosQueryInstantLatencyHigh annotations: - description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ - $value }} seconds for instant queries. + description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} + seconds for instant queries. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh summary: Thanos Query has high latency for queries. expr: | @@ -433,8 +431,8 @@ rules: severity: critical - alert: ThanosQueryRangeLatencyHigh annotations: - description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ - $value }} seconds for range queries. + description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} + seconds for range queries. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh summary: Thanos Query has high latency for queries. expr: | @@ -456,15 +454,15 @@ name: thanos-receive rules: - alert: ThanosReceiveHttpRequestErrorRateHigh annotations: - description: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize - }}% of requests. + description: Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% + of requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequesterrorratehigh summary: Thanos Receive is failing to handle requests. expr: | ( - sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m])) / - sum(rate(http_requests_total{job=~"thanos-receive.*", handler="receive"}[5m])) + sum by (job) (rate(http_requests_total{job=~"thanos-receive.*", handler="receive"}[5m])) ) * 100 > 5 for: 5m labels: @@ -486,8 +484,8 @@ rules: severity: critical - alert: ThanosReceiveHighReplicationFailures annotations: - description: Thanos Receive {{$labels.job}} is failing to replicate {{ $value - | humanize }}% of requests. + description: Thanos Receive {{$labels.job}} is failing to replicate {{$value | + humanize}}% of requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighreplicationfailures summary: Thanos Receive is having high number of replication failures. expr: | @@ -511,8 +509,8 @@ rules: severity: warning - alert: ThanosReceiveHighForwardRequestFailures annotations: - description: Thanos Receive {{$labels.job}} is failing to forward {{ $value | - humanize }}% of requests. + description: Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% + of requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighforwardrequestfailures summary: Thanos Receive is failing to forward requests. expr: | @@ -527,7 +525,7 @@ rules: - alert: ThanosReceiveHighHashringFileRefreshFailures annotations: description: Thanos Receive {{$labels.job}} is failing to refresh hashring file, - {{ $value | humanize }} of attempts failed. + {{$value | humanize}} of attempts failed. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures summary: Thanos Receive is failing to refresh hasring file. expr: | @@ -546,21 +544,21 @@ rules: configurations. 
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure summary: Thanos Receive has not been able to reload configuration. - expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) - by (job) != 1 + expr: avg by (job) (thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) + != 1 for: 5m labels: severity: warning - alert: ThanosReceiveNoUpload annotations: - description: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not - uploaded latest data to object storage. + description: Thanos Receive {{$labels.instance}} has not uploaded latest data + to object storage. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload summary: Thanos Receive has not uploaded latest data to object storage. expr: | (up{job=~"thanos-receive.*"} - 1) - + on (instance) # filters to only alert on current instance last 3h - (sum by (instance) (increase(thanos_shipper_uploads_total{job=~"thanos-receive.*"}[3h])) == 0) + + on (job, instance) # filters to only alert on current instance last 3h + (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~"thanos-receive.*"}[3h])) == 0) for: 3h labels: severity: critical @@ -572,27 +570,17 @@ rules: ```yaml name: thanos-bucket-replicate rules: -- alert: ThanosBucketReplicateIsDown - annotations: - description: Thanos Replicate has disappeared from Prometheus target discovery. - runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateisdown - summary: Thanos Replicate has disappeared from Prometheus target discovery. - expr: | - absent(up{job=~"thanos-bucket-replicate.*"}) - for: 5m - labels: - severity: critical - alert: ThanosBucketReplicateErrorRate annotations: - description: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts + description: Thanos Replicate is failing to run , {{$value | humanize}}% of attempts failed. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateerrorrate - summary: Thanose Replicate is failing to run. + summary: Thanose Replicate is failing to run in . expr: | ( - sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m])) - / on (namespace) group_left - sum(rate(thanos_replicate_replication_runs_total{job=~"thanos-bucket-replicate.*"}[5m])) + sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m])) + / on (job) group_left + sum by (job) (rate(thanos_replicate_replication_runs_total{job=~"thanos-bucket-replicate.*"}[5m])) ) * 100 >= 10 for: 5m labels: @@ -600,12 +588,12 @@ rules: - alert: ThanosBucketReplicateRunLatency annotations: description: Thanos Replicate {{$labels.job}} has a 99th percentile latency of - {{ $value }} seconds for the replicate operations. + {{$value}} seconds for the replicate operations. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicaterunlatency summary: Thanos Replicate has a high latency for replicate operations. 
expr: | ( - histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20 + histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m])) > 0 ) @@ -624,9 +612,10 @@ name: thanos-component-absent rules: - alert: ThanosBucketReplicateIsDown annotations: - description: ThanosBucketReplicate has disappeared from Prometheus target discovery. + description: ThanosBucketReplicate has disappeared. Prometheus target for the + component cannot be discovered. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. expr: | absent(up{job=~"thanos-bucket-replicate.*"} == 1) for: 5m @@ -634,9 +623,10 @@ rules: severity: critical - alert: ThanosCompactIsDown annotations: - description: ThanosCompact has disappeared from Prometheus target discovery. + description: ThanosCompact has disappeared. Prometheus target for the component + cannot be discovered. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. expr: | absent(up{job=~"thanos-compact.*"} == 1) for: 5m @@ -644,9 +634,10 @@ rules: severity: critical - alert: ThanosQueryIsDown annotations: - description: ThanosQuery has disappeared from Prometheus target discovery. + description: ThanosQuery has disappeared. Prometheus target for the component + cannot be discovered. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. expr: | absent(up{job=~"thanos-query.*"} == 1) for: 5m @@ -654,9 +645,10 @@ rules: severity: critical - alert: ThanosReceiveIsDown annotations: - description: ThanosReceive has disappeared from Prometheus target discovery. + description: ThanosReceive has disappeared. Prometheus target for the component + cannot be discovered. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. expr: | absent(up{job=~"thanos-receive.*"} == 1) for: 5m @@ -664,9 +656,10 @@ rules: severity: critical - alert: ThanosRuleIsDown annotations: - description: ThanosRule has disappeared from Prometheus target discovery. + description: ThanosRule has disappeared. Prometheus target for the component cannot + be discovered. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. expr: | absent(up{job=~"thanos-rule.*"} == 1) for: 5m @@ -674,9 +667,10 @@ rules: severity: critical - alert: ThanosSidecarIsDown annotations: - description: ThanosSidecar has disappeared from Prometheus target discovery. + description: ThanosSidecar has disappeared. Prometheus target for the component + cannot be discovered. 
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. expr: | absent(up{job=~"thanos-sidecar.*"} == 1) for: 5m @@ -684,9 +678,10 @@ rules: severity: critical - alert: ThanosStoreIsDown annotations: - description: ThanosStore has disappeared from Prometheus target discovery. + description: ThanosStore has disappeared. Prometheus target for the component + cannot be discovered. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. expr: | absent(up{job=~"thanos-store.*"} == 1) for: 5m diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 6d6e4a7a84..0127ec7838 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -3,17 +3,17 @@ groups: rules: - alert: ThanosCompactMultipleRunning annotations: - description: No more than one Thanos Compact instance should be running at once. - There are {{ $value }} + description: 'No more than one Thanos Compact instance should be running at + once. There are {{$value}} ' runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactmultiplerunning summary: Thanos Compact has multiple instances running. - expr: sum(up{job=~"thanos-compact.*"}) > 1 + expr: sum by (job) (up{job=~"thanos-compact.*"}) > 1 for: 5m labels: severity: warning - alert: ThanosCompactHalted annotations: - description: Thanos Compact {{$labels.job}} has failed to run and now is halted. + description: Thanos Compact {{$labels.job}} has failed to run and now is halted. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthalted summary: Thanos Compact has failed to run ans is now halted. expr: thanos_compact_halted{job=~"thanos-compact.*"} == 1 @@ -22,8 +22,8 @@ groups: severity: warning - alert: ThanosCompactHighCompactionFailures annotations: - description: Thanos Compact {{$labels.job}} is failing to execute {{ $value - | humanize }}% of compactions. + description: Thanos Compact {{$labels.job}} , is failing to execute {{$value + | humanize}}% of compactions. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthighcompactionfailures summary: Thanos Compact is failing to execute compactions. expr: | @@ -38,8 +38,8 @@ groups: severity: warning - alert: ThanosCompactBucketHighOperationFailures annotations: - description: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ - $value | humanize }}% of operations. + description: Thanos Compact {{$labels.job}} , Bucket is failing to execute {{$value + | humanize}}% of operations. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactbuckethighoperationfailures summary: Thanos Compact Bucket is having a high number of operation failures. expr: | @@ -54,11 +54,11 @@ groups: severity: warning - alert: ThanosCompactHasNotRun annotations: - description: Thanos Compact {{$labels.job}} has not uploaded anything for 24 + description: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours. 
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthasnotrun summary: Thanos Compact has not uploaded anything for last 24 hours. - expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) + expr: (time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24 labels: severity: warning @@ -66,38 +66,38 @@ groups: rules: - alert: ThanosQueryHttpRequestQueryErrorRateHigh annotations: - description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize - }}% of "query" requests. + description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% + of "query" requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryerrorratehigh summary: Thanos Query is failing to handle requests. expr: | ( - sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m])) / - sum(rate(http_requests_total{job=~"thanos-query.*", handler="query"}[5m])) + sum by (job) (rate(http_requests_total{job=~"thanos-query.*", handler="query"}[5m])) ) * 100 > 5 for: 5m labels: severity: critical - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh annotations: - description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize - }}% of "query_range" requests. + description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% + of "query_range" requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryrangeerrorratehigh summary: Thanos Query is failing to handle requests. expr: | ( - sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m])) / - sum(rate(http_requests_total{job=~"thanos-query.*", handler="query_range"}[5m])) + sum by (job) (rate(http_requests_total{job=~"thanos-query.*", handler="query_range"}[5m])) ) * 100 > 5 for: 5m labels: severity: critical - alert: ThanosQueryGrpcServerErrorRate annotations: - description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize - }}% of requests. + description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% + of requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcservererrorrate summary: Thanos Query is failing to handle requests. expr: | @@ -112,8 +112,8 @@ groups: severity: warning - alert: ThanosQueryGrpcClientErrorRate annotations: - description: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize - }}% of requests. + description: Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% + of requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcclienterrorrate summary: Thanos Query is failing to send requests. expr: | @@ -127,7 +127,7 @@ groups: severity: warning - alert: ThanosQueryHighDNSFailures annotations: - description: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing + description: Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints. 
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures summary: Thanos Query is having high number of DNS failures. @@ -142,8 +142,8 @@ groups: severity: warning - alert: ThanosQueryInstantLatencyHigh annotations: - description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ - $value }} seconds for instant queries. + description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} + seconds for instant queries. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh summary: Thanos Query has high latency for queries. expr: | @@ -157,8 +157,8 @@ groups: severity: critical - alert: ThanosQueryRangeLatencyHigh annotations: - description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ - $value }} seconds for range queries. + description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} + seconds for range queries. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh summary: Thanos Query has high latency for queries. expr: | @@ -174,15 +174,15 @@ groups: rules: - alert: ThanosReceiveHttpRequestErrorRateHigh annotations: - description: Thanos Receive {{$labels.job}} is failing to handle {{ $value | - humanize }}% of requests. + description: Thanos Receive {{$labels.job}} is failing to handle {{$value | + humanize}}% of requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequesterrorratehigh summary: Thanos Receive is failing to handle requests. expr: | ( - sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m])) / - sum(rate(http_requests_total{job=~"thanos-receive.*", handler="receive"}[5m])) + sum by (job) (rate(http_requests_total{job=~"thanos-receive.*", handler="receive"}[5m])) ) * 100 > 5 for: 5m labels: @@ -204,8 +204,8 @@ groups: severity: critical - alert: ThanosReceiveHighReplicationFailures annotations: - description: Thanos Receive {{$labels.job}} is failing to replicate {{ $value - | humanize }}% of requests. + description: Thanos Receive {{$labels.job}} is failing to replicate {{$value + | humanize}}% of requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighreplicationfailures summary: Thanos Receive is having high number of replication failures. expr: | @@ -229,8 +229,8 @@ groups: severity: warning - alert: ThanosReceiveHighForwardRequestFailures annotations: - description: Thanos Receive {{$labels.job}} is failing to forward {{ $value - | humanize }}% of requests. + description: Thanos Receive {{$labels.job}} is failing to forward {{$value | + humanize}}% of requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighforwardrequestfailures summary: Thanos Receive is failing to forward requests. expr: | @@ -245,7 +245,7 @@ groups: - alert: ThanosReceiveHighHashringFileRefreshFailures annotations: description: Thanos Receive {{$labels.job}} is failing to refresh hashring file, - {{ $value | humanize }} of attempts failed. + {{$value | humanize}} of attempts failed. 
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures summary: Thanos Receive is failing to refresh hasring file. expr: | @@ -264,21 +264,21 @@ groups: configurations. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure summary: Thanos Receive has not been able to reload configuration. - expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) - by (job) != 1 + expr: avg by (job) (thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) + != 1 for: 5m labels: severity: warning - alert: ThanosReceiveNoUpload annotations: - description: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not - uploaded latest data to object storage. + description: Thanos Receive {{$labels.instance}} has not uploaded latest data + to object storage. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload summary: Thanos Receive has not uploaded latest data to object storage. expr: | (up{job=~"thanos-receive.*"} - 1) - + on (instance) # filters to only alert on current instance last 3h - (sum by (instance) (increase(thanos_shipper_uploads_total{job=~"thanos-receive.*"}[3h])) == 0) + + on (job, instance) # filters to only alert on current instance last 3h + (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~"thanos-receive.*"}[3h])) == 0) for: 3h labels: severity: critical @@ -286,42 +286,40 @@ groups: rules: - alert: ThanosSidecarPrometheusDown annotations: - description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} cannot connect - to Prometheus. + description: Thanos Sidecar {{$labels.instance}} cannot connect to Prometheus. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown summary: Thanos Sidecar cannot connect to Prometheus expr: | - sum by (job, instance) (thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0) + thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0 for: 5m labels: severity: critical - alert: ThanosSidecarBucketOperationsFailed annotations: - description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} bucket operations - are failing + description: Thanos Sidecar {{$labels.instance}} bucket operations are failing runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed summary: Thanos Sidecar bucket operations are failing expr: | - rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-sidecar.*"}[5m]) > 0 + sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-sidecar.*"}[5m])) > 0 for: 5m labels: severity: critical - alert: ThanosSidecarUnhealthy annotations: - description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} is unhealthy - for more than {{$value}} seconds. + description: Thanos Sidecar {{$labels.instance}} is unhealthy for {{$value}} + seconds. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy summary: Thanos Sidecar is unhealthy. 
expr: | - time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"})) >= 240 + time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) >= 600 labels: severity: critical - name: thanos-store rules: - alert: ThanosStoreGrpcErrorRate annotations: - description: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize - }}% of requests. + description: Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% + of requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate summary: Thanos Store is failing to handle qrpcd requests. expr: | @@ -336,8 +334,8 @@ groups: severity: warning - alert: ThanosStoreSeriesGateLatencyHigh annotations: - description: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ - $value }} seconds for store series gate requests. + description: Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} + seconds for store series gate requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh summary: Thanos Store has high latency for store series gate requests. expr: | @@ -351,8 +349,8 @@ groups: severity: warning - alert: ThanosStoreBucketHighOperationFailures annotations: - description: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value - | humanize }}% of operations. + description: Thanos Store {{$labels.job}} Bucket is failing to execute {{$value + | humanize}}% of operations. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures summary: Thanos Store Bucket is failing to execute operations. expr: | @@ -368,7 +366,7 @@ groups: - alert: ThanosStoreObjstoreOperationLatencyHigh annotations: description: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency - of {{ $value }} seconds for the bucket operations. + of {{$value}} seconds for the bucket operations. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh summary: Thanos Store is having high latency for bucket operations. expr: | @@ -384,34 +382,34 @@ groups: rules: - alert: ThanosRuleQueueIsDroppingAlerts annotations: - description: Thanos Rule {{$labels.job}} is failing to queue alerts. + description: Thanos Rule {{$labels.instance}} is failing to queue alerts. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeueisdroppingalerts summary: Thanos Rule is failing to queue alerts. expr: | - sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 + sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 for: 5m labels: severity: critical - alert: ThanosRuleSenderIsFailingAlerts annotations: - description: Thanos Rule {{$labels.job}} is failing to send alerts to alertmanager. + description: Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulesenderisfailingalerts summary: Thanos Rule is failing to send alerts to alertmanager. 
expr: | - sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 + sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 for: 5m labels: severity: critical - alert: ThanosRuleHighRuleEvaluationFailures annotations: - description: Thanos Rule {{$labels.job}} is failing to evaluate rules. + description: Thanos Rule {{$labels.instance}} is failing to evaluate rules. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationfailures summary: Thanos Rule is failing to evaluate rules. expr: | ( - sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m])) / - sum by (job) (rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[5m])) * 100 > 5 ) for: 5m @@ -419,18 +417,19 @@ groups: severity: critical - alert: ThanosRuleHighRuleEvaluationWarnings annotations: - description: Thanos Rule {{$labels.job}} has high number of evaluation warnings. + description: Thanos Rule {{$labels.instance}} has high number of evaluation + warnings. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationwarnings summary: Thanos Rule has high number of evaluation warnings. expr: | - sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0 + sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0 for: 15m labels: severity: info - alert: ThanosRuleRuleEvaluationLatencyHigh annotations: - description: Thanos Rule {{$labels.job}}/{{$labels.instance}} has higher evaluation - latency than interval for {{$labels.rule_group}}. + description: Thanos Rule {{$labels.instance}} has higher evaluation latency + than interval for {{$labels.rule_group}}. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleruleevaluationlatencyhigh summary: Thanos Rule has high rule evaluation latency. expr: | @@ -444,15 +443,15 @@ groups: severity: warning - alert: ThanosRuleGrpcErrorRate annotations: - description: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize - }}% of requests. + description: Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% + of requests. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulegrpcerrorrate summary: Thanos Rule is failing to handle grpc requests. expr: | ( - sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m])) / - sum by (job) (rate(grpc_server_started_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(grpc_server_started_total{job=~"thanos-rule.*"}[5m])) * 100 > 5 ) for: 5m @@ -463,22 +462,22 @@ groups: description: Thanos Rule {{$labels.job}} has not been able to reload its configuration. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleconfigreloadfailure summary: Thanos Rule has not been able to reload configuration. 
- expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by - (job) != 1 + expr: avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) + != 1 for: 5m labels: severity: info - alert: ThanosRuleQueryHighDNSFailures annotations: - description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing + description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeryhighdnsfailures summary: Thanos Rule is having high number of DNS failures. expr: | ( - sum by (job) (rate(thanos_rule_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m])) / - sum by (job) (rate(thanos_rule_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m])) * 100 > 1 ) for: 15m @@ -486,15 +485,15 @@ groups: severity: warning - alert: ThanosRuleAlertmanagerHighDNSFailures annotations: - description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing - DNS queries for Alertmanager endpoints. + description: Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of + failing DNS queries for Alertmanager endpoints. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulealertmanagerhighdnsfailures summary: Thanos Rule is having high number of DNS failures. expr: | ( - sum by (job) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m])) / - sum by (job) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m])) + sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m])) * 100 > 1 ) for: 15m @@ -502,53 +501,43 @@ groups: severity: warning - alert: ThanosRuleNoEvaluationFor10Intervals annotations: - description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups + description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% rule groups that did not evaluate for at least 10x of their expected interval. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulenoevaluationfor10intervals summary: Thanos Rule has rule groups that did not evaluate for 10 intervals. expr: | - time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"}) + time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"}) > - 10 * max by (job, group) (prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"}) + 10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"}) for: 5m labels: severity: info - alert: ThanosNoRuleEvaluations annotations: - description: Thanos Rule {{$labels.job}} did not perform any rule evaluations + description: Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 2 minutes. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosnoruleevaluations summary: Thanos Rule did not perform any rule evaluations. 
expr: | - sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0 + sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0 and - sum(thanos_rule_loaded_rules{job=~"thanos-rule.*"}) > 0 + sum by (job, instance) (thanos_rule_loaded_rules{job=~"thanos-rule.*"}) > 0 for: 3m labels: severity: critical - name: thanos-bucket-replicate rules: - - alert: ThanosBucketReplicateIsDown - annotations: - description: Thanos Replicate has disappeared from Prometheus target discovery. - runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateisdown - summary: Thanos Replicate has disappeared from Prometheus target discovery. - expr: | - absent(up{job=~"thanos-bucket-replicate.*"}) - for: 5m - labels: - severity: critical - alert: ThanosBucketReplicateErrorRate annotations: - description: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts - failed. + description: Thanos Replicate is failing to run , {{$value | humanize}}% of + attempts failed. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateerrorrate - summary: Thanose Replicate is failing to run. + summary: Thanose Replicate is failing to run in . expr: | ( - sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m])) - / on (namespace) group_left - sum(rate(thanos_replicate_replication_runs_total{job=~"thanos-bucket-replicate.*"}[5m])) + sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m])) + / on (job) group_left + sum by (job) (rate(thanos_replicate_replication_runs_total{job=~"thanos-bucket-replicate.*"}[5m])) ) * 100 >= 10 for: 5m labels: @@ -556,12 +545,12 @@ groups: - alert: ThanosBucketReplicateRunLatency annotations: description: Thanos Replicate {{$labels.job}} has a 99th percentile latency - of {{ $value }} seconds for the replicate operations. + of {{$value}} seconds for the replicate operations. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicaterunlatency summary: Thanos Replicate has a high latency for replicate operations. expr: | ( - histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20 + histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m])) > 0 ) @@ -572,9 +561,10 @@ groups: rules: - alert: ThanosBucketReplicateIsDown annotations: - description: ThanosBucketReplicate has disappeared from Prometheus target discovery. + description: ThanosBucketReplicate has disappeared. Prometheus target for the + component cannot be discovered. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. expr: | absent(up{job=~"thanos-bucket-replicate.*"} == 1) for: 5m @@ -582,9 +572,10 @@ groups: severity: critical - alert: ThanosCompactIsDown annotations: - description: ThanosCompact has disappeared from Prometheus target discovery. + description: ThanosCompact has disappeared. 
Prometheus target for the component + cannot be discovered. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. expr: | absent(up{job=~"thanos-compact.*"} == 1) for: 5m @@ -592,9 +583,10 @@ groups: severity: critical - alert: ThanosQueryIsDown annotations: - description: ThanosQuery has disappeared from Prometheus target discovery. + description: ThanosQuery has disappeared. Prometheus target for the component + cannot be discovered. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. expr: | absent(up{job=~"thanos-query.*"} == 1) for: 5m @@ -602,9 +594,10 @@ groups: severity: critical - alert: ThanosReceiveIsDown annotations: - description: ThanosReceive has disappeared from Prometheus target discovery. + description: ThanosReceive has disappeared. Prometheus target for the component + cannot be discovered. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. expr: | absent(up{job=~"thanos-receive.*"} == 1) for: 5m @@ -612,9 +605,10 @@ groups: severity: critical - alert: ThanosRuleIsDown annotations: - description: ThanosRule has disappeared from Prometheus target discovery. + description: ThanosRule has disappeared. Prometheus target for the component + cannot be discovered. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. expr: | absent(up{job=~"thanos-rule.*"} == 1) for: 5m @@ -622,9 +616,10 @@ groups: severity: critical - alert: ThanosSidecarIsDown annotations: - description: ThanosSidecar has disappeared from Prometheus target discovery. + description: ThanosSidecar has disappeared. Prometheus target for the component + cannot be discovered. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. expr: | absent(up{job=~"thanos-sidecar.*"} == 1) for: 5m @@ -632,9 +627,10 @@ groups: severity: critical - alert: ThanosStoreIsDown annotations: - description: ThanosStore has disappeared from Prometheus target discovery. + description: ThanosStore has disappeared. Prometheus target for the component + cannot be discovered. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreisdown - summary: thanos component has disappeared from Prometheus target discovery. + summary: Thanos component has disappeared. 
expr: | absent(up{job=~"thanos-store.*"} == 1) for: 5m diff --git a/examples/alerts/rules.yaml b/examples/alerts/rules.yaml index 1c7b725982..a8b6ca64ec 100644 --- a/examples/alerts/rules.yaml +++ b/examples/alerts/rules.yaml @@ -3,35 +3,35 @@ groups: rules: - expr: | ( - sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="unary"}[5m])) + sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="unary"}[5m])) / - sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="unary"}[5m])) + sum by (job) (rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="unary"}[5m])) ) record: :grpc_client_failures_per_unary:sum_rate - expr: | ( - sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="server_stream"}[5m])) + sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="server_stream"}[5m])) / - sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="server_stream"}[5m])) + sum by (job) (rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="server_stream"}[5m])) ) record: :grpc_client_failures_per_stream:sum_rate - expr: | ( - sum(rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m])) + sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m])) / - sum(rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m])) + sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m])) ) record: :thanos_query_store_apis_dns_failures_per_lookup:sum_rate - expr: | histogram_quantile(0.99, - sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) by (le) + sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) ) labels: quantile: "0.99" record: :query_duration_seconds:histogram_quantile - expr: | histogram_quantile(0.99, - sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m])) by (le) + sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m])) ) labels: quantile: "0.99" @@ -39,80 +39,80 @@ groups: - name: thanos-receive.rules rules: - expr: | - sum( - rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="unary"}[5m]) + ( + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="unary"}[5m])) / - rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="unary"}[5m]) + sum by (job) (rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="unary"}[5m])) ) record: :grpc_server_failures_per_unary:sum_rate - expr: | - sum( - rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="server_stream"}[5m]) + ( + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", 
job=~"thanos-receive.*", grpc_type="server_stream"}[5m])) / - rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="server_stream"}[5m]) + sum by (job) (rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="server_stream"}[5m])) ) record: :grpc_server_failures_per_stream:sum_rate - expr: | - sum( - rate(http_requests_total{handler="receive", job=~"thanos-receive.*", code!~"5.."}[5m]) + ( + sum by (job) (rate(http_requests_total{handler="receive", job=~"thanos-receive.*", code!~"5.."}[5m])) / - rate(http_requests_total{handler="receive", job=~"thanos-receive.*"}[5m]) + sum by (job) (rate(http_requests_total{handler="receive", job=~"thanos-receive.*"}[5m])) ) record: :http_failure_per_request:sum_rate - expr: | histogram_quantile(0.99, - sum(rate(http_request_duration_seconds_bucket{handler="receive", job=~"thanos-receive.*"}[5m])) by (le) + sum by (job, le) (rate(http_request_duration_seconds_bucket{handler="receive", job=~"thanos-receive.*"}[5m])) ) labels: quantile: "0.99" record: :http_request_duration_seconds:histogram_quantile - expr: | ( - sum(rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m])) / - sum(rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m])) ) record: :thanos_receive_replication_failure_per_requests:sum_rate - expr: | ( - sum(rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m])) / - sum(rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m])) ) record: :thanos_receive_forward_failure_per_requests:sum_rate - expr: | ( - sum(rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m])) / - sum(rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m])) + sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m])) ) record: :thanos_receive_hashring_file_failure_per_refresh:sum_rate - name: thanos-store.rules rules: - expr: | ( - sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="unary"}[5m])) + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="unary"}[5m])) / - sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="unary"}[5m])) + sum by (job) (rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="unary"}[5m])) ) record: :grpc_server_failures_per_unary:sum_rate - expr: | ( - sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="server_stream"}[5m])) + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="server_stream"}[5m])) / - sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="server_stream"}[5m])) + sum by (job) 
(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="server_stream"}[5m])) ) record: :grpc_server_failures_per_stream:sum_rate - expr: | ( - sum(rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) + sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) / - sum(rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m])) + sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m])) ) record: :thanos_objstore_bucket_failures_per_operation:sum_rate - expr: | histogram_quantile(0.99, - sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m])) by (le) + sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m])) ) labels: quantile: "0.99" diff --git a/examples/alerts/tests.yaml b/examples/alerts/tests.yaml index d65135f9ee..6f35c25d76 100644 --- a/examples/alerts/tests.yaml +++ b/examples/alerts/tests.yaml @@ -8,9 +8,9 @@ tests: - interval: 1m input_series: - series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", instance="thanos-sidecar-0"}' - values: '5 10 43 17 11 _x5 0x10' + values: '5 10 43 17 11 0 0 0' - series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", instance="thanos-sidecar-1"}' - values: '4 9 42 15 10 _x5 0x10' + values: '4 9 42 15 10 0 0 0' promql_expr_test: - expr: time() eval_time: 1m @@ -22,61 +22,109 @@ tests: exp_samples: - labels: '{}' value: 120 - - expr: time() - max(timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"})) by (job, instance) - eval_time: 5m + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 2m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 60 + value: 43 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 60 - - expr: time() - max(timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"})) by (job, instance) - eval_time: 6m + value: 42 + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 10m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 120 + value: 0 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 120 - - expr: time() - max(timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"})) by (job, instance) - eval_time: 7m + value: 0 + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 11m + exp_samples: + - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' + value: 0 + - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' + value: 0 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 10m + exp_samples: + - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' + value: 600 + - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' + value: 600 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 11m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 180 + value: 660 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - 
value: 180 - - expr: time() - max(timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"})) by (job, instance) - eval_time: 8m + value: 660 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) >= 600 + eval_time: 12m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 240 + value: 720 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 240 + value: 720 alert_rule_test: - - eval_time: 1m - alertname: ThanosSidecarUnhealthy - - eval_time: 2m - alertname: ThanosSidecarUnhealthy - - eval_time: 3m - alertname: ThanosSidecarUnhealthy - - eval_time: 5m - alertname: ThanosSidecarUnhealthy - - eval_time: 8m - alertname: ThanosSidecarUnhealthy - exp_alerts: - - exp_labels: - severity: critical - job: thanos-sidecar - instance: thanos-sidecar-0 - exp_annotations: - description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-0 is unhealthy for more than 240 seconds.' - runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' - summary: 'Thanos Sidecar is unhealthy.' - - exp_labels: - severity: critical - job: thanos-sidecar - instance: thanos-sidecar-1 - exp_annotations: - description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-1 is unhealthy for more than 240 seconds.' - runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' - summary: 'Thanos Sidecar is unhealthy.' - - eval_time: 10m - alertname: ThanosSidecarUnhealthy + - eval_time: 1m + alertname: ThanosSidecarUnhealthy + - eval_time: 2m + alertname: ThanosSidecarUnhealthy + - eval_time: 3m + alertname: ThanosSidecarUnhealthy + - eval_time: 10m + alertname: ThanosSidecarUnhealthy + exp_alerts: + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-0 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 600 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-1 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 600 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - eval_time: 11m + alertname: ThanosSidecarUnhealthy + exp_alerts: + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-0 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 660 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-1 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 660 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - eval_time: 12m + alertname: ThanosSidecarUnhealthy + exp_alerts: + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-0 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 720 seconds.' 
+ runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-1 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 720 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' diff --git a/examples/dashboards/bucket_replicate.json b/examples/dashboards/bucket_replicate.json index 4d6d339bb7..1c3ecb0076 100644 --- a/examples/dashboards/bucket_replicate.json +++ b/examples/dashboards/bucket_replicate.json @@ -47,7 +47,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_replicate_replication_runs_total{result=\"error\", namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval])) / sum(rate(thanos_replicate_replication_runs_total{namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_replicate_replication_runs_total{result=\"error\", job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_replicate_replication_runs_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -124,7 +124,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_replicate_replication_runs_total{result=\"error\", namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval])) by (result)", + "expr": "sum by (job, result) (rate(thanos_replicate_replication_runs_total{result=\"error\", job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{result}}", @@ -194,34 +194,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_replicate_replication_run_duration_seconds_bucket{result=\"success\", namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{result=\"success\", job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}}", + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, "refId": "A", "step": 10 }, { - "expr": "sum(rate(thanos_replicate_replication_run_duration_seconds_sum{result=\"success\", namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval])) by (job) * 1 / sum(rate(thanos_replicate_replication_run_duration_seconds_count{result=\"success\", namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval])) by (job)", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{result=\"success\", job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}}", - "refId": "B", + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": 
"histogram_quantile(0.50, sum(rate(thanos_replicate_replication_run_duration_seconds_bucket{result=\"success\", namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{result=\"success\", job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}}", - "refId": "C", + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -305,7 +333,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(blocks_meta_synced{state=\"loaded\",namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval]))", + "expr": "sum by (job) (rate(blocks_meta_synced{state=\"loaded\", job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "meta loads", @@ -313,7 +341,7 @@ "step": 10 }, { - "expr": "sum(rate(blocks_meta_synced{state=\"failed\",namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval]))", + "expr": "sum by (job) (rate(blocks_meta_synced{state=\"failed\", job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "partial meta reads", @@ -321,7 +349,7 @@ "step": 10 }, { - "expr": "sum(rate(thanos_replicate_blocks_already_replicated_total{namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_replicate_blocks_already_replicated_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "already replicated blocks", @@ -329,7 +357,7 @@ "step": 10 }, { - "expr": "sum(rate(thanos_replicate_blocks_replicated_total{namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_replicate_blocks_replicated_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "replicated blocks", @@ -337,7 +365,7 @@ "step": 10 }, { - "expr": "sum(rate(thanos_replicate_objects_replicated_total{namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_replicate_objects_replicated_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "replicated objects", @@ -399,8 +427,8 @@ "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -412,27 +440,22 @@ "type": "datasource" }, { - "allValue": null, - "current": { }, - "datasource": "$datasource", + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ ], - "query": "label_values(thanos_status{}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" }, { - "allValue": "thanos-bucket-replicate.*", + "allValue": null, "current": { "text": "all", "value": "$__all" @@ -444,7 +467,7 @@ "multi": false, "name": "job", "options": [ ], - "query": "label_values(up{namespace=\"$namespace\", job=~\"thanos-bucket-replicate.*\"}, job)", + "query": "label_values(up{job=~\"thanos-bucket-replicate.*\"}, 
job)", "refresh": 1, "regex": "", "sort": 2, @@ -453,21 +476,6 @@ "tagsQuery": "", "type": "query", "useTags": false - }, - { - "auto": true, - "auto_count": 300, - "auto_min": "10s", - "current": { - "text": "5m", - "value": "5m" - }, - "hide": 0, - "label": "interval", - "name": "interval", - "query": "5m,10m,30m,1h,6h,12h", - "refresh": 2, - "type": "interval" } ] }, diff --git a/examples/dashboards/compact.json b/examples/dashboards/compact.json index 99653ecf00..db0adb4dc6 100644 --- a/examples/dashboards/compact.json +++ b/examples/dashboards/compact.json @@ -46,7 +46,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_compact_group_compactions_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, group)", + "expr": "sum by (job, group) (rate(thanos_compact_group_compactions_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "compaction {{job}} {{group}}", @@ -125,7 +125,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_compact_group_compactions_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_compact_group_compactions_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -214,7 +214,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_compact_downsample_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, group)", + "expr": "sum by (job, group) (rate(thanos_compact_downsample_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "downsample {{job}} {{group}}", @@ -293,7 +293,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_compact_downsample_failed_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_compact_downsample_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_compact_downsample_failed_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_compact_downsample_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -382,7 +382,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_compact_garbage_collection_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "sum by (job) (rate(thanos_compact_garbage_collection_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "garbage collection {{job}}", @@ -461,7 +461,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_compact_garbage_collection_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_compact_garbage_collection_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_compact_garbage_collection_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_compact_garbage_collection_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -531,34 +531,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": 
"#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_compact_garbage_collection_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_compact_garbage_collection_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}}", + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, "refId": "A", "step": 10 }, { - "expr": "sum(rate(thanos_compact_garbage_collection_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_compact_garbage_collection_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_compact_garbage_collection_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}}", - "refId": "B", + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(thanos_compact_garbage_collection_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_compact_garbage_collection_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}}", - "refId": "C", + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -643,7 +671,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_compact_blocks_cleaned_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "sum by (job) (rate(thanos_compact_blocks_cleaned_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Blocks cleanup {{job}}", @@ -720,7 +748,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_compact_block_cleanup_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "sum by (job) (rate(thanos_compact_block_cleanup_failures_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Blocks cleanup failures {{job}}", @@ -797,7 +825,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_compact_blocks_marked_for_deletion_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "sum by (job) (rate(thanos_compact_blocks_marked_for_deletion_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Blocks marked {{job}}", @@ -886,7 +914,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_blocks_meta_syncs_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "sum by (job) (rate(thanos_blocks_meta_syncs_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "sync {{job}}", @@ -965,7 +993,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_blocks_meta_sync_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / 
sum(rate(thanos_blocks_meta_syncs_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_blocks_meta_sync_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_blocks_meta_syncs_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -1035,34 +1063,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_blocks_meta_sync_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_blocks_meta_sync_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}}", + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, "refId": "A", "step": 10 }, { - "expr": "sum(rate(thanos_blocks_meta_sync_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_blocks_meta_sync_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_blocks_meta_sync_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}}", - "refId": "B", + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(thanos_blocks_meta_sync_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_blocks_meta_sync_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}}", - "refId": "C", + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -1147,7 +1203,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, operation)", + "expr": "sum by (job, operation) (rate(thanos_objstore_bucket_operations_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{operation}}", @@ -1226,7 +1282,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_objstore_bucket_operation_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -1296,34 +1352,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + 
"fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}}", + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, "refId": "A", "step": 10 }, { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}}", - "refId": "B", + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}}", - "refId": "C", + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -1407,7 +1491,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_alloc_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc all {{instance}}", @@ -1415,7 +1499,7 @@ "step": 10 }, { - "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_heap_alloc_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc heap {{instance}}", @@ -1423,7 +1507,7 @@ "step": 10 }, { - "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\",job=~\"$job\"}[30s])", + "expr": "rate(go_memstats_alloc_bytes_total{job=\"$job\"})[30s]", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc rate all {{instance}}", @@ -1431,7 +1515,7 @@ "step": 10 }, { - "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}[30s])", + "expr": "rate(go_memstats_heap_alloc_bytes{job=\"$job\"})[30s]", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc rate heap {{instance}}", @@ -1439,18 +1523,18 @@ "step": 10 }, { - "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_stack_inuse_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "inuse stack {{instance}}", + "legendFormat": "inuse heap {{instance}}", "legendLink": null, "step": 10 }, { - "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": 
"go_memstats_heap_inuse_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "inuse heap {{instance}}", + "legendFormat": "inuse stack {{instance}}", "legendLink": null, "step": 10 } @@ -1523,7 +1607,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_goroutines{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_goroutines{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -1599,7 +1683,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_gc_duration_seconds{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_gc_duration_seconds{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{quantile}} {{instance}}", @@ -1661,8 +1745,8 @@ "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -1674,27 +1758,22 @@ "type": "datasource" }, { - "allValue": null, - "current": { }, - "datasource": "$datasource", + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ ], - "query": "label_values(thanos_status{}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" }, { - "allValue": "thanos-compact.*", + "allValue": null, "current": { "text": "all", "value": "$__all" @@ -1706,7 +1785,7 @@ "multi": false, "name": "job", "options": [ ], - "query": "label_values(up{namespace=\"$namespace\", job=~\"thanos-compact.*\"}, job)", + "query": "label_values(up{job=~\"thanos-compact.*\"}, job)", "refresh": 1, "regex": "", "sort": 2, @@ -1715,21 +1794,6 @@ "tagsQuery": "", "type": "query", "useTags": false - }, - { - "auto": true, - "auto_count": 300, - "auto_min": "10s", - "current": { - "text": "5m", - "value": "5m" - }, - "hide": 0, - "label": "interval", - "name": "interval", - "query": "5m,10m,30m,1h,6h,12h", - "refresh": 2, - "type": "interval" } ] }, diff --git a/examples/dashboards/overview.json b/examples/dashboards/overview.json index d8e7c530cf..36ff710744 100644 --- a/examples/dashboards/overview.json +++ b/examples/dashboards/overview.json @@ -14,15 +14,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "error": "#E24D42", - "success": "#7EB26D" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -55,17 +47,38 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/1../", + "color": "#EAB839" + }, + { + "alias": "/2../", + "color": "#37872D" + }, + { + "alias": "/3../", + "color": "#E0B400" + }, + { + "alias": "/4../", + "color": "#1F60C4" + }, + { + "alias": "/5../", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(label_replace(rate(http_requests_total{namespace=\"$namespace\",job=~\"thanos-query.*\",handler=\"query\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, handler, status_code)", + "expr": "sum by (job, handler, code) (rate(http_requests_total{handler=\"query\"}[$interval]))", "format": "time_series", 
"intervalFactor": 2, - "legendFormat": "{{job}} {{handler}} {{status_code}}", + "legendFormat": "{{job}} {{handler}} {{code}}", "refId": "A", "step": 10 } @@ -149,7 +162,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"thanos-query.*\",handler=\"query\",code=~\"5..\"}[$interval])) / sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"thanos-query.*\",handler=\"query\"}[$interval]))", + "expr": "sum by (job) (rate(http_requests_total{handler=\"query\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"query\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -234,7 +247,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"thanos-query.*\",handler=\"query\"}[$interval])) by (job, le))", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{handler=\"query\"}[$interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} P99", @@ -308,15 +321,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "error": "#E24D42", - "success": "#7EB26D" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -349,17 +354,38 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/1../", + "color": "#EAB839" + }, + { + "alias": "/2../", + "color": "#37872D" + }, + { + "alias": "/3../", + "color": "#E0B400" + }, + { + "alias": "/4../", + "color": "#1F60C4" + }, + { + "alias": "/5../", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(label_replace(rate(http_requests_total{namespace=\"$namespace\",job=~\"thanos-query.*\",handler=\"query_range\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, handler, status_code)", + "expr": "sum by (job, handler, code) (rate(http_requests_total{handler=\"query_range\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{job}} {{handler}} {{status_code}}", + "legendFormat": "{{job}} {{handler}} {{code}}", "refId": "A", "step": 10 } @@ -443,7 +469,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"thanos-query.*\",handler=\"query_range\",code=~\"5..\"}[$interval])) / sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"thanos-query.*\",handler=\"query_range\"}[$interval]))", + "expr": "sum by (job) (rate(http_requests_total{handler=\"query_range\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"query_range\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -528,7 +554,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"thanos-query.*\",handler=\"query_range\"}[$interval])) by (job, le))", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{handler=\"query_range\"}[$interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} P99", @@ -602,26 +628,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "Aborted": "#EAB839", - 
"AlreadyExists": "#7EB26D", - "Canceled": "#E24D42", - "DataLoss": "#E24D42", - "DeadlineExceeded": "#E24D42", - "FailedPrecondition": "#6ED0E0", - "Internal": "#E24D42", - "InvalidArgument": "#EF843C", - "NotFound": "#EF843C", - "OK": "#7EB26D", - "OutOfRange": "#E24D42", - "PermissionDenied": "#EF843C", - "ResourceExhausted": "#E24D42", - "Unauthenticated": "#EF843C", - "Unavailable": "#E24D42", - "Unimplemented": "#6ED0E0", - "Unknown": "#E24D42", - "error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -654,14 +661,87 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"thanos-store.*\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)", + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", @@ -748,7 +828,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\",job=~\"thanos-store.*\",grpc_type=\"unary\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"thanos-store.*\",grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -833,7 +913,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type=\"unary\",namespace=\"$namespace\",job=~\"thanos-store.*\"}[$interval])) by (job, le))", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{grpc_type=\"unary\"}[$interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} P99", @@ -907,26 +987,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "Aborted": "#EAB839", - "AlreadyExists": "#7EB26D", - "Canceled": "#E24D42", - "DataLoss": "#E24D42", - "DeadlineExceeded": "#E24D42", - 
"FailedPrecondition": "#6ED0E0", - "Internal": "#E24D42", - "InvalidArgument": "#EF843C", - "NotFound": "#EF843C", - "OK": "#7EB26D", - "OutOfRange": "#E24D42", - "PermissionDenied": "#EF843C", - "ResourceExhausted": "#E24D42", - "Unauthenticated": "#EF843C", - "Unavailable": "#E24D42", - "Unimplemented": "#6ED0E0", - "Unknown": "#E24D42", - "error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -959,14 +1020,87 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"thanos-sidecar.*\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)", + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", @@ -1053,7 +1187,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\",job=~\"thanos-sidecar.*\",grpc_type=\"unary\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"thanos-sidecar.*\",grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -1138,7 +1272,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type=\"unary\",namespace=\"$namespace\",job=~\"thanos-sidecar.*\"}[$interval])) by (job, le))", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{grpc_type=\"unary\"}[$interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} P99", @@ -1212,15 +1346,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "error": "#E24D42", - "success": "#7EB26D" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -1253,17 +1379,38 @@ 
"pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/1../", + "color": "#EAB839" + }, + { + "alias": "/2../", + "color": "#37872D" + }, + { + "alias": "/3../", + "color": "#E0B400" + }, + { + "alias": "/4../", + "color": "#1F60C4" + }, + { + "alias": "/5../", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(label_replace(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"thanos-receive.*\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, handler, status_code)", + "expr": "sum by (job, handler, code) (rate(http_requests_total{handler=\"receive\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{job}} {{handler}} {{status_code}}", + "legendFormat": "{{job}} {{handler}} {{code}}", "refId": "A", "step": 10 } @@ -1347,7 +1494,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"thanos-receive.*\",code=~\"5..\"}[$interval])) / sum(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"thanos-receive.*\"}[$interval]))", + "expr": "sum by (job) (rate(http_requests_total{handler=\"receive\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"receive\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -1432,7 +1579,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler=\"receive\",namespace=\"$namespace\",job=~\"thanos-receive.*\"}[$interval])) by (job, le))", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{handler=\"receive\"}[$interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} P99", @@ -1546,7 +1693,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\",job=~\"thanos-rule.*\"}[$interval])) by (job, alertmanager)", + "expr": "sum by (job, alertmanager) (rate(thanos_alert_sender_alerts_sent_total{}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{alertmanager}}", @@ -1633,7 +1780,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_alert_sender_errors_total{namespace=\"$namespace\",job=~\"thanos-rule.*\"}[$interval])) / sum(rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\",job=~\"thanos-rule.*\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_alert_sender_errors_total{}[$interval])) / sum by (job) (rate(thanos_alert_sender_alerts_sent_total{}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -1718,7 +1865,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_alert_sender_latency_seconds_bucket{namespace=\"$namespace\",job=~\"thanos-rule.*\"}[$interval])) by (job, le))", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_alert_sender_latency_seconds_bucket{}[$interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} P99", @@ -1832,7 +1979,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_compact_group_compactions_total{namespace=\"$namespace\",job=~\"thanos-compact.*\"}[$interval])) by (job)", + "expr": "sum by (job) 
(rate(thanos_compact_group_compactions_total{}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "compaction {{job}}", @@ -1919,7 +2066,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_compact_group_compactions_failures_total{namespace=\"$namespace\",job=~\"thanos-compact.*\"}[$interval])) / sum(rate(thanos_compact_group_compactions_total{namespace=\"$namespace\",job=~\"thanos-compact.*\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_compact_group_compactions_failures_total{}[$interval])) / sum by (job) (rate(thanos_compact_group_compactions_total{}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -1981,8 +2128,8 @@ "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -1993,26 +2140,6 @@ "regex": "", "type": "datasource" }, - { - "allValue": null, - "current": { }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ ], - "query": "label_values(thanos_status{}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, { "auto": true, "auto_count": 300, diff --git a/examples/dashboards/query.json b/examples/dashboards/query.json index d6c06b9537..01a9138422 100644 --- a/examples/dashboards/query.json +++ b/examples/dashboards/query.json @@ -14,15 +14,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "error": "#E24D42", - "success": "#7EB26D" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -47,17 +39,38 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/1../", + "color": "#EAB839" + }, + { + "alias": "/2../", + "color": "#37872D" + }, + { + "alias": "/3../", + "color": "#E0B400" + }, + { + "alias": "/4../", + "color": "#1F60C4" + }, + { + "alias": "/5../", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(label_replace(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\",handler=\"query\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, handler, status_code)", + "expr": "sum by (job, handler, code) (rate(http_requests_total{job=\"$job\", handler=\"query\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{job}} {{handler}} {{status_code}}", + "legendFormat": "{{job}} {{handler}} {{code}}", "refId": "A", "step": 10 } @@ -133,7 +146,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\",handler=\"query\",code=~\"5..\"}[$interval])) / sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\",handler=\"query\"}[$interval]))", + "expr": "sum by (job) (rate(http_requests_total{job=\"$job\", handler=\"query\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=\"$job\", handler=\"query\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -203,34 +216,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + 
}, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",handler=\"query\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=\"$job\", handler=\"query\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}}", + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, "refId": "A", "step": 10 }, { - "expr": "sum(rate(http_request_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\",handler=\"query\"}[$interval])) by (job) * 1 / sum(rate(http_request_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\",handler=\"query\"}[$interval])) by (job)", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=\"$job\", handler=\"query\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}}", - "refId": "B", + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",handler=\"query\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=\"$job\", handler=\"query\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}}", - "refId": "C", + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -283,15 +324,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "error": "#E24D42", - "success": "#7EB26D" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -316,17 +349,38 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/1../", + "color": "#EAB839" + }, + { + "alias": "/2../", + "color": "#37872D" + }, + { + "alias": "/3../", + "color": "#E0B400" + }, + { + "alias": "/4../", + "color": "#1F60C4" + }, + { + "alias": "/5../", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(label_replace(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, handler, status_code)", + "expr": "sum by (job, handler, code) (rate(http_requests_total{job=\"$job\", handler=\"query_range\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{job}} {{handler}} {{status_code}}", + "legendFormat": "{{job}} {{handler}} {{code}}", "refId": "A", "step": 10 } @@ -402,7 +456,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\",code=~\"5..\"}[$interval])) / sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\"}[$interval]))", + "expr": "sum by (job) (rate(http_requests_total{job=\"$job\", 
handler=\"query_range\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=\"$job\", handler=\"query_range\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -472,34 +526,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=\"$job\", handler=\"query_range\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}}", + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, "refId": "A", "step": 10 }, { - "expr": "sum(rate(http_request_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\"}[$interval])) by (job) * 1 / sum(rate(http_request_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\"}[$interval])) by (job)", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=\"$job\", handler=\"query_range\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}}", - "refId": "B", + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=\"$job\", handler=\"query_range\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}}", - "refId": "C", + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -552,26 +634,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "Aborted": "#EAB839", - "AlreadyExists": "#7EB26D", - "Canceled": "#E24D42", - "DataLoss": "#E24D42", - "DeadlineExceeded": "#E24D42", - "FailedPrecondition": "#6ED0E0", - "Internal": "#E24D42", - "InvalidArgument": "#EF843C", - "NotFound": "#EF843C", - "OK": "#7EB26D", - "OutOfRange": "#E24D42", - "PermissionDenied": "#EF843C", - "ResourceExhausted": "#E24D42", - "Unauthenticated": "#EF843C", - "Unavailable": "#E24D42", - "Unimplemented": "#6ED0E0", - "Unknown": "#E24D42", - "error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -596,14 +659,87 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", 
+ "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_client_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)", + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_client_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", @@ -682,7 +818,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) / sum(rate(grpc_client_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (job) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_client_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -752,34 +888,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "sum(rate(grpc_client_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_client_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, 
sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -832,26 +996,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "Aborted": "#EAB839", - "AlreadyExists": "#7EB26D", - "Canceled": "#E24D42", - "DataLoss": "#E24D42", - "DeadlineExceeded": "#E24D42", - "FailedPrecondition": "#6ED0E0", - "Internal": "#E24D42", - "InvalidArgument": "#EF843C", - "NotFound": "#EF843C", - "OK": "#7EB26D", - "OutOfRange": "#E24D42", - "PermissionDenied": "#EF843C", - "ResourceExhausted": "#E24D42", - "Unauthenticated": "#EF843C", - "Unavailable": "#E24D42", - "Unimplemented": "#6ED0E0", - "Unknown": "#E24D42", - "error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -876,14 +1021,87 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_client_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)", + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_client_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", @@ -962,7 +1180,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) / sum(rate(grpc_client_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (job) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_client_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, 
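The latency panels likewise switch from a p99/mean/p50 trio to p50/p90/p99, each computed from the shared histogram by rating the `le` buckets, summing them per job, and interpolating with `histogram_quantile`. Roughly, for the gRPC client panels above, under the same `$job`/`$interval` assumptions:

```
# p50 / p90 / p99 of unary gRPC client call duration.
histogram_quantile(0.50, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job="$job", grpc_type="unary"}[$interval])))
histogram_quantile(0.90, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job="$job", grpc_type="unary"}[$interval])))
histogram_quantile(0.99, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job="$job", grpc_type="unary"}[$interval])))
```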
"legendFormat": "error", @@ -1032,34 +1250,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "sum(rate(grpc_client_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_client_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_client_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -1144,7 +1390,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_query_store_apis_dns_lookups_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "lookups {{job}}", @@ -1223,7 +1469,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_query_store_apis_dns_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_query_store_apis_dns_lookups_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -1311,7 +1557,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_alloc_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc all {{instance}}", @@ -1319,7 +1565,7 @@ "step": 10 }, { 
- "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_heap_alloc_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc heap {{instance}}", @@ -1327,7 +1573,7 @@ "step": 10 }, { - "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\",job=~\"$job\"}[30s])", + "expr": "rate(go_memstats_alloc_bytes_total{job=\"$job\"})[30s]", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc rate all {{instance}}", @@ -1335,7 +1581,7 @@ "step": 10 }, { - "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}[30s])", + "expr": "rate(go_memstats_heap_alloc_bytes{job=\"$job\"})[30s]", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc rate heap {{instance}}", @@ -1343,18 +1589,18 @@ "step": 10 }, { - "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_stack_inuse_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "inuse stack {{instance}}", + "legendFormat": "inuse heap {{instance}}", "legendLink": null, "step": 10 }, { - "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_heap_inuse_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "inuse heap {{instance}}", + "legendFormat": "inuse stack {{instance}}", "legendLink": null, "step": 10 } @@ -1427,7 +1673,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_goroutines{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_goroutines{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -1503,7 +1749,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_gc_duration_seconds{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_gc_duration_seconds{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{quantile}} {{instance}}", @@ -1565,8 +1811,8 @@ "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -1578,27 +1824,22 @@ "type": "datasource" }, { - "allValue": null, - "current": { }, - "datasource": "$datasource", + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ ], - "query": "label_values(thanos_status{}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" }, { - "allValue": "thanos-query.*", + "allValue": null, "current": { "text": "all", "value": "$__all" @@ -1610,7 +1851,7 @@ "multi": false, "name": "job", "options": [ ], - "query": "label_values(up{namespace=\"$namespace\", job=~\"thanos-query.*\"}, job)", + "query": "label_values(up{job=~\"thanos-query.*\"}, job)", "refresh": 1, "regex": "", "sort": 2, @@ -1619,21 +1860,6 @@ "tagsQuery": "", "type": "query", "useTags": false - }, - { - "auto": true, - "auto_count": 300, - "auto_min": "10s", - "current": { - "text": "5m", - "value": "5m" - }, - "hide": 0, - "label": "interval", - "name": "interval", - "query": "5m,10m,30m,1h,6h,12h", - "refresh": 2, - "type": "interval" } ] }, diff --git a/examples/dashboards/receive.json 
b/examples/dashboards/receive.json index 279b153617..de0addf3c3 100644 --- a/examples/dashboards/receive.json +++ b/examples/dashboards/receive.json @@ -14,15 +14,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "error": "#E24D42", - "success": "#7EB26D" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -47,17 +39,38 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/1../", + "color": "#EAB839" + }, + { + "alias": "/2../", + "color": "#37872D" + }, + { + "alias": "/3../", + "color": "#E0B400" + }, + { + "alias": "/4../", + "color": "#1F60C4" + }, + { + "alias": "/5../", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(label_replace(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, handler, status_code)", + "expr": "sum by (job, handler, code) (rate(http_requests_total{job=\"$job\", handler=\"receive\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{job}} {{handler}} {{status_code}}", + "legendFormat": "{{job}} {{handler}} {{code}}", "refId": "A", "step": 10 } @@ -133,7 +146,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\",code=~\"5..\"}[$interval])) / sum(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(http_requests_total{job=\"$job\", handler=\"receive\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=\"$job\", handler=\"receive\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -203,34 +216,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=\"$job\", handler=\"receive\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}}", + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, "refId": "A", "step": 10 }, { - "expr": "sum(rate(http_request_duration_seconds_sum{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(http_request_duration_seconds_count{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=\"$job\", handler=\"receive\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}}", - "refId": "B", + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + 
"refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=\"$job\", handler=\"receive\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}}", - "refId": "C", + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -315,7 +356,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_receive_replications_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "sum by (job) (rate(thanos_receive_replications_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "all {{job}}", @@ -394,7 +435,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_receive_replications_total{namespace=\"$namespace\",job=~\"$job\",result=\"error\"}[$interval])) / sum(rate(thanos_receive_replications_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_receive_replications_total{job=\"$job\", result=\"error\"}[$interval])) / sum by (job) (rate(thanos_receive_replications_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -483,7 +524,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_receive_forward_requests_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "sum by (job) (rate(thanos_receive_forward_requests_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "all {{job}}", @@ -562,7 +603,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_receive_forward_requests_total{namespace=\"$namespace\",job=~\"$job\",result=\"error\"}[$interval])) / sum(rate(thanos_receive_forward_requests_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_receive_forward_requests_total{job=\"$job\", result=\"error\"}[$interval])) / sum by (job) (rate(thanos_receive_forward_requests_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -619,26 +660,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "Aborted": "#EAB839", - "AlreadyExists": "#7EB26D", - "Canceled": "#E24D42", - "DataLoss": "#E24D42", - "DeadlineExceeded": "#E24D42", - "FailedPrecondition": "#6ED0E0", - "Internal": "#E24D42", - "InvalidArgument": "#EF843C", - "NotFound": "#EF843C", - "OK": "#7EB26D", - "OutOfRange": "#E24D42", - "PermissionDenied": "#EF843C", - "ResourceExhausted": "#E24D42", - "Unauthenticated": "#EF843C", - "Unavailable": "#E24D42", - "Unimplemented": "#6ED0E0", - "Unknown": "#E24D42", - "error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -663,14 +685,87 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": 
"#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method=\"RemoteWrite\"}[$interval])) by (job, grpc_method, grpc_code)", + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", @@ -749,7 +844,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method=\"RemoteWrite\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method=\"RemoteWrite\"}[$interval]))", + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -819,34 +914,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method=\"RemoteWrite\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method=\"RemoteWrite\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method=\"RemoteWrite\"}[$interval])) by (job)\n", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\", 
grpc_method=\"RemoteWrite\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method=\"RemoteWrite\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -899,26 +1022,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "Aborted": "#EAB839", - "AlreadyExists": "#7EB26D", - "Canceled": "#E24D42", - "DataLoss": "#E24D42", - "DeadlineExceeded": "#E24D42", - "FailedPrecondition": "#6ED0E0", - "Internal": "#E24D42", - "InvalidArgument": "#EF843C", - "NotFound": "#EF843C", - "OK": "#7EB26D", - "OutOfRange": "#E24D42", - "PermissionDenied": "#EF843C", - "ResourceExhausted": "#E24D42", - "Unauthenticated": "#EF843C", - "Unavailable": "#E24D42", - "Unimplemented": "#6ED0E0", - "Unknown": "#E24D42", - "error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -943,14 +1047,87 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method!=\"RemoteWrite\"}[$interval])) by (job, grpc_method, grpc_code)", + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", @@ -1029,7 +1206,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method!=\"RemoteWrite\"}[$interval])) / 
sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method!=\"RemoteWrite\"}[$interval]))", + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -1099,34 +1276,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method!=\"RemoteWrite\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method!=\"RemoteWrite\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method!=\"RemoteWrite\"}[$interval])) by (job)\n", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\",grpc_method!=\"RemoteWrite\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -1179,26 +1384,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "Aborted": "#EAB839", - "AlreadyExists": "#7EB26D", - "Canceled": "#E24D42", - "DataLoss": "#E24D42", - "DeadlineExceeded": "#E24D42", - "FailedPrecondition": "#6ED0E0", - "Internal": "#E24D42", - "InvalidArgument": "#EF843C", - "NotFound": "#EF843C", - "OK": "#7EB26D", - "OutOfRange": "#E24D42", - "PermissionDenied": "#EF843C", - "ResourceExhausted": "#E24D42", - "Unauthenticated": "#EF843C", - "Unavailable": "#E24D42", - "Unimplemented": "#6ED0E0", - "Unknown": "#E24D42", - 
"error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -1223,14 +1409,87 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)", + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", @@ -1309,7 +1568,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -1379,34 +1638,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": 
"sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -1503,6 +1790,7 @@ "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value", @@ -1524,7 +1812,7 @@ ], "targets": [ { - "expr": "time() - max(thanos_objstore_bucket_last_successful_upload_time{namespace=\"$namespace\",job=~\"$job\"}) by (job, bucket)", + "expr": "time() - max by (job, bucket) (thanos_objstore_bucket_last_successful_upload_time{job=\"$job\"})", "format": "table", "instant": true, "intervalFactor": 2, @@ -1614,7 +1902,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_alloc_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc all {{instance}}", @@ -1622,7 +1910,7 @@ "step": 10 }, { - "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_heap_alloc_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc heap {{instance}}", @@ -1630,7 +1918,7 @@ "step": 10 }, { - "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\",job=~\"$job\"}[30s])", + "expr": "rate(go_memstats_alloc_bytes_total{job=\"$job\"})[30s]", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc rate all {{instance}}", @@ -1638,7 +1926,7 @@ "step": 10 }, { - "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}[30s])", + "expr": "rate(go_memstats_heap_alloc_bytes{job=\"$job\"})[30s]", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc rate heap {{instance}}", @@ -1646,18 +1934,18 @@ "step": 10 }, { - "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_stack_inuse_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "inuse stack {{instance}}", + "legendFormat": "inuse heap {{instance}}", "legendLink": null, "step": 10 }, { - "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_heap_inuse_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "inuse heap {{instance}}", + "legendFormat": "inuse stack {{instance}}", "legendLink": null, "step": 
10 } @@ -1730,7 +2018,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_goroutines{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_goroutines{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -1806,7 +2094,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_gc_duration_seconds{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_gc_duration_seconds{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{quantile}} {{instance}}", @@ -1868,8 +2156,8 @@ "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -1881,27 +2169,22 @@ "type": "datasource" }, { - "allValue": null, - "current": { }, - "datasource": "$datasource", + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ ], - "query": "label_values(thanos_status{}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" }, { - "allValue": "thanos-receive.*", + "allValue": null, "current": { "text": "all", "value": "$__all" @@ -1913,7 +2196,7 @@ "multi": false, "name": "job", "options": [ ], - "query": "label_values(up{namespace=\"$namespace\", job=~\"thanos-receive.*\"}, job)", + "query": "label_values(up{job=~\"thanos-receive.*\"}, job)", "refresh": 1, "regex": "", "sort": 2, @@ -1922,21 +2205,6 @@ "tagsQuery": "", "type": "query", "useTags": false - }, - { - "auto": true, - "auto_count": 300, - "auto_min": "10s", - "current": { - "text": "5m", - "value": "5m" - }, - "hide": 0, - "label": "interval", - "name": "interval", - "query": "5m,10m,30m,1h,6h,12h", - "refresh": 2, - "type": "interval" } ] }, diff --git a/examples/dashboards/rule.json b/examples/dashboards/rule.json index faea9dc787..c0120c20bd 100644 --- a/examples/dashboards/rule.json +++ b/examples/dashboards/rule.json @@ -45,7 +45,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (strategy) (rate(prometheus_rule_evaluations_total{namespace=\"$namespace\",job=\"$job\"}[$interval]))\n", + "expr": "sum by (job, strategy) (rate(prometheus_rule_evaluations_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ strategy }}", @@ -121,7 +121,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (strategy) (increase(prometheus_rule_group_iterations_missed_total{namespace=\"$namespace\",job=\"$job\"}[$interval]))\n", + "expr": "sum by (job, strategy) (increase(prometheus_rule_group_iterations_missed_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ strategy }}", @@ -197,7 +197,7 @@ "steppedLine": false, "targets": [ { - "expr": "(\n max by(rule_group) (prometheus_rule_group_last_duration_seconds{namespace=\"$namespace\",job=\"$job\"})\n >\n sum by(rule_group) (prometheus_rule_group_interval_seconds{namespace=\"$namespace\",job=\"$job\"})\n)\n", + "expr": "(\n max by(job, rule_group) (prometheus_rule_group_last_duration_seconds{job=\"$job\"})\n >\n sum by(job, rule_group) (prometheus_rule_group_interval_seconds{job=\"$job\"})\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ rule_group }}", @@ 
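With the `namespace` variable removed, each dashboard is parameterised only by a datasource, an `interval` dropdown (5m, 10m, 30m, 1h, 6h, 12h), and a `job` variable whose options are discovered from the `up` metric. A sketch of that Grafana variable query, using the receive dashboard's job pattern:

```
# Grafana template-variable query: list every job label value of
# targets whose job matches the thanos-receive pattern.
label_values(up{job=~"thanos-receive.*"}, job)
```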
-286,7 +286,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_alert_sender_alerts_dropped_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, alertmanager)", + "expr": "sum by (job, alertmanager) (rate(thanos_alert_sender_alerts_dropped_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{alertmanager}}", @@ -363,7 +363,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, alertmanager)", + "expr": "sum by (job, alertmanager) (rate(thanos_alert_sender_alerts_sent_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{alertmanager}}", @@ -442,7 +442,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_alert_sender_errors_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_alert_sender_errors_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_alert_sender_alerts_sent_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -512,34 +512,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_alert_sender_latency_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_alert_sender_latency_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}}", + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, "refId": "A", "step": 10 }, { - "expr": "sum(rate(thanos_alert_sender_latency_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_alert_sender_latency_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_alert_sender_latency_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}}", - "refId": "B", + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(thanos_alert_sender_latency_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_alert_sender_latency_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}}", - "refId": "C", + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -624,7 +652,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_alert_queue_alerts_dropped_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "sum by (job) 
(rate(thanos_alert_queue_alerts_dropped_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}}", @@ -703,7 +731,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_alert_queue_alerts_dropped_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_alert_queue_alerts_pushed_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_alert_queue_alerts_pushed_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -760,26 +788,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "Aborted": "#EAB839", - "AlreadyExists": "#7EB26D", - "Canceled": "#E24D42", - "DataLoss": "#E24D42", - "DeadlineExceeded": "#E24D42", - "FailedPrecondition": "#6ED0E0", - "Internal": "#E24D42", - "InvalidArgument": "#EF843C", - "NotFound": "#EF843C", - "OK": "#7EB26D", - "OutOfRange": "#E24D42", - "PermissionDenied": "#EF843C", - "ResourceExhausted": "#E24D42", - "Unauthenticated": "#EF843C", - "Unavailable": "#E24D42", - "Unimplemented": "#6ED0E0", - "Unknown": "#E24D42", - "error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -804,14 +813,87 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)", + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", @@ -890,7 +972,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) 
(rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -960,34 +1042,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -1040,26 +1150,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "Aborted": "#EAB839", - "AlreadyExists": "#7EB26D", - "Canceled": "#E24D42", - "DataLoss": "#E24D42", - "DeadlineExceeded": "#E24D42", - "FailedPrecondition": "#6ED0E0", - "Internal": "#E24D42", - "InvalidArgument": "#EF843C", - "NotFound": "#EF843C", - "OK": "#7EB26D", - "OutOfRange": "#E24D42", - "PermissionDenied": "#EF843C", - "ResourceExhausted": "#E24D42", - "Unauthenticated": "#EF843C", - "Unavailable": "#E24D42", - "Unimplemented": "#6ED0E0", - "Unknown": "#E24D42", - "error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -1084,14 +1175,87 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": 
"/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)", + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", @@ -1170,7 +1334,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -1240,34 +1404,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p90 {{job}}", + "logBase": 10, 
+ "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -1351,7 +1543,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_alloc_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc all {{instance}}", @@ -1359,7 +1551,7 @@ "step": 10 }, { - "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_heap_alloc_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc heap {{instance}}", @@ -1367,7 +1559,7 @@ "step": 10 }, { - "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\",job=~\"$job\"}[30s])", + "expr": "rate(go_memstats_alloc_bytes_total{job=\"$job\"})[30s]", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc rate all {{instance}}", @@ -1375,7 +1567,7 @@ "step": 10 }, { - "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}[30s])", + "expr": "rate(go_memstats_heap_alloc_bytes{job=\"$job\"})[30s]", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc rate heap {{instance}}", @@ -1383,18 +1575,18 @@ "step": 10 }, { - "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_stack_inuse_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "inuse stack {{instance}}", + "legendFormat": "inuse heap {{instance}}", "legendLink": null, "step": 10 }, { - "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_heap_inuse_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "inuse heap {{instance}}", + "legendFormat": "inuse stack {{instance}}", "legendLink": null, "step": 10 } @@ -1467,7 +1659,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_goroutines{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_goroutines{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -1543,7 +1735,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_gc_duration_seconds{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_gc_duration_seconds{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{quantile}} {{instance}}", @@ -1605,8 +1797,8 @@ "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -1618,27 +1810,22 @@ "type": "datasource" }, { - "allValue": null, - "current": { }, - "datasource": "$datasource", + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ ], - "query": "label_values(thanos_status{}, namespace)", - "refresh": 1, - "regex": "", - 
"sort": 2, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" }, { - "allValue": "thanos-rule.*", + "allValue": null, "current": { "text": "all", "value": "$__all" @@ -1650,7 +1837,7 @@ "multi": false, "name": "job", "options": [ ], - "query": "label_values(up{namespace=\"$namespace\", job=~\"thanos-rule.*\"}, job)", + "query": "label_values(up{job=~\"thanos-rule.*\"}, job)", "refresh": 1, "regex": "", "sort": 2, @@ -1659,21 +1846,6 @@ "tagsQuery": "", "type": "query", "useTags": false - }, - { - "auto": true, - "auto_count": 300, - "auto_min": "10s", - "current": { - "text": "5m", - "value": "5m" - }, - "hide": 0, - "label": "interval", - "name": "interval", - "query": "5m,10m,30m,1h,6h,12h", - "refresh": 2, - "type": "interval" } ] }, diff --git a/examples/dashboards/sidecar.json b/examples/dashboards/sidecar.json index 33e47f0902..f4512e75a1 100644 --- a/examples/dashboards/sidecar.json +++ b/examples/dashboards/sidecar.json @@ -14,26 +14,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "Aborted": "#EAB839", - "AlreadyExists": "#7EB26D", - "Canceled": "#E24D42", - "DataLoss": "#E24D42", - "DeadlineExceeded": "#E24D42", - "FailedPrecondition": "#6ED0E0", - "Internal": "#E24D42", - "InvalidArgument": "#EF843C", - "NotFound": "#EF843C", - "OK": "#7EB26D", - "OutOfRange": "#E24D42", - "PermissionDenied": "#EF843C", - "ResourceExhausted": "#E24D42", - "Unauthenticated": "#EF843C", - "Unavailable": "#E24D42", - "Unimplemented": "#6ED0E0", - "Unknown": "#E24D42", - "error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -58,14 +39,87 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)", + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", @@ -144,7 +198,7 @@ "steppedLine": false, "targets": [ { - "expr": 
"sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -214,34 +268,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -294,26 +376,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "Aborted": "#EAB839", - "AlreadyExists": "#7EB26D", - "Canceled": "#E24D42", - "DataLoss": "#E24D42", - "DeadlineExceeded": "#E24D42", - "FailedPrecondition": "#6ED0E0", - "Internal": "#E24D42", - "InvalidArgument": "#EF843C", - "NotFound": "#EF843C", - "OK": "#7EB26D", - "OutOfRange": "#E24D42", - "PermissionDenied": "#EF843C", - "ResourceExhausted": "#E24D42", - "Unauthenticated": "#EF843C", - "Unavailable": "#E24D42", - "Unimplemented": "#6ED0E0", - "Unknown": "#E24D42", - "error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -338,14 +401,87 @@ 
"pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)", + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", @@ -423,7 +559,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -493,34 +629,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 
1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -617,6 +781,7 @@ "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, + "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", "pattern": "Value", @@ -638,7 +803,7 @@ ], "targets": [ { - "expr": "time() - max(thanos_objstore_bucket_last_successful_upload_time{namespace=\"$namespace\",job=~\"$job\"}) by (job, bucket)", + "expr": "time() - max by (job, bucket) (thanos_objstore_bucket_last_successful_upload_time{job=\"$job\"})", "format": "table", "instant": true, "intervalFactor": 2, @@ -728,7 +893,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, operation)", + "expr": "sum by (job, operation) (rate(thanos_objstore_bucket_operations_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{operation}}", @@ -806,7 +971,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_objstore_bucket_operation_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -875,34 +1040,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}}", + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, "refId": "A", "step": 10 }, { - 
"expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}}", - "refId": "B", + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}}", - "refId": "C", + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -986,7 +1179,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_alloc_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc all {{instance}}", @@ -994,7 +1187,7 @@ "step": 10 }, { - "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_heap_alloc_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc heap {{instance}}", @@ -1002,7 +1195,7 @@ "step": 10 }, { - "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\",job=~\"$job\"}[30s])", + "expr": "rate(go_memstats_alloc_bytes_total{job=\"$job\"})[30s]", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc rate all {{instance}}", @@ -1010,7 +1203,7 @@ "step": 10 }, { - "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}[30s])", + "expr": "rate(go_memstats_heap_alloc_bytes{job=\"$job\"})[30s]", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc rate heap {{instance}}", @@ -1018,18 +1211,18 @@ "step": 10 }, { - "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_stack_inuse_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "inuse stack {{instance}}", + "legendFormat": "inuse heap {{instance}}", "legendLink": null, "step": 10 }, { - "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_heap_inuse_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "inuse heap {{instance}}", + "legendFormat": "inuse stack {{instance}}", "legendLink": null, "step": 10 } @@ -1102,7 +1295,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_goroutines{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_goroutines{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -1178,7 +1371,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_gc_duration_seconds{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_gc_duration_seconds{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{quantile}} {{instance}}", @@ -1240,8 +1433,8 @@ "list": [ { "current": { - "text": "Prometheus", - "value": 
"Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -1253,27 +1446,22 @@ "type": "datasource" }, { - "allValue": null, - "current": { }, - "datasource": "$datasource", + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ ], - "query": "label_values(thanos_status{}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" }, { - "allValue": "thanos-sidecar.*", + "allValue": null, "current": { "text": "all", "value": "$__all" @@ -1285,7 +1473,7 @@ "multi": false, "name": "job", "options": [ ], - "query": "label_values(up{namespace=\"$namespace\", job=~\"thanos-sidecar.*\"}, job)", + "query": "label_values(up{job=~\"thanos-sidecar.*\"}, job)", "refresh": 1, "regex": "", "sort": 2, @@ -1294,21 +1482,6 @@ "tagsQuery": "", "type": "query", "useTags": false - }, - { - "auto": true, - "auto_count": 300, - "auto_min": "10s", - "current": { - "text": "5m", - "value": "5m" - }, - "hide": 0, - "label": "interval", - "name": "interval", - "query": "5m,10m,30m,1h,6h,12h", - "refresh": 2, - "type": "interval" } ] }, diff --git a/examples/dashboards/store.json b/examples/dashboards/store.json index 0e9710342d..71e70a9022 100644 --- a/examples/dashboards/store.json +++ b/examples/dashboards/store.json @@ -14,26 +14,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "Aborted": "#EAB839", - "AlreadyExists": "#7EB26D", - "Canceled": "#E24D42", - "DataLoss": "#E24D42", - "DeadlineExceeded": "#E24D42", - "FailedPrecondition": "#6ED0E0", - "Internal": "#E24D42", - "InvalidArgument": "#EF843C", - "NotFound": "#EF843C", - "OK": "#7EB26D", - "OutOfRange": "#E24D42", - "PermissionDenied": "#EF843C", - "ResourceExhausted": "#E24D42", - "Unauthenticated": "#EF843C", - "Unavailable": "#E24D42", - "Unimplemented": "#6ED0E0", - "Unknown": "#E24D42", - "error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -58,14 +39,87 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": 
"sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)", + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", @@ -144,7 +198,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -214,34 +268,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"unary\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -294,26 +376,7 @@ "height": "250px", "panels": [ { - "aliasColors": { - "Aborted": "#EAB839", - "AlreadyExists": "#7EB26D", - "Canceled": "#E24D42", - "DataLoss": "#E24D42", - "DeadlineExceeded": "#E24D42", - 
"FailedPrecondition": "#6ED0E0", - "Internal": "#E24D42", - "InvalidArgument": "#EF843C", - "NotFound": "#EF843C", - "OK": "#7EB26D", - "OutOfRange": "#E24D42", - "PermissionDenied": "#EF843C", - "ResourceExhausted": "#E24D42", - "Unauthenticated": "#EF843C", - "Unavailable": "#E24D42", - "Unimplemented": "#6ED0E0", - "Unknown": "#E24D42", - "error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, @@ -338,14 +401,87 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "/Aborted/", + "color": "#EAB839" + }, + { + "alias": "/AlreadyExists/", + "color": "#37872D" + }, + { + "alias": "/FailedPrecondition/", + "color": "#E0B400" + }, + { + "alias": "/Unimplemented/", + "color": "#E0B400" + }, + { + "alias": "/InvalidArgument/", + "color": "#1F60C4" + }, + { + "alias": "/NotFound/", + "color": "#1F60C4" + }, + { + "alias": "/PermissionDenied/", + "color": "#1F60C4" + }, + { + "alias": "/Unauthenticated/", + "color": "#1F60C4" + }, + { + "alias": "/Canceled/", + "color": "#C4162A" + }, + { + "alias": "/DataLoss/", + "color": "#C4162A" + }, + { + "alias": "/DeadlineExceeded/", + "color": "#C4162A" + }, + { + "alias": "/Internal/", + "color": "#C4162A" + }, + { + "alias": "/OutOfRange/", + "color": "#C4162A" + }, + { + "alias": "/ResourceExhausted/", + "color": "#C4162A" + }, + { + "alias": "/Unavailable/", + "color": "#C4162A" + }, + { + "alias": "/Unknown/", + "color": "#C4162A" + }, + { + "alias": "/OK/", + "color": "#37872D" + }, + { + "alias": "error", + "color": "#C4162A" + } + ], "spaceLength": 10, "span": 4, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)", + "expr": "sum by (job, grpc_method, grpc_code) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", @@ -424,7 +560,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -494,34 +630,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) 
(rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(grpc_server_handling_seconds_bucket{job=\"$job\", grpc_type=\"server_stream\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}} {{grpc_method}}", - "legendLink": null, + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -606,7 +770,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, operation)", + "expr": "sum by (job, operation) (rate(thanos_objstore_bucket_operations_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{operation}}", @@ -683,7 +847,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job, operation) (rate(thanos_objstore_bucket_operation_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum by (job, operation) (rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job, operation) (rate(thanos_objstore_bucket_operation_failures_total{job=\"$job\"}[$interval])) / sum by (job, operation) (rate(thanos_objstore_bucket_operations_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{operation}}", @@ -760,7 +924,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, operation, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, operation, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, "legendFormat": "P99 {{job}}", @@ -768,7 +932,7 @@ "step": 10 }, { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, operation) * 1 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, operation)", + "expr": "sum by (job, operation) (rate(thanos_objstore_bucket_operation_duration_seconds_sum{job=\"$job\"}[$interval])) * 1 / sum 
by (job, operation) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "mean {{job}}", @@ -776,7 +940,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, operation, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, operation, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, "legendFormat": "P50 {{job}}", @@ -865,7 +1029,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_bucket_store_block_loads_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_bucket_store_block_loads_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "block loads", @@ -944,7 +1108,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_bucket_store_block_load_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_bucket_store_block_loads_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_bucket_store_block_load_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_bucket_store_block_loads_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -1021,7 +1185,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_bucket_store_block_drops_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, operation)", + "expr": "sum by (job, operation) (rate(thanos_bucket_store_block_drops_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "block drops {{job}}", @@ -1100,7 +1264,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_bucket_store_block_drop_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_bucket_store_block_drops_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "expr": "sum by (job) (rate(thanos_bucket_store_block_drop_failures_total{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_bucket_store_block_drops_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "error", @@ -1189,7 +1353,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_store_index_cache_requests_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, item_type)", + "expr": "sum by (job, item_type) (rate(thanos_store_index_cache_requests_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{item_type}}", @@ -1266,7 +1430,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_store_index_cache_hits_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, item_type)", + "expr": "sum by (job, item_type) (rate(thanos_store_index_cache_hits_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{item_type}}", @@ -1343,7 +1507,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_store_index_cache_items_added_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, item_type)", + "expr": "sum by (job, item_type) (rate(thanos_store_index_cache_items_added_total{job=\"$job\"}[$interval]))", 
"format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{item_type}}", @@ -1420,7 +1584,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_store_index_cache_items_evicted_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, item_type)", + "expr": "sum by (job, item_type) (rate(thanos_store_index_cache_items_evicted_total{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{item_type}}", @@ -1509,7 +1673,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le))", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{job=\"$job\"}[$interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "P99", @@ -1517,7 +1681,7 @@ "step": 10 }, { - "expr": "sum(rate(thanos_bucket_store_sent_chunk_size_bytes_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) / sum(rate(thanos_bucket_store_sent_chunk_size_bytes_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "sum by (job) (rate(thanos_bucket_store_sent_chunk_size_bytes_sum{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_bucket_store_sent_chunk_size_bytes_count{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "mean", @@ -1525,7 +1689,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.99, sum(rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le))", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{job=\"$job\"}[$interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "P50", @@ -1613,7 +1777,7 @@ "steppedLine": false, "targets": [ { - "expr": "thanos_bucket_store_series_blocks_queried{namespace=\"$namespace\",job=~\"$job\",quantile=\"0.99\"}", + "expr": "thanos_bucket_store_series_blocks_queried{job=\"$job\", quantile=\"0.99\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "P99", @@ -1621,7 +1785,7 @@ "step": 10 }, { - "expr": "sum(rate(thanos_bucket_store_series_blocks_queried_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) / sum(rate(thanos_bucket_store_series_blocks_queried_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "sum by (job) (rate(thanos_bucket_store_series_blocks_queried_sum{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_bucket_store_series_blocks_queried_count{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "mean {{job}}", @@ -1629,7 +1793,7 @@ "step": 10 }, { - "expr": "thanos_bucket_store_series_blocks_queried{namespace=\"$namespace\",job=~\"$job\",quantile=\"0.50\"}", + "expr": "thanos_bucket_store_series_blocks_queried{job=\"$job\", quantile=\"0.50\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "P50", @@ -1706,7 +1870,7 @@ "steppedLine": false, "targets": [ { - "expr": "thanos_bucket_store_series_data_fetched{namespace=\"$namespace\",job=~\"$job\",quantile=\"0.99\"}", + "expr": "thanos_bucket_store_series_data_fetched{job=\"$job\", quantile=\"0.99\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "P99", @@ -1714,7 +1878,7 @@ "step": 10 }, { - "expr": 
"sum(rate(thanos_bucket_store_series_data_fetched_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) / sum(rate(thanos_bucket_store_series_data_fetched_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "sum by (job) (rate(thanos_bucket_store_series_data_fetched_sum{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_bucket_store_series_data_fetched_count{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "mean {{job}}", @@ -1722,7 +1886,7 @@ "step": 10 }, { - "expr": "thanos_bucket_store_series_data_fetched{namespace=\"$namespace\",job=~\"$job\",quantile=\"0.50\"}", + "expr": "thanos_bucket_store_series_data_fetched{job=\"$job\", quantile=\"0.50\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "P50", @@ -1798,7 +1962,7 @@ "steppedLine": false, "targets": [ { - "expr": "thanos_bucket_store_series_result_series{namespace=\"$namespace\",job=~\"$job\",quantile=\"0.99\"}", + "expr": "thanos_bucket_store_series_result_series{job=\"$job\",quantile=\"0.99\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "P99", @@ -1806,7 +1970,7 @@ "step": 10 }, { - "expr": "sum(rate(thanos_bucket_store_series_result_series_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) / sum(rate(thanos_bucket_store_series_result_series_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "sum by (job) (rate(thanos_bucket_store_series_result_series_sum{job=\"$job\"}[$interval])) / sum by (job) (rate(thanos_bucket_store_series_result_series_count{job=\"$job\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "mean {{job}}", @@ -1814,7 +1978,7 @@ "step": 10 }, { - "expr": "thanos_bucket_store_series_result_series{namespace=\"$namespace\",job=~\"$job\",quantile=\"0.50\"}", + "expr": "thanos_bucket_store_series_result_series{job=\"$job\",quantile=\"0.50\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "P50", @@ -1896,34 +2060,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_bucket_store_series_get_all_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_bucket_store_series_get_all_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}}", + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, "refId": "A", "step": 10 }, { - "expr": "sum(rate(thanos_bucket_store_series_get_all_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_bucket_store_series_get_all_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_bucket_store_series_get_all_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}}", - "refId": "B", + "legendFormat": "p90 {{job}}", + 
"logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(thanos_bucket_store_series_get_all_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_get_all_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}}", - "refId": "C", + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -1989,34 +2181,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_bucket_store_series_merge_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_bucket_store_series_merge_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}}", + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, "refId": "A", "step": 10 }, { - "expr": "sum(rate(thanos_bucket_store_series_merge_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_bucket_store_series_merge_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_bucket_store_series_merge_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}}", - "refId": "B", + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(thanos_bucket_store_series_merge_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_merge_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}}", - "refId": "C", + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -2082,34 +2302,62 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "p99", + "color": "#FA6400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p90", + "color": "#E0B400", + "fill": 1, + "fillGradient": 1 + }, + { + "alias": "p50", + "color": "#37872D", + "fill": 10, + "fillGradient": 0 + } + ], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(thanos_bucket_store_series_gate_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.50, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": 
"time_series", "intervalFactor": 2, - "legendFormat": "P99 {{job}}", + "legendFormat": "p50 {{job}}", + "logBase": 10, + "max": null, + "min": null, "refId": "A", "step": 10 }, { - "expr": "sum(rate(thanos_bucket_store_series_gate_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_bucket_store_series_gate_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "expr": "histogram_quantile(0.90, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "mean {{job}}", - "refId": "B", + "legendFormat": "p90 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(thanos_bucket_store_series_gate_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "expr": "histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=\"$job\"}[$interval]))) * 1", "format": "time_series", "intervalFactor": 2, - "legendFormat": "P50 {{job}}", - "refId": "C", + "legendFormat": "p99 {{job}}", + "logBase": 10, + "max": null, + "min": null, + "refId": "A", "step": 10 } ], @@ -2193,7 +2441,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_alloc_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc all {{instance}}", @@ -2201,7 +2449,7 @@ "step": 10 }, { - "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_heap_alloc_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc heap {{instance}}", @@ -2209,7 +2457,7 @@ "step": 10 }, { - "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\",job=~\"$job\"}[30s])", + "expr": "rate(go_memstats_alloc_bytes_total{job=\"$job\"})[30s]", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc rate all {{instance}}", @@ -2217,7 +2465,7 @@ "step": 10 }, { - "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\"}[30s])", + "expr": "rate(go_memstats_heap_alloc_bytes{job=\"$job\"})[30s]", "format": "time_series", "intervalFactor": 2, "legendFormat": "alloc rate heap {{instance}}", @@ -2225,18 +2473,18 @@ "step": 10 }, { - "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_stack_inuse_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "inuse stack {{instance}}", + "legendFormat": "inuse heap {{instance}}", "legendLink": null, "step": 10 }, { - "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_memstats_heap_inuse_bytes{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "inuse heap {{instance}}", + "legendFormat": "inuse stack {{instance}}", "legendLink": null, "step": 10 } @@ -2309,7 +2557,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_goroutines{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_goroutines{job=\"$job\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -2385,7 +2633,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_gc_duration_seconds{namespace=\"$namespace\",job=~\"$job\"}", + "expr": "go_gc_duration_seconds{job=\"$job\"}", 
"format": "time_series", "intervalFactor": 2, "legendFormat": "{{quantile}} {{instance}}", @@ -2447,8 +2695,8 @@ "list": [ { "current": { - "text": "Prometheus", - "value": "Prometheus" + "text": "default", + "value": "default" }, "hide": 0, "label": null, @@ -2460,27 +2708,22 @@ "type": "datasource" }, { - "allValue": null, - "current": { }, - "datasource": "$datasource", + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ ], - "query": "label_values(thanos_status{}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" }, { - "allValue": "thanos-store.*", + "allValue": null, "current": { "text": "all", "value": "$__all" @@ -2492,7 +2735,7 @@ "multi": false, "name": "job", "options": [ ], - "query": "label_values(up{namespace=\"$namespace\", job=~\"thanos-store.*\"}, job)", + "query": "label_values(up{job=~\"thanos-store.*\"}, job)", "refresh": 1, "regex": "", "sort": 2, @@ -2501,21 +2744,6 @@ "tagsQuery": "", "type": "query", "useTags": false - }, - { - "auto": true, - "auto_count": 300, - "auto_min": "10s", - "current": { - "text": "5m", - "value": "5m" - }, - "hide": 0, - "label": "interval", - "name": "interval", - "query": "5m,10m,30m,1h,6h,12h", - "refresh": 2, - "type": "interval" } ] }, diff --git a/mixin/README.md b/mixin/README.md index 0720caf241..8bc4e30d0b 100644 --- a/mixin/README.md +++ b/mixin/README.md @@ -59,48 +59,71 @@ This project is intended to be used as a library. You can extend and customize d [embedmd]:# (config.libsonnet) ```libsonnet { + local thanos = self, + // TargetGroups is a way to help mixin users to add high level target grouping to their alerts and dashboards. + // With the help of TargetGroups you can use a single observability stack to monitor several Thanos instances. + // The key in the key-value pair will be used as "label name" in the alerts and variable name in the dashboards. + // The value in the key-value pair will be used as a query to fetch available values for the given label name. 
+ targetGroups+:: {
+ // For example, given the following groups,
+ // namespace: 'thanos_status',
+ // cluster: 'find_mi_cluster_bitte',
+ // zone: 'an_i_in_da_zone',
+ // region: 'losing_my_region',
+ // will generate queries for the alerts as follows:
+ // (
+ //  sum by (cluster, namespace, region, zone, job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
+ // /
+ //  sum by (cluster, namespace, region, zone, job) (rate(thanos_compact_group_compactions_total{job=~"thanos-compact.*"}[5m]))
+ // * 100 > 5
+ // )
+ //
+ // AND for the dashboards:
+ //
+ // sum by (cluster, namespace, region, zone, job) (rate(thanos_compact_group_compactions_failures_total{cluster=\"$cluster\", namespace=\"$namespace\", region=\"$region\", zone=\"$zone\", job=\"$job\"}[$interval]))
+ // /
+ // sum by (cluster, namespace, region, zone, job) (rate(thanos_compact_group_compactions_total{cluster=\"$cluster\", namespace=\"$namespace\", region=\"$region\", zone=\"$zone\", job=\"$job\"}[$interval]))
+ },
 query+:: {
- jobPrefix: 'thanos-query',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-query.*"',
 title: '%(prefix)sQuery' % $.dashboard.prefix,
 },
 store+:: {
- jobPrefix: 'thanos-store',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-store.*"',
 title: '%(prefix)sStore' % $.dashboard.prefix,
 },
 receive+:: {
- jobPrefix: 'thanos-receive',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-receive.*"',
 title: '%(prefix)sReceive' % $.dashboard.prefix,
 },
 rule+:: {
- jobPrefix: 'thanos-rule',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-rule.*"',
 title: '%(prefix)sRule' % $.dashboard.prefix,
 },
 compact+:: {
- jobPrefix: 'thanos-compact',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-compact.*"',
 title: '%(prefix)sCompact' % $.dashboard.prefix,
 },
 sidecar+:: {
- jobPrefix: 'thanos-sidecar',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-sidecar.*"',
 title: '%(prefix)sSidecar' % $.dashboard.prefix,
 },
+ // TODO(kakkoyun): Fix naming convention: bucketReplicate
 bucket_replicate+:: {
- jobPrefix: 'thanos-bucket-replicate',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-bucket-replicate.*"',
 title: '%(prefix)sBucketReplicate' % $.dashboard.prefix,
 },
- overview+:: {
- title: '%(prefix)sOverview' % $.dashboard.prefix,
- },
 dashboard+:: {
 prefix: 'Thanos / ',
 tags: ['thanos-mixin'],
- namespaceQuery: 'thanos_status',
+ selector: ['%s="$%s"' % [level, level] for level in std.objectFields(thanos.targetGroups)],
+ dimensions: ['%s' % level for level in std.objectFields(thanos.targetGroups)],
+
+ overview+:: {
+ title: '%(prefix)sOverview' % $.dashboard.prefix,
+ selector: std.join(', ', thanos.dashboard.selector),
+ dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']),
+ },
 },
 }
```
diff --git a/mixin/alerts/absent.libsonnet b/mixin/alerts/absent.libsonnet
index 1786a53ce4..9a37b69ed0 100644
--- a/mixin/alerts/absent.libsonnet
+++ b/mixin/alerts/absent.libsonnet
@@ -1,5 +1,7 @@
 local capitalize(str) = std.asciiUpper(std.substr(str, 0, 1)) + std.asciiLower(std.substr(str, 1, std.length(str)));
 local titlize(str) = std.join('', std.map(capitalize, std.split(str, '_')));
+
+local components = ['query', 'receive', 'rule', 'compact', 'store', 'bucket_replicate', 'sidecar'];

 {
 local thanos = self,
@@ -7,10 +9,11 @@ local titlize(str) = std.join('', std.map(capitalize, std.split(str, '_')));
 jobs:: {
 ['Thanos%s' %
titlize(component)]: thanos[component].selector
 for component in std.objectFieldsAll(thanos)
- if component != 'jobs' && std.type(thanos[component]) == 'object' && std.objectHas(thanos[component], 'selector')
+ if component != 'jobs' && std.type(thanos[component]) == 'object' && std.member(components, component)
 },
 prometheusAlerts+:: {
+ local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' from ' + std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else '',
 groups+: [
 {
 name: 'thanos-component-absent',
@@ -25,8 +28,8 @@
 severity: 'critical',
 },
 annotations: {
- description: '%s has disappeared from Prometheus target discovery.' % name,
- summary: 'thanos component has disappeared from Prometheus target discovery.',
+ description: '%s has disappeared%s. Prometheus target for the component cannot be discovered.' % [name, location],
+ summary: 'Thanos component has disappeared%s.' % location,
 },
 }
 for name in std.objectFields(thanos.jobs)
diff --git a/mixin/alerts/bucket_replicate.libsonnet b/mixin/alerts/bucket_replicate.libsonnet
index b4bd6958f5..551dcb3ba7 100644
--- a/mixin/alerts/bucket_replicate.libsonnet
+++ b/mixin/alerts/bucket_replicate.libsonnet
@@ -4,37 +4,25 @@
 selector: error 'must provide selector for Thanos Bucket Replicate dashboard',
 errorThreshold: 10,
 p99LatencyThreshold: 20,
+ dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']),
 },
 prometheusAlerts+:: {
 groups+: if thanos.bucket_replicate == null then [] else [
+ local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' in ' + std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else ' ';
 {
 name: 'thanos-bucket-replicate',
 rules: [
- {
- alert: 'ThanosBucketReplicateIsDown',
- expr: |||
- absent(up{%(selector)s})
- ||| % thanos.bucket_replicate,
- 'for': '5m',
- labels: {
- severity: 'critical',
- },
- annotations: {
- description: 'Thanos Replicate has disappeared from Prometheus target discovery.',
- summary: 'Thanos Replicate has disappeared from Prometheus target discovery.',
- },
- },
 {
 alert: 'ThanosBucketReplicateErrorRate',
 annotations: {
- description: 'Thanos Replicate failing to run, {{ $value | humanize }}% of attempts failed.',
- summary: 'Thanose Replicate is failing to run.',
+ description: 'Thanos Replicate is failing to run%s, {{$value | humanize}}%% of attempts failed.' % location,
+ summary: 'Thanos Replicate is failing to run%s.' % location,
 },
 expr: |||
 (
- sum(rate(thanos_replicate_replication_runs_total{result="error", %(selector)s}[5m]))
- / on (namespace) group_left
- sum(rate(thanos_replicate_replication_runs_total{%(selector)s}[5m]))
+ sum by (%(dimensions)s) (rate(thanos_replicate_replication_runs_total{result="error", %(selector)s}[5m]))
+ / on (%(dimensions)s) group_left
+ sum by (%(dimensions)s) (rate(thanos_replicate_replication_runs_total{%(selector)s}[5m]))
 ) * 100 >= %(errorThreshold)s
 ||| % thanos.bucket_replicate,
 'for': '5m',
@@ -45,14 +33,14 @@
 {
 alert: 'ThanosBucketReplicateRunLatency',
 annotations: {
- description: 'Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.',
+ description: 'Thanos Replicate {{$labels.job}}%shas a 99th percentile latency of {{$value}} seconds for the replicate operations.'
% location, summary: 'Thanos Replicate has a high latency for replicate operations.', }, expr: ||| ( - histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m]))) > %(p99LatencyThreshold)s + histogram_quantile(0.99, sum by (%(dimensions)s) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m]))) > %(p99LatencyThreshold)s and - sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m])) > 0 + sum by (%(dimensions)s) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m])) > 0 ) ||| % thanos.bucket_replicate, 'for': '5m', diff --git a/mixin/alerts/compact.libsonnet b/mixin/alerts/compact.libsonnet index 75bea25b49..fa8b60b33a 100644 --- a/mixin/alerts/compact.libsonnet +++ b/mixin/alerts/compact.libsonnet @@ -4,19 +4,21 @@ selector: error 'must provide selector for Thanos Compact alerts', compactionErrorThreshold: 5, bucketOpsErrorThreshold: 5, + dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']), }, prometheusAlerts+:: { groups+: if thanos.compact == null then [] else [ + local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' in ' + std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else ' '; { name: 'thanos-compact', rules: [ { alert: 'ThanosCompactMultipleRunning', annotations: { - description: 'No more than one Thanos Compact instance should be running at once. There are {{ $value }}', + description: 'No more than one Thanos Compact instance should be running at once. There are {{$value}}%s' % location, summary: 'Thanos Compact has multiple instances running.', }, - expr: 'sum(up{%(selector)s}) > 1' % thanos.compact, + expr: 'sum by (%(dimensions)s) (up{%(selector)s}) > 1' % thanos.compact, 'for': '5m', labels: { severity: 'warning', @@ -25,7 +27,7 @@ { alert: 'ThanosCompactHalted', annotations: { - description: 'Thanos Compact {{$labels.job}} has failed to run and now is halted.', + description: 'Thanos Compact {{$labels.job}} has failed to run%s and now is halted.' % location, summary: 'Thanos Compact has failed to run ans is now halted.', }, expr: 'thanos_compact_halted{%(selector)s} == 1' % thanos.compact, @@ -37,14 +39,14 @@ { alert: 'ThanosCompactHighCompactionFailures', annotations: { - description: 'Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.', + description: 'Thanos Compact {{$labels.job}}%s, is failing to execute {{$value | humanize}}%% of compactions.' % location, summary: 'Thanos Compact is failing to execute compactions.', }, expr: ||| ( - sum by (job) (rate(thanos_compact_group_compactions_failures_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_compact_group_compactions_failures_total{%(selector)s}[5m])) / - sum by (job) (rate(thanos_compact_group_compactions_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_compact_group_compactions_total{%(selector)s}[5m])) * 100 > %(compactionErrorThreshold)s ) ||| % thanos.compact, @@ -56,14 +58,14 @@ { alert: 'ThanosCompactBucketHighOperationFailures', annotations: { - description: 'Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.', + description: 'Thanos Compact {{$labels.job}}%s, Bucket is failing to execute {{$value | humanize}}%% of operations.' 
% location, summary: 'Thanos Compact Bucket is having a high number of operation failures.', }, expr: ||| ( - sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m])) / - sum by (job) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m])) * 100 > %(bucketOpsErrorThreshold)s ) ||| % thanos.compact, @@ -75,10 +77,10 @@ { alert: 'ThanosCompactHasNotRun', annotations: { - description: 'Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.', + description: 'Thanos Compact {{$labels.job}}%s has not uploaded anything for 24 hours.' % location, summary: 'Thanos Compact has not uploaded anything for last 24 hours.', }, - expr: '(time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{%(selector)s}[24h]))) / 60 / 60 > 24' % thanos.compact, + expr: '(time() - max by (%(dimensions)s) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{%(selector)s}[24h]))) / 60 / 60 > 24' % thanos.compact, labels: { severity: 'warning', }, diff --git a/mixin/alerts/query.libsonnet b/mixin/alerts/query.libsonnet index f765c59c65..f1fb95b7ca 100644 --- a/mixin/alerts/query.libsonnet +++ b/mixin/alerts/query.libsonnet @@ -7,23 +7,25 @@ dnsErrorThreshold: 1, p99QueryLatencyThreshold: 40, p99QueryRangeLatencyThreshold: 90, + dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']), }, prometheusAlerts+:: { groups+: if thanos.query == null then [] else [ + local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' in ' + std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else ' '; { name: 'thanos-query', rules: [ { alert: 'ThanosQueryHttpRequestQueryErrorRateHigh', annotations: { - description: 'Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests.', + description: 'Thanos Query {{$labels.job}}%sis failing to handle {{$value | humanize}}%% of "query" requests.' % location, summary: 'Thanos Query is failing to handle requests.', }, expr: ||| ( - sum(rate(http_requests_total{code=~"5..", %(selector)s, handler="query"}[5m])) + sum by (%(dimensions)s) (rate(http_requests_total{code=~"5..", %(selector)s, handler="query"}[5m])) / - sum(rate(http_requests_total{%(selector)s, handler="query"}[5m])) + sum by (%(dimensions)s) (rate(http_requests_total{%(selector)s, handler="query"}[5m])) ) * 100 > %(httpErrorThreshold)s ||| % thanos.query, 'for': '5m', @@ -34,14 +36,14 @@ { alert: 'ThanosQueryHttpRequestQueryRangeErrorRateHigh', annotations: { - description: 'Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests.', + description: 'Thanos Query {{$labels.job}}%sis failing to handle {{$value | humanize}}%% of "query_range" requests.' 
% location, summary: 'Thanos Query is failing to handle requests.', }, expr: ||| ( - sum(rate(http_requests_total{code=~"5..", %(selector)s, handler="query_range"}[5m])) + sum by (%(dimensions)s) (rate(http_requests_total{code=~"5..", %(selector)s, handler="query_range"}[5m])) / - sum(rate(http_requests_total{%(selector)s, handler="query_range"}[5m])) + sum by (%(dimensions)s) (rate(http_requests_total{%(selector)s, handler="query_range"}[5m])) ) * 100 > %(httpErrorThreshold)s ||| % thanos.query, 'for': '5m', @@ -52,14 +54,14 @@ { alert: 'ThanosQueryGrpcServerErrorRate', annotations: { - description: 'Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + description: 'Thanos Query {{$labels.job}}%sis failing to handle {{$value | humanize}}%% of requests.' % location, summary: 'Thanos Query is failing to handle requests.', }, expr: ||| ( - sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m])) + sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m])) / - sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s}[5m])) * 100 > %(grpcErrorThreshold)s ) ||| % thanos.query, @@ -71,14 +73,14 @@ { alert: 'ThanosQueryGrpcClientErrorRate', annotations: { - description: 'Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests.', + description: 'Thanos Query {{$labels.job}}%sis failing to send {{$value | humanize}}%% of requests.' % location, summary: 'Thanos Query is failing to send requests.', }, expr: ||| ( - sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", %(selector)s}[5m])) + sum by (%(dimensions)s) (rate(grpc_client_handled_total{grpc_code!="OK", %(selector)s}[5m])) / - sum by (job) (rate(grpc_client_started_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(grpc_client_started_total{%(selector)s}[5m])) ) * 100 > %(grpcErrorThreshold)s ||| % thanos.query, 'for': '5m', @@ -89,14 +91,14 @@ { alert: 'ThanosQueryHighDNSFailures', annotations: { - description: 'Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints.', + description: 'Thanos Query {{$labels.job}}%shave {{$value | humanize}}%% of failing DNS queries for store endpoints.' % location, summary: 'Thanos Query is having high number of DNS failures.', }, expr: ||| ( - sum by (job) (rate(thanos_query_store_apis_dns_failures_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_query_store_apis_dns_failures_total{%(selector)s}[5m])) / - sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_query_store_apis_dns_lookups_total{%(selector)s}[5m])) ) * 100 > %(dnsErrorThreshold)s ||| % thanos.query, 'for': '15m', @@ -107,14 +109,14 @@ { alert: 'ThanosQueryInstantLatencyHigh', annotations: { - description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.', + description: 'Thanos Query {{$labels.job}}%shas a 99th percentile latency of {{$value}} seconds for instant queries.' 
% location, summary: 'Thanos Query has high latency for queries.', }, expr: ||| ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m]))) > %(p99QueryLatencyThreshold)s + histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m]))) > %(p99QueryLatencyThreshold)s and - sum by (job) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m])) > 0 + sum by (%(dimensions)s) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m])) > 0 ) ||| % thanos.query, 'for': '10m', @@ -125,14 +127,14 @@ { alert: 'ThanosQueryRangeLatencyHigh', annotations: { - description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.', + description: 'Thanos Query {{$labels.job}}%shas a 99th percentile latency of {{$value}} seconds for range queries.' % location, summary: 'Thanos Query has high latency for queries.', }, expr: ||| ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query_range"}[5m]))) > %(p99QueryRangeLatencyThreshold)s + histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query_range"}[5m]))) > %(p99QueryRangeLatencyThreshold)s and - sum by (job) (rate(http_request_duration_seconds_count{%(selector)s, handler="query_range"}[5m])) > 0 + sum by (%(dimensions)s) (rate(http_request_duration_seconds_count{%(selector)s, handler="query_range"}[5m])) > 0 ) ||| % thanos.query, 'for': '10m', diff --git a/mixin/alerts/receive.libsonnet b/mixin/alerts/receive.libsonnet index 034bb26e46..4e6006edf7 100644 --- a/mixin/alerts/receive.libsonnet +++ b/mixin/alerts/receive.libsonnet @@ -6,23 +6,25 @@ forwardErrorThreshold: 20, refreshErrorThreshold: 0, p99LatencyThreshold: 10, + dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']), }, prometheusAlerts+:: { groups+: if thanos.receive == null then [] else [ + local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' in ' + std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else ' '; { name: 'thanos-receive', rules: [ { alert: 'ThanosReceiveHttpRequestErrorRateHigh', annotations: { - description: 'Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + description: 'Thanos Receive {{$labels.job}}%sis failing to handle {{$value | humanize}}%% of requests.' % location, summary: 'Thanos Receive is failing to handle requests.', }, expr: ||| ( - sum(rate(http_requests_total{code=~"5..", %(selector)s, handler="receive"}[5m])) + sum by (%(dimensions)s) (rate(http_requests_total{code=~"5..", %(selector)s, handler="receive"}[5m])) / - sum(rate(http_requests_total{%(selector)s, handler="receive"}[5m])) + sum by (%(dimensions)s) (rate(http_requests_total{%(selector)s, handler="receive"}[5m])) ) * 100 > %(httpErrorThreshold)s ||| % thanos.receive, 'for': '5m', @@ -33,14 +35,14 @@ { alert: 'ThanosReceiveHttpRequestLatencyHigh', annotations: { - description: 'Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.', + description: 'Thanos Receive {{$labels.job}}%shas a 99th percentile latency of {{ $value }} seconds for requests.' 
% location, summary: 'Thanos Receive has high HTTP requests latency.', }, expr: ||| ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="receive"}[5m]))) > %(p99LatencyThreshold)s + histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="receive"}[5m]))) > %(p99LatencyThreshold)s and - sum by (job) (rate(http_request_duration_seconds_count{%(selector)s, handler="receive"}[5m])) > 0 + sum by (%(dimensions)s) (rate(http_request_duration_seconds_count{%(selector)s, handler="receive"}[5m])) > 0 ) ||| % thanos.receive, 'for': '10m', @@ -51,7 +53,7 @@ { alert: 'ThanosReceiveHighReplicationFailures', annotations: { - description: 'Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize }}% of requests.', + description: 'Thanos Receive {{$labels.job}}%sis failing to replicate {{$value | humanize}}%% of requests.' % location, summary: 'Thanos Receive is having high number of replication failures.', }, expr: ||| @@ -59,15 +61,15 @@ and ( ( - sum by (job) (rate(thanos_receive_replications_total{result="error", %(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_receive_replications_total{result="error", %(selector)s}[5m])) / - sum by (job) (rate(thanos_receive_replications_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_receive_replications_total{%(selector)s}[5m])) ) > ( - max by (job) (floor((thanos_receive_replication_factor{%(selector)s}+1) / 2)) + max by (%(dimensions)s) (floor((thanos_receive_replication_factor{%(selector)s}+1) / 2)) / - max by (job) (thanos_receive_hashring_nodes{%(selector)s}) + max by (%(dimensions)s) (thanos_receive_hashring_nodes{%(selector)s}) ) ) * 100 ||| % thanos.receive, @@ -79,14 +81,14 @@ { alert: 'ThanosReceiveHighForwardRequestFailures', annotations: { - description: 'Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests.', + description: 'Thanos Receive {{$labels.job}}%sis failing to forward {{$value | humanize}}%% of requests.' % location, summary: 'Thanos Receive is failing to forward requests.', }, expr: ||| ( - sum by (job) (rate(thanos_receive_forward_requests_total{result="error", %(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_receive_forward_requests_total{result="error", %(selector)s}[5m])) / - sum by (job) (rate(thanos_receive_forward_requests_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_receive_forward_requests_total{%(selector)s}[5m])) ) * 100 > %(forwardErrorThreshold)s ||| % thanos.receive, 'for': '5m', @@ -97,14 +99,14 @@ { alert: 'ThanosReceiveHighHashringFileRefreshFailures', annotations: { - description: 'Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed.', + description: 'Thanos Receive {{$labels.job}}%sis failing to refresh hashring file, {{$value | humanize}} of attempts failed.' 
% location, summary: 'Thanos Receive is failing to refresh hasring file.', }, expr: ||| ( - sum by (job) (rate(thanos_receive_hashrings_file_errors_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_receive_hashrings_file_errors_total{%(selector)s}[5m])) / - sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_receive_hashrings_file_refreshes_total{%(selector)s}[5m])) > %(refreshErrorThreshold)s ) ||| % thanos.receive, @@ -116,10 +118,10 @@ { alert: 'ThanosReceiveConfigReloadFailure', annotations: { - description: 'Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.', + description: 'Thanos Receive {{$labels.job}}%shas not been able to reload hashring configurations.' % location, summary: 'Thanos Receive has not been able to reload configuration.', }, - expr: 'avg(thanos_receive_config_last_reload_successful{%(selector)s}) by (job) != 1' % thanos.receive, + expr: 'avg by (%(dimensions)s) (thanos_receive_config_last_reload_successful{%(selector)s}) != 1' % thanos.receive, 'for': '5m', labels: { severity: 'warning', @@ -128,13 +130,13 @@ { alert: 'ThanosReceiveNoUpload', annotations: { - description: 'Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded latest data to object storage.', + description: 'Thanos Receive {{$labels.instance}}%shas not uploaded latest data to object storage.' % location, summary: 'Thanos Receive has not uploaded latest data to object storage.', }, expr: ||| (up{%(selector)s} - 1) - + on (instance) # filters to only alert on current instance last 3h - (sum by (instance) (increase(thanos_shipper_uploads_total{%(selector)s}[3h])) == 0) + + on (%(dimensions)s, instance) # filters to only alert on current instance last 3h + (sum by (%(dimensions)s, instance) (increase(thanos_shipper_uploads_total{%(selector)s}[3h])) == 0) ||| % thanos.receive, 'for': '3h', labels: { diff --git a/mixin/alerts/rule.libsonnet b/mixin/alerts/rule.libsonnet index d019a40199..0a88832755 100644 --- a/mixin/alerts/rule.libsonnet +++ b/mixin/alerts/rule.libsonnet @@ -6,20 +6,22 @@ rulerDnsErrorThreshold: 1, alertManagerDnsErrorThreshold: 1, evalErrorThreshold: 5, + dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job', 'instance']), }, prometheusAlerts+:: { groups+: if thanos.rule == null then [] else [ + local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' in ' + std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else ' '; { name: 'thanos-rule', rules: [ { alert: 'ThanosRuleQueueIsDroppingAlerts', annotations: { - description: 'Thanos Rule {{$labels.job}} is failing to queue alerts.', + description: 'Thanos Rule {{$labels.instance}}%sis failing to queue alerts.' % location, summary: 'Thanos Rule is failing to queue alerts.', }, expr: ||| - sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{%(selector)s}[5m])) > 0 + sum by (%(dimensions)s) (rate(thanos_alert_queue_alerts_dropped_total{%(selector)s}[5m])) > 0 ||| % thanos.rule, 'for': '5m', labels: { @@ -29,11 +31,11 @@ { alert: 'ThanosRuleSenderIsFailingAlerts', annotations: { - description: 'Thanos Rule {{$labels.job}} is failing to send alerts to alertmanager.', + description: 'Thanos Rule {{$labels.instance}}%sis failing to send alerts to alertmanager.' 
% location, summary: 'Thanos Rule is failing to send alerts to alertmanager.', }, expr: ||| - sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{%(selector)s}[5m])) > 0 + sum by (%(dimensions)s) (rate(thanos_alert_sender_alerts_dropped_total{%(selector)s}[5m])) > 0 ||| % thanos.rule, 'for': '5m', labels: { @@ -43,14 +45,14 @@ { alert: 'ThanosRuleHighRuleEvaluationFailures', annotations: { - description: 'Thanos Rule {{$labels.job}} is failing to evaluate rules.', + description: 'Thanos Rule {{$labels.instance}}%sis failing to evaluate rules.' % location, summary: 'Thanos Rule is failing to evaluate rules.', }, expr: ||| ( - sum by (job) (rate(prometheus_rule_evaluation_failures_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(prometheus_rule_evaluation_failures_total{%(selector)s}[5m])) / - sum by (job) (rate(prometheus_rule_evaluations_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(prometheus_rule_evaluations_total{%(selector)s}[5m])) * 100 > %(evalErrorThreshold)s ) ||| % thanos.rule, @@ -63,11 +65,11 @@ { alert: 'ThanosRuleHighRuleEvaluationWarnings', annotations: { - description: 'Thanos Rule {{$labels.job}} has high number of evaluation warnings.', + description: 'Thanos Rule {{$labels.instance}}%shas high number of evaluation warnings.' % location, summary: 'Thanos Rule has high number of evaluation warnings.', }, expr: ||| - sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{%(selector)s}[5m])) > 0 + sum by (%(dimensions)s) (rate(thanos_rule_evaluation_with_warnings_total{%(selector)s}[5m])) > 0 ||| % thanos.rule, 'for': '15m', @@ -78,14 +80,14 @@ { alert: 'ThanosRuleRuleEvaluationLatencyHigh', annotations: { - description: 'Thanos Rule {{$labels.job}}/{{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.', + description: 'Thanos Rule {{$labels.instance}}%shas higher evaluation latency than interval for {{$labels.rule_group}}.' % location, summary: 'Thanos Rule has high rule evaluation latency.', }, expr: ||| ( - sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{%(selector)s}) + sum by (%(dimensions)s, rule_group) (prometheus_rule_group_last_duration_seconds{%(selector)s}) > - sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s}) + sum by (%(dimensions)s, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s}) ) ||| % thanos.rule, 'for': '5m', @@ -96,14 +98,14 @@ { alert: 'ThanosRuleGrpcErrorRate', annotations: { - description: 'Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + description: 'Thanos Rule {{$labels.job}}%sis failing to handle {{$value | humanize}}%% of requests.' 
% location, summary: 'Thanos Rule is failing to handle grpc requests.', }, expr: ||| ( - sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m])) + sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m])) / - sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s}[5m])) * 100 > %(grpcErrorThreshold)s ) ||| % thanos.rule, @@ -115,10 +117,10 @@ { alert: 'ThanosRuleConfigReloadFailure', annotations: { - description: 'Thanos Rule {{$labels.job}} has not been able to reload its configuration.', + description: 'Thanos Rule {{$labels.job}}%shas not been able to reload its configuration.' % location, summary: 'Thanos Rule has not been able to reload configuration.', }, - expr: 'avg(thanos_rule_config_last_reload_successful{%(selector)s}) by (job) != 1' % thanos.rule, + expr: 'avg by (%(dimensions)s) (thanos_rule_config_last_reload_successful{%(selector)s}) != 1' % thanos.rule, 'for': '5m', labels: { severity: 'info', @@ -127,14 +129,14 @@ { alert: 'ThanosRuleQueryHighDNSFailures', annotations: { - description: 'Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints.', + description: 'Thanos Rule {{$labels.job}}%shas {{$value | humanize}}%% of failing DNS queries for query endpoints.' % location, summary: 'Thanos Rule is having high number of DNS failures.', }, expr: ||| ( - sum by (job) (rate(thanos_rule_query_apis_dns_failures_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_rule_query_apis_dns_failures_total{%(selector)s}[5m])) / - sum by (job) (rate(thanos_rule_query_apis_dns_lookups_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_rule_query_apis_dns_lookups_total{%(selector)s}[5m])) * 100 > %(rulerDnsErrorThreshold)s ) ||| % thanos.rule, @@ -146,14 +148,14 @@ { alert: 'ThanosRuleAlertmanagerHighDNSFailures', annotations: { - description: 'Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.', + description: 'Thanos Rule {{$labels.instance}}%shas {{$value | humanize}}%% of failing DNS queries for Alertmanager endpoints.' % location, summary: 'Thanos Rule is having high number of DNS failures.', }, expr: ||| ( - sum by (job) (rate(thanos_rule_alertmanagers_dns_failures_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_rule_alertmanagers_dns_failures_total{%(selector)s}[5m])) / - sum by (job) (rate(thanos_rule_alertmanagers_dns_lookups_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_rule_alertmanagers_dns_lookups_total{%(selector)s}[5m])) * 100 > %(alertManagerDnsErrorThreshold)s ) ||| % thanos.rule, @@ -166,13 +168,13 @@ // NOTE: This alert will give false positive if no rules are configured. alert: 'ThanosRuleNoEvaluationFor10Intervals', annotations: { - description: 'Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.', + description: 'Thanos Rule {{$labels.job}}%shas {{$value | humanize}}%% rule groups that did not evaluate for at least 10x of their expected interval.' 
% location, summary: 'Thanos Rule has rule groups that did not evaluate for 10 intervals.', }, expr: ||| - time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{%(selector)s}) + time() - max by (%(dimensions)s, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{%(selector)s}) > - 10 * max by (job, group) (prometheus_rule_group_interval_seconds{%(selector)s}) + 10 * max by (%(dimensions)s, group) (prometheus_rule_group_interval_seconds{%(selector)s}) ||| % thanos.rule, 'for': '5m', labels: { @@ -183,13 +185,13 @@ { alert: 'ThanosNoRuleEvaluations', annotations: { - description: 'Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes.', + description: 'Thanos Rule {{$labels.instance}}%sdid not perform any rule evaluations in the past 2 minutes.' % location, summary: 'Thanos Rule did not perform any rule evaluations.', }, expr: ||| - sum(rate(prometheus_rule_evaluations_total{%(selector)s}[2m])) <= 0 + sum by (%(dimensions)s) (rate(prometheus_rule_evaluations_total{%(selector)s}[2m])) <= 0 and - sum(thanos_rule_loaded_rules{%(selector)s}) > 0 + sum by (%(dimensions)s) (thanos_rule_loaded_rules{%(selector)s}) > 0 ||| % thanos.rule, 'for': '3m', labels: { diff --git a/mixin/alerts/sidecar.libsonnet b/mixin/alerts/sidecar.libsonnet index ee09c9aeb8..f46ba181f1 100644 --- a/mixin/alerts/sidecar.libsonnet +++ b/mixin/alerts/sidecar.libsonnet @@ -2,20 +2,22 @@ local thanos = self, sidecar+:: { selector: error 'must provide selector for Thanos Sidecar alerts', + dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job', 'instance']), }, prometheusAlerts+:: { groups+: if thanos.sidecar == null then [] else [ + local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' in ' + std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else ' '; { name: 'thanos-sidecar', rules: [ { alert: 'ThanosSidecarPrometheusDown', annotations: { - description: 'Thanos Sidecar {{$labels.job}} {{$labels.instance}} cannot connect to Prometheus.', + description: 'Thanos Sidecar {{$labels.instance}}%scannot connect to Prometheus.' % location, summary: 'Thanos Sidecar cannot connect to Prometheus', }, expr: ||| - sum by (job, instance) (thanos_sidecar_prometheus_up{%(selector)s} == 0) + thanos_sidecar_prometheus_up{%(selector)s} == 0 ||| % thanos.sidecar, 'for': '5m', labels: { @@ -25,11 +27,11 @@ { alert: 'ThanosSidecarBucketOperationsFailed', annotations: { - description: 'Thanos Sidecar {{$labels.job}} {{$labels.instance}} bucket operations are failing', + description: 'Thanos Sidecar {{$labels.instance}}%sbucket operations are failing' % location, summary: 'Thanos Sidecar bucket operations are failing', }, expr: ||| - rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m]) > 0 + sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m])) > 0 ||| % thanos.sidecar, 'for': '5m', labels: { @@ -39,11 +41,11 @@ { alert: 'ThanosSidecarUnhealthy', annotations: { - description: 'Thanos Sidecar {{$labels.job}} {{$labels.instance}} is unhealthy for more than {{$value}} seconds.', + description: 'Thanos Sidecar {{$labels.instance}}%sis unhealthy for {{$value}} seconds.' 
% location, summary: 'Thanos Sidecar is unhealthy.', }, expr: ||| - time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s})) >= 240 + time() - max by (%(dimensions)s) (thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) >= 600 ||| % thanos.sidecar, labels: { severity: 'critical', diff --git a/mixin/alerts/store.libsonnet b/mixin/alerts/store.libsonnet index 8fb25e9cb3..7ab8279da6 100644 --- a/mixin/alerts/store.libsonnet +++ b/mixin/alerts/store.libsonnet @@ -7,23 +7,25 @@ seriesGateErrorThreshold: 2, bucketOpsErrorThreshold: 5, bucketOpsP99LatencyThreshold: 2, + dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']), }, prometheusAlerts+:: { groups+: if thanos.store == null then [] else [ + local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' in ' + std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else ' '; { name: 'thanos-store', rules: [ { alert: 'ThanosStoreGrpcErrorRate', annotations: { - description: 'Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + description: 'Thanos Store {{$labels.job}}%sis failing to handle {{$value | humanize}}%% of requests.' % location, summary: 'Thanos Store is failing to handle qrpcd requests.', }, expr: ||| ( - sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m])) + sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m])) / - sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s}[5m])) * 100 > %(grpcErrorThreshold)s ) ||| % thanos.store, @@ -35,14 +37,14 @@ { alert: 'ThanosStoreSeriesGateLatencyHigh', annotations: { - description: 'Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.', + description: 'Thanos Store {{$labels.job}}%shas a 99th percentile latency of {{$value}} seconds for store series gate requests.' % location, summary: 'Thanos Store has high latency for store series gate requests.', }, expr: ||| ( - histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{%(selector)s}[5m]))) > %(seriesGateErrorThreshold)s + histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{%(selector)s}[5m]))) > %(seriesGateErrorThreshold)s and - sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{%(selector)s}[5m])) > 0 + sum by (%(dimensions)s) (rate(thanos_bucket_store_series_gate_duration_seconds_count{%(selector)s}[5m])) > 0 ) ||| % thanos.store, 'for': '10m', @@ -53,14 +55,14 @@ { alert: 'ThanosStoreBucketHighOperationFailures', annotations: { - description: 'Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.', + description: 'Thanos Store {{$labels.job}}%sBucket is failing to execute {{$value | humanize}}%% of operations.' 
% location, summary: 'Thanos Store Bucket is failing to execute operations.', }, expr: ||| ( - sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m])) / - sum by (job) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m])) * 100 > %(bucketOpsErrorThreshold)s ) ||| % thanos.store, @@ -72,14 +74,14 @@ { alert: 'ThanosStoreObjstoreOperationLatencyHigh', annotations: { - description: 'Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.', + description: 'Thanos Store {{$labels.job}}%sBucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.' % location, summary: 'Thanos Store is having high latency for bucket operations.', }, expr: ||| ( - histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{%(selector)s}[5m]))) > %(bucketOpsP99LatencyThreshold)s + histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{%(selector)s}[5m]))) > %(bucketOpsP99LatencyThreshold)s and - sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{%(selector)s}[5m])) > 0 + sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operation_duration_seconds_count{%(selector)s}[5m])) > 0 ) ||| % thanos.store, 'for': '10m', diff --git a/mixin/config.libsonnet b/mixin/config.libsonnet index d0123e6612..b30a3b3cdb 100644 --- a/mixin/config.libsonnet +++ b/mixin/config.libsonnet @@ -1,45 +1,68 @@ { + local thanos = self, + // TargetGroups is a way to help mixin users to add high level target grouping to their alerts and dashboards. + // With the help of TargetGroups you can use a single observability stack to monitor several Thanos instances. + // The key in the key-value pair will be used as "label name" in the alerts and variable name in the dashboards. + // The value in the key-value pair will be used as a query to fetch available values for the given label name. 
+ targetGroups+:: {
+ // For example, given the following groups,
+ // namespace: 'thanos_status',
+ // cluster: 'find_mi_cluster_bitte',
+ // zone: 'an_i_in_da_zone',
+ // region: 'losing_my_region',
+ // will generate queries for the alerts as follows:
+ // (
+ //  sum by (cluster, namespace, region, zone, job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
+ // /
+ //  sum by (cluster, namespace, region, zone, job) (rate(thanos_compact_group_compactions_total{job=~"thanos-compact.*"}[5m]))
+ // * 100 > 5
+ // )
+ //
+ // AND for the dashboards:
+ //
+ // sum by (cluster, namespace, region, zone, job) (rate(thanos_compact_group_compactions_failures_total{cluster=\"$cluster\", namespace=\"$namespace\", region=\"$region\", zone=\"$zone\", job=\"$job\"}[$interval]))
+ // /
+ // sum by (cluster, namespace, region, zone, job) (rate(thanos_compact_group_compactions_total{cluster=\"$cluster\", namespace=\"$namespace\", region=\"$region\", zone=\"$zone\", job=\"$job\"}[$interval]))
+ },
 query+:: {
- jobPrefix: 'thanos-query',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-query.*"',
 title: '%(prefix)sQuery' % $.dashboard.prefix,
 },
 store+:: {
- jobPrefix: 'thanos-store',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-store.*"',
 title: '%(prefix)sStore' % $.dashboard.prefix,
 },
 receive+:: {
- jobPrefix: 'thanos-receive',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-receive.*"',
 title: '%(prefix)sReceive' % $.dashboard.prefix,
 },
 rule+:: {
- jobPrefix: 'thanos-rule',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-rule.*"',
 title: '%(prefix)sRule' % $.dashboard.prefix,
 },
 compact+:: {
- jobPrefix: 'thanos-compact',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-compact.*"',
 title: '%(prefix)sCompact' % $.dashboard.prefix,
 },
 sidecar+:: {
- jobPrefix: 'thanos-sidecar',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-sidecar.*"',
 title: '%(prefix)sSidecar' % $.dashboard.prefix,
 },
+ // TODO(kakkoyun): Fix naming convention: bucketReplicate
 bucket_replicate+:: {
- jobPrefix: 'thanos-bucket-replicate',
- selector: 'job=~"%s.*"' % self.jobPrefix,
+ selector: 'job=~"thanos-bucket-replicate.*"',
 title: '%(prefix)sBucketReplicate' % $.dashboard.prefix,
 },
- overview+:: {
- title: '%(prefix)sOverview' % $.dashboard.prefix,
- },
 dashboard+:: {
 prefix: 'Thanos / ',
 tags: ['thanos-mixin'],
- namespaceQuery: 'thanos_status',
+ selector: ['%s="$%s"' % [level, level] for level in std.objectFields(thanos.targetGroups)],
+ dimensions: ['%s' % level for level in std.objectFields(thanos.targetGroups)],
+
+ overview+:: {
+ title: '%(prefix)sOverview' % $.dashboard.prefix,
+ selector: std.join(', ', thanos.dashboard.selector),
+ dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']),
+ },
 },
 }
diff --git a/mixin/dashboards.jsonnet b/mixin/dashboards.jsonnet
index 94353a6c15..4e4c5e4e87 100644
--- a/mixin/dashboards.jsonnet
+++ b/mixin/dashboards.jsonnet
@@ -1,7 +1,6 @@
-local dashboards =
- (
- import 'mixin.libsonnet'
- ).grafanaDashboards;
+local dashboards = (
+ import 'mixin.libsonnet'
+).grafanaDashboards;
 {
 [name]: dashboards[name]
diff --git a/mixin/dashboards/bucket_replicate.libsonnet b/mixin/dashboards/bucket_replicate.libsonnet
index 882ef35d5a..82ae53380e 100644
--- a/mixin/dashboards/bucket_replicate.libsonnet
+++ b/mixin/dashboards/bucket_replicate.libsonnet
@@ -3,9 +3,12 @@ local g = import
'../lib/thanos-grafana-builder/builder.libsonnet'; { local thanos = self, bucket_replicate+:: { - jobPrefix: error 'must provide job prefix for Thanos Bucket Replicate dashboard', selector: error 'must provide selector for Thanos Bucket Replicate dashboard', title: error 'must provide title for Thanos Bucket Replicate dashboard', + dashboard:: { + selector: std.join(', ', thanos.dashboard.selector + ['job="$job"']), + dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']), + }, }, grafanaDashboards+:: { [if thanos.bucket_replicate != null then 'bucket_replicate.json']: @@ -15,14 +18,15 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Rate') + g.qpsErrTotalPanel( - 'thanos_replicate_replication_runs_total{result="error", namespace="$namespace",%(selector)s}' % thanos.bucket_replicate, - 'thanos_replicate_replication_runs_total{namespace="$namespace",%(selector)s}' % thanos.bucket_replicate, + 'thanos_replicate_replication_runs_total{result="error", %s}' % thanos.bucket_replicate.dashboard.selector, + 'thanos_replicate_replication_runs_total{%s}' % thanos.bucket_replicate.dashboard.selector, + thanos.rule.dashboard.dimensions ) ) .addPanel( g.panel('Errors', 'Shows rate of errors.') + g.queryPanel( - 'sum(rate(thanos_replicate_replication_runs_total{result="error", namespace="$namespace",%(selector)s}[$interval])) by (result)' % thanos.bucket_replicate, + 'sum by (%(dimensions)s, result) (rate(thanos_replicate_replication_runs_total{result="error", %(selector)s}[$interval]))' % thanos.bucket_replicate.dashboard, '{{result}}' ) + { yaxes: g.yaxes('percentunit') } + @@ -30,7 +34,11 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; ) .addPanel( g.panel('Duration', 'Shows how long has it taken to run a replication cycle.') + - g.latencyPanel('thanos_replicate_replication_run_duration_seconds', 'result="success", namespace="$namespace",%(selector)s' % thanos.bucket_replicate) + g.latencyPanel( + 'thanos_replicate_replication_run_duration_seconds', + 'result="success", %s' % thanos.bucket_replicate.dashboard.selector, + thanos.rule.dashboard.dimensions + ) ) ) .addRow( @@ -39,18 +47,15 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; g.panel('Metrics') + g.queryPanel( [ - 'sum(rate(blocks_meta_synced{state="loaded",namespace="$namespace",%(selector)s}[$interval]))' % thanos.bucket_replicate, - 'sum(rate(blocks_meta_synced{state="failed",namespace="$namespace",%(selector)s}[$interval]))' % thanos.bucket_replicate, - 'sum(rate(thanos_replicate_blocks_already_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.bucket_replicate, - 'sum(rate(thanos_replicate_blocks_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.bucket_replicate, - 'sum(rate(thanos_replicate_objects_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.bucket_replicate, + 'sum by (%(dimensions)s) (rate(blocks_meta_synced{state="loaded", %(selector)s}[$interval]))' % thanos.bucket_replicate.dashboard, + 'sum by (%(dimensions)s) (rate(blocks_meta_synced{state="failed", %(selector)s}[$interval]))' % thanos.bucket_replicate.dashboard, + 'sum by (%(dimensions)s) (rate(thanos_replicate_blocks_already_replicated_total{%(selector)s}[$interval]))' % thanos.bucket_replicate.dashboard, + 'sum by (%(dimensions)s) (rate(thanos_replicate_blocks_replicated_total{%(selector)s}[$interval]))' % thanos.bucket_replicate.dashboard, + 'sum by (%(dimensions)s) 
(rate(thanos_replicate_objects_replicated_total{%(selector)s}[$interval]))' % thanos.bucket_replicate.dashboard, ], ['meta loads', 'partial meta reads', 'already replicated blocks', 'replicated blocks', 'replicated objects'] ) ) - ) - + - g.template('namespace', thanos.dashboard.namespaceQuery) + - g.template('job', 'up', 'namespace="$namespace", %(selector)s' % thanos.bucket_replicate, true, '%(jobPrefix)s.*' % thanos.bucket_replicate), + ), }, } diff --git a/mixin/dashboards/compact.libsonnet b/mixin/dashboards/compact.libsonnet index 7439233182..b5db251613 100644 --- a/mixin/dashboards/compact.libsonnet +++ b/mixin/dashboards/compact.libsonnet @@ -1,11 +1,15 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; +local utils = import '../lib/utils.libsonnet'; { local thanos = self, compact+:: { - jobPrefix: error 'must provide job prefix for Thanos Compact dashboard', selector: error 'must provide selector for Thanos Compact dashboard', title: error 'must provide title for Thanos Compact dashboard', + dashboard:: { + selector: std.join(', ', thanos.dashboard.selector + ['job="$job"']), + dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']), + }, }, grafanaDashboards+:: { [if thanos.compact != null then 'compact.json']: @@ -18,7 +22,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; 'Shows rate of execution for compactions against blocks that are stored in the bucket by compaction group.' ) + g.queryPanel( - 'sum(rate(thanos_compact_group_compactions_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, group)', + 'sum by (%(dimensions)s, group) (rate(thanos_compact_group_compactions_total{%(selector)s}[$interval]))' % thanos.compact.dashboard, 'compaction {{job}} {{group}}' ) + g.stack @@ -29,8 +33,9 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; 'Shows ratio of errors compared to the total number of executed compactions against blocks that are stored in the bucket.' ) + g.qpsErrTotalPanel( - 'thanos_compact_group_compactions_failures_total{namespace="$namespace",job=~"$job"}', - 'thanos_compact_group_compactions_total{namespace="$namespace",job=~"$job"}', + 'thanos_compact_group_compactions_failures_total{%(selector)s}' % thanos.compact.dashboard.selector, + 'thanos_compact_group_compactions_total{%(selector)s}' % thanos.compact.dashboard.selector, + thanos.rule.dashboard.dimensions ) ) ) @@ -42,7 +47,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; 'Shows rate of execution for downsampling against blocks that are stored in the bucket by compaction group.' 
) + g.queryPanel( - 'sum(rate(thanos_compact_downsample_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, group)', + 'sum by (%(dimensions)s, group) (rate(thanos_compact_downsample_total{%(selector)s}[$interval]))' % thanos.compact.dashboard, 'downsample {{job}} {{group}}' ) + g.stack @@ -50,8 +55,9 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of executed downsampling against blocks that are stored in the bucket.') + g.qpsErrTotalPanel( - 'thanos_compact_downsample_failed_total{namespace="$namespace",job=~"$job"}', - 'thanos_compact_downsample_total{namespace="$namespace",job=~"$job"}', + 'thanos_compact_downsample_failed_total{%(selector)s}' % thanos.compact.dashboard.selector, + 'thanos_compact_downsample_total{%(selector)s}' % thanos.compact.dashboard.selector, + thanos.rule.dashboard.dimensions ) ) ) @@ -63,7 +69,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; 'Shows rate of execution for removals of blocks if their data is available as part of a block with a higher compaction level.' ) + g.queryPanel( - 'sum(rate(thanos_compact_garbage_collection_total{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'sum by (%(dimensions)s) (rate(thanos_compact_garbage_collection_total{%(selector)s}[$interval]))' % thanos.compact.dashboard, 'garbage collection {{job}}' ) + g.stack @@ -71,13 +77,14 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of executed garbage collections.') + g.qpsErrTotalPanel( - 'thanos_compact_garbage_collection_failures_total{namespace="$namespace",job=~"$job"}', - 'thanos_compact_garbage_collection_total{namespace="$namespace",job=~"$job"}', + 'thanos_compact_garbage_collection_failures_total{%(selector)s}' % thanos.compact.dashboard.selector, + 'thanos_compact_garbage_collection_total{%(selector)s}' % thanos.compact.dashboard.selector, + thanos.rule.dashboard.dimensions ) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to execute garbage collection in quantiles.') + - g.latencyPanel('thanos_compact_garbage_collection_duration_seconds', 'namespace="$namespace",job=~"$job"') + g.latencyPanel('thanos_compact_garbage_collection_duration_seconds', thanos.rule.dashboard.selector, thanos.rule.dashboard.dimensions) ) ) .addRow( @@ -88,7 +95,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; 'Shows deletion rate of blocks already marked for deletion.' ) + g.queryPanel( - 'sum(rate(thanos_compact_blocks_cleaned_total{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'sum by (%(dimensions)s) (rate(thanos_compact_blocks_cleaned_total{%(selector)s}[$interval]))' % thanos.compact.dashboard, 'Blocks cleanup {{job}}' ) + g.stack @@ -99,7 +106,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; 'Shows deletion failures rate of blocks already marked for deletion.' ) + g.queryPanel( - 'sum(rate(thanos_compact_block_cleanup_failures_total{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'sum by (%(dimensions)s) (rate(thanos_compact_block_cleanup_failures_total{%(selector)s}[$interval]))' % thanos.compact.dashboard, 'Blocks cleanup failures {{job}}' ) ) @@ -109,7 +116,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; 'Shows rate at which blocks are marked for deletion (from GC and retention policy).' 
) + g.queryPanel( - 'sum(rate(thanos_compact_blocks_marked_for_deletion_total{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'sum by (%(dimensions)s) (rate(thanos_compact_blocks_marked_for_deletion_total{%(selector)s}[$interval]))' % thanos.compact.dashboard, 'Blocks marked {{job}}' ) ) @@ -122,7 +129,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; 'Shows rate of execution for all meta files from blocks in the bucket into the memory.' ) + g.queryPanel( - 'sum(rate(thanos_blocks_meta_syncs_total{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'sum by (%(dimensions)s) (rate(thanos_blocks_meta_syncs_total{%(selector)s}[$interval]))' % thanos.compact.dashboard, 'sync {{job}}' ) + g.stack @@ -130,13 +137,14 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of executed meta file sync.') + g.qpsErrTotalPanel( - 'thanos_blocks_meta_sync_failures_total{namespace="$namespace",job=~"$job"}', - 'thanos_blocks_meta_syncs_total{namespace="$namespace",job=~"$job"}', + 'thanos_blocks_meta_sync_failures_total{%(selector)s}' % thanos.compact.dashboard.selector, + 'thanos_blocks_meta_syncs_total{%(selector)s}' % thanos.compact.dashboard.selector, + thanos.rule.dashboard.dimensions ) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to execute meta file sync, in quantiles.') + - g.latencyPanel('thanos_blocks_meta_sync_duration_seconds', 'namespace="$namespace",job=~"$job"') + g.latencyPanel('thanos_blocks_meta_sync_duration_seconds', thanos.rule.dashboard.selector, thanos.rule.dashboard.dimensions) ) ) .addRow( @@ -144,7 +152,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Rate', 'Shows rate of execution for operations against the bucket.') + g.queryPanel( - 'sum(rate(thanos_objstore_bucket_operations_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, operation)', + 'sum by (%(dimensions)s, operation) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[$interval]))' % thanos.compact.dashboard, '{{job}} {{operation}}' ) + g.stack @@ -152,20 +160,19 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of executed operations against the bucket.') + g.qpsErrTotalPanel( - 'thanos_objstore_bucket_operation_failures_total{namespace="$namespace",job=~"$job"}', - 'thanos_objstore_bucket_operations_total{namespace="$namespace",job=~"$job"}', + 'thanos_objstore_bucket_operation_failures_total{%(selector)s}' % thanos.compact.dashboard.selector, + 'thanos_objstore_bucket_operations_total{%(selector)s}' % thanos.compact.dashboard.selector, + thanos.rule.dashboard.dimensions ) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to execute operations against the bucket, in quantiles.') + - g.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', 'namespace="$namespace",job=~"$job"') + g.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', thanos.rule.dashboard.selector, thanos.rule.dashboard.dimensions) ) ) .addRow( - g.resourceUtilizationRow() - ) + - g.template('namespace', thanos.dashboard.namespaceQuery) + - g.template('job', 'up', 'namespace="$namespace", %(selector)s' % thanos.compact, true, '%(jobPrefix)s.*' % thanos.compact), + g.resourceUtilizationRow(thanos.rule.dashboard.selector, thanos.rule.dashboard.dimensions) + ), __overviewRows__+:: [ 
g.row('Compact') @@ -175,7 +182,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; 'Shows rate of execution for compactions against blocks that are stored in the bucket by compaction group.' ) + g.queryPanel( - 'sum(rate(thanos_compact_group_compactions_total{namespace="$namespace",%(selector)s}[$interval])) by (job)' % thanos.compact, + 'sum by (%(dimensions)s) (rate(thanos_compact_group_compactions_total{%(selector)s}[$interval]))' % thanos.dashboard.overview, 'compaction {{job}}' ) + g.stack + @@ -187,8 +194,9 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; 'Shows ratio of errors compared to the total number of executed compactions against blocks that are stored in the bucket.' ) + g.qpsErrTotalPanel( - 'thanos_compact_group_compactions_failures_total{namespace="$namespace",%(selector)s}' % thanos.compact, - 'thanos_compact_group_compactions_total{namespace="$namespace",%(selector)s}' % thanos.compact, + 'thanos_compact_group_compactions_failures_total{%(selector)s}' % thanos.dashboard.overview.selector, + 'thanos_compact_group_compactions_total{%(selector)s}' % thanos.dashboard.overview.selector, + thanos.dashboard.overview.dimensions ) + g.addDashboardLink(thanos.compact.title) ) + diff --git a/mixin/dashboards/defaults.libsonnet b/mixin/dashboards/defaults.libsonnet index 0c6d59b6f3..c2fccf2418 100644 --- a/mixin/dashboards/defaults.libsonnet +++ b/mixin/dashboards/defaults.libsonnet @@ -7,12 +7,12 @@ dashboard:: { prefix: 'Thanos / ', tags: error 'must provide dashboard tags', - namespaceQuery: error 'must provide a query for namespace variable for dashboard template', }, // Automatically add a uid to each dashboard based on the base64 encoding // of the file name and set the timezone to be 'default'.
grafanaDashboards:: { + local component = std.split(filename, '.')[0], [filename]: grafanaDashboards[filename] { uid: std.md5(filename), timezone: 'UTC', @@ -43,7 +43,39 @@ ), ], }, - } + } { + templating+: { + list+: [ + template.new( + level, + '$datasource', + 'label_values(%s, %s)' % [thanos.targetGroups[level], level], + label=level, + refresh=1, + sort=2, + ) + for level in std.objectFields(thanos.targetGroups) + ], + }, + } + if std.objectHas(thanos[component], 'selector') then { + templating+: { + local name = 'job', + local selector = std.join(', ', thanos.dashboard.selector + [thanos[component].selector]), + list+: [ + template.new( + name, + '$datasource', + 'label_values(up{%s}, %s)' % [selector, name], + label=name, + refresh=1, + sort=2, + current='all', + allValues=null, + includeAll=true + ), + ], + }, + } else {} for filename in std.objectFields(grafanaDashboards) }, } diff --git a/mixin/dashboards/overview.libsonnet b/mixin/dashboards/overview.libsonnet index a0b1082a9d..e27736db32 100644 --- a/mixin/dashboards/overview.libsonnet +++ b/mixin/dashboards/overview.libsonnet @@ -7,15 +7,13 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; }, grafanaDashboards+:: { 'overview.json': - g.dashboard(thanos.overview.title) + - g.template('namespace', thanos.dashboard.namespaceQuery), + g.dashboard(thanos.dashboard.overview.title), }, } + { local grafanaDashboards = super.grafanaDashboards, grafanaDashboards+:: { 'overview.json'+: { - __enumeratedRows__+:: std.foldl( function(acc, row) local n = std.length(row.panels); diff --git a/mixin/dashboards/query.libsonnet b/mixin/dashboards/query.libsonnet index 529bd3d7f2..7a82729811 100644 --- a/mixin/dashboards/query.libsonnet +++ b/mixin/dashboards/query.libsonnet @@ -1,73 +1,81 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; +local utils = import '../lib/utils.libsonnet'; { local thanos = self, query+:: { - jobPrefix: error 'must provide job prefix for Thanos Query dashboard', selector: error 'must provide selector for Thanos Query dashboard', title: error 'must provide title for Thanos Query dashboard', + dashboard:: { + selector: std.join(', ', thanos.dashboard.selector + ['job="$job"']), + dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']), + }, }, grafanaDashboards+:: { [if thanos.query != null then 'query.json']: + local queryHandlerSelector = utils.joinLabels([thanos.query.dashboard.selector, 'handler="query"']); + local queryRangeHandlerSelector = utils.joinLabels([thanos.query.dashboard.selector, 'handler="query_range"']); + local grpcUnarySelector = utils.joinLabels([thanos.query.dashboard.selector, 'grpc_type="unary"']); + local grpcServerStreamSelector = utils.joinLabels([thanos.query.dashboard.selector, 'grpc_type="server_stream"']); g.dashboard(thanos.query.title) .addRow( g.row('Instant Query API') .addPanel( g.panel('Rate', 'Shows rate of requests against /query for the given time.') + - g.httpQpsPanel('http_requests_total', 'namespace="$namespace",job=~"$job",handler="query"') + g.httpQpsPanel('http_requests_total', queryHandlerSelector, thanos.query.dashboard.dimensions) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the the total number of handled requests against /query.') + - g.httpErrPanel('http_requests_total', 'namespace="$namespace",job=~"$job",handler="query"') + g.httpErrPanel('http_requests_total', queryHandlerSelector, thanos.query.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle 
requests in quantiles.') + - g.latencyPanel('http_request_duration_seconds', 'namespace="$namespace",job=~"$job",handler="query"') + g.latencyPanel('http_request_duration_seconds', queryHandlerSelector, thanos.query.dashboard.dimensions) ) ) .addRow( g.row('Range Query API') .addPanel( g.panel('Rate', 'Shows rate of requests against /query_range for the given time range.') + - g.httpQpsPanel('http_requests_total', 'namespace="$namespace",job=~"$job",handler="query_range"') + g.httpQpsPanel('http_requests_total', queryRangeHandlerSelector, thanos.query.dashboard.dimensions) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the the total number of handled requests against /query_range.') + - g.httpErrPanel('http_requests_total', 'namespace="$namespace",job=~"$job",handler="query_range"') + g.httpErrPanel('http_requests_total', queryRangeHandlerSelector, thanos.query.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle requests in quantiles.') + - g.latencyPanel('http_request_duration_seconds', 'namespace="$namespace",job=~"$job",handler="query_range"') + g.latencyPanel('http_request_duration_seconds', queryRangeHandlerSelector, thanos.query.dashboard.dimensions) ) ) .addRow( g.row('gRPC (Unary)') .addPanel( g.panel('Rate', 'Shows rate of handled Unary gRPC requests from other queriers.') + - g.grpcQpsPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + g.grpcRequestsPanel('grpc_client_handled_total', grpcUnarySelector, thanos.query.dashboard.dimensions) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the the total number of handled requests from other queriers.') + - g.grpcErrorsPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + g.grpcErrorsPanel('grpc_client_handled_total', grpcUnarySelector, thanos.query.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle requests from other queriers, in quantiles.') + - g.grpcLatencyPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + g.latencyPanel('grpc_client_handling_seconds', grpcUnarySelector, thanos.query.dashboard.dimensions) ) ) .addRow( g.row('gRPC (Stream)') .addPanel( g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from other queriers.') + - g.grpcQpsPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.grpcRequestsPanel('grpc_client_handled_total', grpcServerStreamSelector, thanos.query.dashboard.dimensions) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the the total number of handled requests from other queriers.') + - g.grpcErrorsPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.grpcErrorsPanel('grpc_client_handled_total', grpcServerStreamSelector, thanos.query.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle requests from other queriers, in quantiles') + - g.grpcLatencyPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.latencyPanel('grpc_client_handling_seconds', grpcServerStreamSelector, thanos.query.dashboard.dimensions) ) ) .addRow( @@ -75,41 +83,41 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Rate', 'Shows rate of DNS lookups to discover stores.') + g.queryPanel( - 'sum(rate(thanos_query_store_apis_dns_lookups_total{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'sum by (%s) 
(rate(thanos_query_store_apis_dns_lookups_total{%s}[$interval]))' % [thanos.query.dashboard.dimensions, thanos.query.dashboard.selector], 'lookups {{job}}' ) ) .addPanel( g.panel('Errors', 'Shows ratio of failures compared to the the total number of executed DNS lookups.') + g.qpsErrTotalPanel( - 'thanos_query_store_apis_dns_failures_total{namespace="$namespace",job=~"$job"}', - 'thanos_query_store_apis_dns_lookups_total{namespace="$namespace",job=~"$job"}', + 'thanos_query_store_apis_dns_failures_total{%s}' % thanos.query.dashboard.selector, + 'thanos_query_store_apis_dns_lookups_total{%s}' % thanos.query.dashboard.selector, + thanos.query.dashboard.dimensions ) ) ) .addRow( - g.resourceUtilizationRow() - ) + - g.template('namespace', thanos.dashboard.namespaceQuery) + - g.template('job', 'up', 'namespace="$namespace", %(selector)s' % thanos.query, true, '%(jobPrefix)s.*' % thanos.query), + g.resourceUtilizationRow(thanos.query.dashboard.selector, thanos.query.dashboard.dimensions) + ), __overviewRows__+:: [ g.row('Instant Query') .addPanel( g.panel('Requests Rate', 'Shows rate of requests against /query for the given time.') + - g.httpQpsPanel('http_requests_total', 'namespace="$namespace",%(selector)s,handler="query"' % thanos.query) + + g.httpQpsPanel('http_requests_total', utils.joinLabels([thanos.dashboard.overview.selector, 'handler="query"']), thanos.dashboard.overview.dimensions) + g.addDashboardLink(thanos.query.title) ) .addPanel( g.panel('Requests Errors', 'Shows ratio of errors compared to the the total number of handled requests against /query.') + - g.httpErrPanel('http_requests_total', 'namespace="$namespace",%(selector)s,handler="query"' % thanos.query) + + g.httpErrPanel('http_requests_total', utils.joinLabels([thanos.dashboard.overview.selector, 'handler="query"']), thanos.dashboard.overview.dimensions) + g.addDashboardLink(thanos.query.title) ) .addPanel( g.sloLatency( 'Latency 99th Percentile', 'Shows how long has it taken to handle requests.', - 'http_request_duration_seconds_bucket{namespace="$namespace",%(selector)s,handler="query"}' % thanos.query, + 'http_request_duration_seconds_bucket{%s}' % utils.joinLabels([thanos.dashboard.overview.selector, 'handler="query"']), + thanos.dashboard.overview.dimensions, 0.99, 0.5, 1 @@ -120,19 +128,20 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; g.row('Range Query') .addPanel( g.panel('Requests Rate', 'Shows rate of requests against /query_range for the given time range.') + - g.httpQpsPanel('http_requests_total', 'namespace="$namespace",%(selector)s,handler="query_range"' % thanos.query) + + g.httpQpsPanel('http_requests_total', utils.joinLabels([thanos.dashboard.overview.selector, 'handler="query_range"']), thanos.dashboard.overview.dimensions) + g.addDashboardLink(thanos.query.title) ) .addPanel( g.panel('Requests Errors', 'Shows ratio of errors compared to the the total number of handled requests against /query_range.') + - g.httpErrPanel('http_requests_total', 'namespace="$namespace",%(selector)s,handler="query_range"' % thanos.query) + + g.httpErrPanel('http_requests_total', utils.joinLabels([thanos.dashboard.overview.selector, 'handler="query_range"']), thanos.dashboard.overview.dimensions) + g.addDashboardLink(thanos.query.title) ) .addPanel( g.sloLatency( 'Latency 99th Percentile', 'Shows how long has it taken to handle requests.', - 'http_request_duration_seconds_bucket{namespace="$namespace",%(selector)s,handler="query_range"}' % thanos.query, + 'http_request_duration_seconds_bucket{%s}' % 
utils.joinLabels([thanos.dashboard.overview.selector, 'handler="query_range"']), + thanos.dashboard.overview.dimensions, 0.99, 0.5, 1 diff --git a/mixin/dashboards/receive.libsonnet b/mixin/dashboards/receive.libsonnet index 4ff3317d95..bb23250fcb 100644 --- a/mixin/dashboards/receive.libsonnet +++ b/mixin/dashboards/receive.libsonnet @@ -1,28 +1,36 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; +local utils = import '../lib/utils.libsonnet'; { local thanos = self, receive+:: { - jobPrefix: error 'must provide job prefix for Thanos Receive dashboard', selector: error 'must provide selector for Thanos Receive dashboard', title: error 'must provide title for Thanos Receive dashboard', + dashboard:: { + selector: std.join(', ', thanos.dashboard.selector + ['job="$job"']), + dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']), + }, }, grafanaDashboards+:: { [if thanos.receive != null then 'receive.json']: + local receiveHandlerSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'handler="receive"']); + local grpcUnaryWriteSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method="RemoteWrite"']); + local grpcUnaryReadSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method!="RemoteWrite"']); + local grpcServerStreamSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="server_stream"']); g.dashboard(thanos.receive.title) .addRow( g.row('WRITE - Incoming Request') .addPanel( g.panel('Rate', 'Shows rate of incoming requests.') + - g.httpQpsPanel('http_requests_total', 'handler="receive",namespace="$namespace",job=~"$job"') + g.httpQpsPanel('http_requests_total', receiveHandlerSelector, thanos.receive.dashboard.dimensions) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of handled incoming requests.') + - g.httpErrPanel('http_requests_total', 'handler="receive",namespace="$namespace",job=~"$job"') + g.httpErrPanel('http_requests_total', receiveHandlerSelector, thanos.receive.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle incoming requests in quantiles.') + - g.latencyPanel('http_request_duration_seconds', 'handler="receive",namespace="$namespace",job=~"$job"') + g.latencyPanel('http_request_duration_seconds', receiveHandlerSelector, thanos.receive.dashboard.dimensions) ) ) .addRow( @@ -30,15 +38,16 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Rate', 'Shows rate of replications to other receive nodes.') + g.queryPanel( - 'sum(rate(thanos_receive_replications_total{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'sum by (%s) (rate(thanos_receive_replications_total{%s}[$interval]))' % [thanos.receive.dashboard.dimensions, thanos.receive.dashboard.selector], 'all {{job}}', ) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of replications to other receive nodes.') + g.qpsErrTotalPanel( - 'thanos_receive_replications_total{namespace="$namespace",job=~"$job",result="error"}', - 'thanos_receive_replications_total{namespace="$namespace",job=~"$job"}', + 'thanos_receive_replications_total{%s}' % utils.joinLabels([thanos.receive.dashboard.selector, 'result="error"']), + 'thanos_receive_replications_total{%s}' % thanos.receive.dashboard.selector, + thanos.receive.dashboard.dimensions ) ) ) @@ -47,61 +56,65 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; 
.addPanel( g.panel('Rate', 'Shows rate of forwarded requests to other receive nodes.') + g.queryPanel( - 'sum(rate(thanos_receive_forward_requests_total{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'sum by (%s) (rate(thanos_receive_forward_requests_total{%s}[$interval]))' % [thanos.receive.dashboard.dimensions, thanos.receive.dashboard.selector], 'all {{job}}', ) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of forwareded requests to other receive nodes.') + g.qpsErrTotalPanel( - 'thanos_receive_forward_requests_total{namespace="$namespace",job=~"$job",result="error"}', - 'thanos_receive_forward_requests_total{namespace="$namespace",job=~"$job"}', + 'thanos_receive_forward_requests_total{%s}' % utils.joinLabels([thanos.receive.dashboard.selector, 'result="error"']), + 'thanos_receive_forward_requests_total{%s}' % thanos.receive.dashboard.selector, + thanos.receive.dashboard.dimensions ) ) ) .addRow( + // TODO(https://github.com/thanos-io/thanos/issues/3926) g.row('WRITE - gRPC (Unary)') .addPanel( g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + - g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary",grpc_method="RemoteWrite"') + g.grpcRequestsPanel('grpc_server_handled_total', grpcUnaryWriteSelector, thanos.receive.dashboard.dimensions) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + - g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary",grpc_method="RemoteWrite"') + g.grpcErrorsPanel('grpc_server_handled_total', grpcUnaryWriteSelector, thanos.receive.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + - g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary",grpc_method="RemoteWrite"') + g.latencyPanel('grpc_server_handling_seconds', grpcUnaryWriteSelector, thanos.receive.dashboard.dimensions) ) ) .addRow( + // TODO(https://github.com/thanos-io/thanos/issues/3926) g.row('READ - gRPC (Unary)') .addPanel( g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + - g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary",grpc_method!="RemoteWrite"') + g.grpcRequestsPanel('grpc_server_handled_total', grpcUnaryReadSelector, thanos.receive.dashboard.dimensions) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + - g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary",grpc_method!="RemoteWrite"') + g.grpcErrorsPanel('grpc_server_handled_total', grpcUnaryReadSelector, thanos.receive.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + - g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary",grpc_method!="RemoteWrite"') + g.latencyPanel('grpc_server_handling_seconds', grpcUnaryReadSelector, thanos.receive.dashboard.dimensions) ) ) .addRow( + // TODO(https://github.com/thanos-io/thanos/issues/3926) g.row('READ - gRPC (Stream)') .addPanel( g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from queriers.') + - g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.grpcRequestsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.receive.dashboard.dimensions) ) .addPanel( 
g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + - g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.grpcErrorsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.receive.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + - g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.latencyPanel('grpc_server_handling_seconds', grpcServerStreamSelector, thanos.receive.dashboard.dimensions) ) ) .addRow( @@ -109,7 +122,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Successful Upload', 'Shows the relative time of last successful upload to the object-store bucket.') + g.tablePanel( - ['time() - max(thanos_objstore_bucket_last_successful_upload_time{namespace="$namespace",job=~"$job"}) by (job, bucket)'], + ['time() - max by (%s) (thanos_objstore_bucket_last_successful_upload_time{%s})' % [utils.joinLabels([thanos.receive.dashboard.dimensions, 'bucket']), thanos.receive.dashboard.selector]], { Value: { alias: 'Uploaded Ago', @@ -121,28 +134,27 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; ) ) .addRow( - g.resourceUtilizationRow() - ) + - g.template('namespace', thanos.dashboard.namespaceQuery) + - g.template('job', 'up', 'namespace="$namespace", %(selector)s' % thanos.receive, true, '%(jobPrefix)s.*' % thanos.receive), + g.resourceUtilizationRow(thanos.receive.dashboard.selector, thanos.receive.dashboard.dimensions) + ), __overviewRows__+:: [ g.row('Receive') .addPanel( g.panel('Incoming Requests Rate', 'Shows rate of incoming requests.') + - g.httpQpsPanel('http_requests_total', 'handler="receive",namespace="$namespace",%(selector)s' % thanos.receive) + + g.httpQpsPanel('http_requests_total', utils.joinLabels([thanos.dashboard.overview.selector, 'handler="receive"']), thanos.dashboard.overview.dimensions) + g.addDashboardLink(thanos.receive.title) ) .addPanel( g.panel('Incoming Requests Errors', 'Shows ratio of errors compared to the total number of handled incoming requests.') + - g.httpErrPanel('http_requests_total', 'handler="receive",namespace="$namespace",%(selector)s' % thanos.receive) + + g.httpErrPanel('http_requests_total', utils.joinLabels([thanos.dashboard.overview.selector, 'handler="receive"']), thanos.dashboard.overview.dimensions) + g.addDashboardLink(thanos.receive.title) ) .addPanel( g.sloLatency( 'Incoming Requests Latency 99th Percentile', 'Shows how long has it taken to handle incoming requests.', - 'http_request_duration_seconds_bucket{handler="receive",namespace="$namespace",%(selector)s}' % thanos.receive, + 'http_request_duration_seconds_bucket{%s}' % utils.joinLabels([thanos.dashboard.overview.selector, 'handler="receive"']), + thanos.dashboard.overview.dimensions, 0.99, 0.5, 1 diff --git a/mixin/dashboards/rule.libsonnet b/mixin/dashboards/rule.libsonnet index 9b5480cf32..a1dd966370 100644 --- a/mixin/dashboards/rule.libsonnet +++ b/mixin/dashboards/rule.libsonnet @@ -1,32 +1,35 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; +local utils = import '../lib/utils.libsonnet'; { local thanos = self, rule+:: { - jobPrefix: error 'must provide job prefix for Thanos Rule dashboard', selector: error 'must provide selector for Thanos Rule dashboard', title: error 'must provide title for Thanos Rule dashboard', + dashboard:: { + selector: 
std.join(', ', thanos.dashboard.selector + ['job="$job"']), + dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']), + }, }, grafanaDashboards+:: { [if thanos.rule != null then 'rule.json']: + local grpcUnarySelector = utils.joinLabels([thanos.rule.dashboard.selector, 'grpc_type="unary"']); + local grpcServerStreamSelector = utils.joinLabels([thanos.rule.dashboard.selector, 'grpc_type="server_stream"']); + g.dashboard(thanos.rule.title) .addRow( g.row('Rule Group Evaluations') .addPanel( g.panel('Rule Group Evaluations') + g.queryPanel( - ||| - sum by (strategy) (rate(prometheus_rule_evaluations_total{namespace="$namespace",job="$job"}[$interval])) - |||, + 'sum by (%s) (rate(prometheus_rule_evaluations_total{%s}[$interval]))' % [utils.joinLabels([thanos.rule.dashboard.dimensions, 'strategy']), thanos.rule.dashboard.selector], '{{ strategy }}', ) ) .addPanel( g.panel('Rule Group Evaluations Missed') + g.queryPanel( - ||| - sum by (strategy) (increase(prometheus_rule_group_iterations_missed_total{namespace="$namespace",job="$job"}[$interval])) - |||, + 'sum by (%s) (increase(prometheus_rule_group_iterations_missed_total{%s}[$interval]))' % [utils.joinLabels([thanos.rule.dashboard.dimensions, 'strategy']), thanos.rule.dashboard.selector], '{{ strategy }}', ) ) @@ -35,11 +38,11 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; g.queryPanel( ||| ( - max by(rule_group) (prometheus_rule_group_last_duration_seconds{namespace="$namespace",job="$job"}) + max by(%(dimensions)s, rule_group) (prometheus_rule_group_last_duration_seconds{%(selector)s}) > - sum by(rule_group) (prometheus_rule_group_interval_seconds{namespace="$namespace",job="$job"}) + sum by(%(dimensions)s, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s}) ) - |||, + ||| % thanos.rule.dashboard, '{{ rule_group }}', ) ) @@ -49,14 +52,14 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Dropped Rate', 'Shows rate of dropped alerts.') + g.queryPanel( - 'sum(rate(thanos_alert_sender_alerts_dropped_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, alertmanager)', + 'sum by (%(dimensions)s, alertmanager) (rate(thanos_alert_sender_alerts_dropped_total{%s}[$interval]))' % [thanos.rule.dashboard.dimensions, thanos.rule.dashboard.selector], '{{alertmanager}}' ) ) .addPanel( g.panel('Sent Rate', 'Shows rate of alerts that successfully sent to alert manager.') + g.queryPanel( - 'sum(rate(thanos_alert_sender_alerts_sent_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, alertmanager)', + 'sum by (%(dimensions)s, alertmanager) (rate(thanos_alert_sender_alerts_sent_total{%s}[$interval]))' % [thanos.rule.dashboard.dimensions, thanos.rule.dashboard.selector], '{{alertmanager}}' ) + g.stack @@ -64,13 +67,14 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Sent Errors', 'Shows ratio of errors compared to the total number of sent alerts.') + g.qpsErrTotalPanel( - 'thanos_alert_sender_errors_total{namespace="$namespace",job=~"$job"}', - 'thanos_alert_sender_alerts_sent_total{namespace="$namespace",job=~"$job"}', + 'thanos_alert_sender_errors_total{%s}' % thanos.rule.dashboard.selector, + 'thanos_alert_sender_alerts_sent_total{%s}' % thanos.rule.dashboard.selector, + thanos.rule.dashboard.dimensions ) ) .addPanel( g.panel('Sent Duration', 'Shows how long has it taken to send alerts to alert manager.') + - g.latencyPanel('thanos_alert_sender_latency_seconds', 'namespace="$namespace",job=~"$job"'), + 
g.latencyPanel('thanos_alert_sender_latency_seconds', thanos.rule.dashboard.selector, thanos.rule.dashboard.dimensions), ) ) .addRow( @@ -78,15 +82,16 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Push Rate', 'Shows rate of queued alerts.') + g.queryPanel( - 'sum(rate(thanos_alert_queue_alerts_dropped_total{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'sum by (%s) (rate(thanos_alert_queue_alerts_dropped_total{%s}[$interval]))' % [thanos.rule.dashboard.dimensions, thanos.rule.dashboard.selector], '{{job}}' ) ) .addPanel( g.panel('Drop Ratio', 'Shows ratio of dropped alerts compared to the total number of queued alerts.') + g.qpsErrTotalPanel( - 'thanos_alert_queue_alerts_dropped_total{namespace="$namespace",job=~"$job"}', - 'thanos_alert_queue_alerts_pushed_total{namespace="$namespace",job=~"$job"}', + 'thanos_alert_queue_alerts_dropped_total{%s}' % thanos.rule.dashboard.selector, + 'thanos_alert_queue_alerts_pushed_total{%s}' % thanos.rule.dashboard.selector, + thanos.rule.dashboard.dimensions ) ) ) @@ -94,44 +99,42 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; g.row('gRPC (Unary)') .addPanel( g.panel('Rate', 'Shows rate of handled Unary gRPC requests.') + - g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + g.grpcRequestsPanel('grpc_server_handled_total', grpcUnarySelector, thanos.rule.dashboard.dimensions) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests.') + - g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + g.grpcErrorsPanel('grpc_server_handled_total', grpcUnarySelector, thanos.rule.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle requests, in quantiles.') + - g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + g.latencyPanel('grpc_server_handling_seconds', grpcUnarySelector, thanos.rule.dashboard.dimensions) ) ) .addRow( g.row('gRPC (Stream)') .addPanel( g.panel('Rate', 'Shows rate of handled Streamed gRPC requests.') + - g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.grpcRequestsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.rule.dashboard.dimensions) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests.') + - g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.grpcErrorsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.rule.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle requests, in quantiles') + - g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.latencyPanel('grpc_server_handling_seconds', grpcServerStreamSelector, thanos.rule.dashboard.dimensions) ) ) .addRow( - g.resourceUtilizationRow() - ) + - g.template('namespace', thanos.dashboard.namespaceQuery) + - g.template('job', 'up', 'namespace="$namespace", %(selector)s' % thanos.rule, true, '%(jobPrefix)s.*' % thanos.rule), + g.resourceUtilizationRow(thanos.rule.dashboard.selector, thanos.rule.dashboard.dimensions) + ), __overviewRows__+:: [ g.row('Rule') .addPanel( g.panel('Alert Sent Rate', 'Shows rate of alerts that successfully sent to alert manager.') + g.queryPanel( - 'sum(rate(thanos_alert_sender_alerts_sent_total{namespace="$namespace",%(selector)s}[$interval])) by (job, 
alertmanager)' % thanos.rule, + 'sum by (%s) (rate(thanos_alert_sender_alerts_sent_total{%s}[$interval]))' % [utils.joinLabels([thanos.dashboard.overview.dimensions, 'alertmanager']), thanos.dashboard.overview.selector], '{{alertmanager}}' ) + g.addDashboardLink(thanos.rule.title) + @@ -140,8 +143,9 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Alert Sent Errors', 'Shows ratio of errors compared to the total number of sent alerts.') + g.qpsErrTotalPanel( - 'thanos_alert_sender_errors_total{namespace="$namespace",%(selector)s}' % thanos.rule, - 'thanos_alert_sender_alerts_sent_total{namespace="$namespace",%(selector)s}' % thanos.rule, + 'thanos_alert_sender_errors_total{%s}' % thanos.dashboard.overview.selector, + 'thanos_alert_sender_alerts_sent_total{%s}' % thanos.dashboard.overview.selector, + thanos.dashboard.overview.dimensions ) + g.addDashboardLink(thanos.rule.title) ) @@ -149,7 +153,8 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; g.sloLatency( 'Alert Sent Duration', 'Shows how long has it taken to send alerts to alert manager.', - 'thanos_alert_sender_latency_seconds_bucket{namespace="$namespace",%(selector)s}' % thanos.rule, + 'thanos_alert_sender_latency_seconds_bucket{%s}' % thanos.dashboard.overview.selector, + thanos.dashboard.overview.dimensions, 0.99, 0.5, 1 diff --git a/mixin/dashboards/sidecar.libsonnet b/mixin/dashboards/sidecar.libsonnet index 84c07ebfa0..de726945b3 100644 --- a/mixin/dashboards/sidecar.libsonnet +++ b/mixin/dashboards/sidecar.libsonnet @@ -1,43 +1,50 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; +local utils = import '../lib/utils.libsonnet'; { local thanos = self, sidecar+:: { - jobPrefix: error 'must provide job prefix for Thanos Sidecar dashboard', selector: error 'must provide selector for Thanos Sidecar dashboard', title: error 'must provide title for Thanos Sidecar dashboard', + dashboard:: { + selector: std.join(', ', thanos.dashboard.selector + ['job="$job"']), + dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']), + }, }, grafanaDashboards+:: { [if thanos.sidecar != null then 'sidecar.json']: + local grpcUnarySelector = utils.joinLabels([thanos.sidecar.dashboard.selector, 'grpc_type="unary"']); + local grpcServerSelector = utils.joinLabels([thanos.sidecar.dashboard.selector, 'grpc_type="server_stream"']); + g.dashboard(thanos.sidecar.title) .addRow( g.row('gRPC (Unary)') .addPanel( g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + - g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + g.grpcRequestsPanel('grpc_server_handled_total', grpcUnarySelector, thanos.sidecar.dashboard.dimensions) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + - g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + g.grpcErrorsPanel('grpc_server_handled_total', grpcUnarySelector, thanos.sidecar.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + - g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + g.latencyPanel('grpc_server_handling_seconds', grpcUnarySelector, thanos.sidecar.dashboard.dimensions) ) ) .addRow( g.row('gRPC (Stream)') .addPanel( g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from queriers.') + - g.grpcQpsPanel('server', 
'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.grpcRequestsPanel('grpc_server_handled_total', grpcServerSelector, thanos.sidecar.dashboard.dimensions) ) .addPanel( g.panel('Errors') + - g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.grpcErrorsPanel('grpc_server_handled_total', grpcServerSelector, thanos.sidecar.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + - g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.latencyPanel('grpc_server_handling_seconds', grpcServerSelector, thanos.sidecar.dashboard.dimensions) ) ) .addRow( @@ -45,7 +52,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Successful Upload', 'Shows the relative time of last successful upload to the object-store bucket.') + g.tablePanel( - ['time() - max(thanos_objstore_bucket_last_successful_upload_time{namespace="$namespace",job=~"$job"}) by (job, bucket)'], + ['time() - max by (%s) (thanos_objstore_bucket_last_successful_upload_time{%s})' % [utils.joinLabels([thanos.sidecar.dashboard.dimensions, 'bucket']), thanos.sidecar.dashboard.selector]], { Value: { alias: 'Uploaded Ago', @@ -61,7 +68,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Rate') + g.queryPanel( - 'sum(rate(thanos_objstore_bucket_operations_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, operation)', + 'sum by (%s) (rate(thanos_objstore_bucket_operations_total{%s}[$interval]))' % [utils.joinLabels([thanos.sidecar.dashboard.dimensions, 'operation']), thanos.sidecar.dashboard.selector], '{{job}} {{operation}}' ) + g.stack @@ -69,38 +76,38 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Errors') + g.qpsErrTotalPanel( - 'thanos_objstore_bucket_operation_failures_total{namespace="$namespace",job=~"$job"}', - 'thanos_objstore_bucket_operations_total{namespace="$namespace",job=~"$job"}', + 'thanos_objstore_bucket_operation_failures_total{%s}' % thanos.sidecar.dashboard.selector, + 'thanos_objstore_bucket_operations_total{%s}' % thanos.sidecar.dashboard.selector, + thanos.sidecar.dashboard.dimensions ) ) .addPanel( g.panel('Duration') + - g.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', 'namespace="$namespace",job=~"$job"') + g.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', thanos.sidecar.dashboard.selector, thanos.sidecar.dashboard.dimensions) ) ) .addRow( - g.resourceUtilizationRow() - ) + - g.template('namespace', thanos.dashboard.namespaceQuery) + - g.template('job', 'up', 'namespace="$namespace", %(selector)s' % thanos.sidecar, true, '%(jobPrefix)s.*' % thanos.sidecar), + g.resourceUtilizationRow(thanos.sidecar.dashboard.selector, thanos.sidecar.dashboard.dimensions) + ), __overviewRows__+:: [ g.row('Sidecar') .addPanel( g.panel('gPRC (Unary) Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + - g.grpcQpsPanel('server', 'namespace="$namespace",%(selector)s,grpc_type="unary"' % thanos.sidecar) + + g.grpcRequestsPanel('grpc_server_handled_total', utils.joinLabels([thanos.dashboard.overview.selector, 'grpc_type="unary"']), thanos.dashboard.overview.dimensions) + g.addDashboardLink(thanos.sidecar.title) ) .addPanel( g.panel('gPRC (Unary) Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + - g.grpcErrorsPanel('server', 
'namespace="$namespace",%(selector)s,grpc_type="unary"' % thanos.sidecar) + + g.grpcErrorsPanel('grpc_server_handled_total', utils.joinLabels([thanos.dashboard.overview.selector, 'grpc_type="unary"']), thanos.dashboard.overview.dimensions) + g.addDashboardLink(thanos.sidecar.title) ) .addPanel( g.sloLatency( 'gPRC (Unary) Latency 99th Percentile', 'Shows how long has it taken to handle requests from queriers, in quantiles.', - 'grpc_server_handling_seconds_bucket{grpc_type="unary",namespace="$namespace",%(selector)s}' % thanos.sidecar, + 'grpc_server_handling_seconds_bucket{%s}' % utils.joinLabels([thanos.dashboard.overview.selector, 'grpc_type="unary"']), + thanos.dashboard.overview.dimensions, 0.99, 0.5, 1 diff --git a/mixin/dashboards/store.libsonnet b/mixin/dashboards/store.libsonnet index c94ae0f2db..93b2a860b8 100644 --- a/mixin/dashboards/store.libsonnet +++ b/mixin/dashboards/store.libsonnet @@ -1,43 +1,50 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; +local utils = import '../lib/utils.libsonnet'; { local thanos = self, store+:: { - jobPrefix: error 'must provide job prefix for Thanos Store dashboard', selector: error 'must provide selector for Thanos Store dashboard', title: error 'must provide title for Thanos Store dashboard', + dashboard:: { + selector: std.join(', ', thanos.dashboard.selector + ['job="$job"']), + dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']), + }, }, grafanaDashboards+:: { [if thanos.store != null then 'store.json']: + local grpcUnarySelector = utils.joinLabels([thanos.store.dashboard.selector, 'grpc_type="unary"']); + local grpcServerStreamSelector = utils.joinLabels([thanos.store.dashboard.selector, 'grpc_type="server_stream"']); + g.dashboard(thanos.store.title) .addRow( g.row('gRPC (Unary)') .addPanel( g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + - g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + g.grpcRequestsPanel('grpc_server_handled_total', grpcUnarySelector, thanos.store.dashboard.dimensions) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + - g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + g.grpcErrorsPanel('grpc_server_handled_total', grpcUnarySelector, thanos.store.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + - g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + g.latencyPanel('grpc_server_handling_seconds', grpcUnarySelector, thanos.store.dashboard.dimensions) ) ) .addRow( g.row('gRPC (Stream)') .addPanel( g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from queriers.') + - g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.grpcRequestsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.store.dashboard.dimensions) ) .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + - g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.grpcErrorsPanel('grpc_server_handled_total', grpcServerStreamSelector, thanos.store.dashboard.dimensions) ) .addPanel( g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + - g.grpcLatencyPanel('server', 
'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + g.latencyPanel('grpc_server_handling_seconds', grpcServerStreamSelector, thanos.store.dashboard.dimensions) ) ) .addRow( @@ -45,7 +52,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Rate', 'Shows rate of execution for operations against the bucket.') + g.queryPanel( - 'sum(rate(thanos_objstore_bucket_operations_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, operation)', + 'sum by (%s) (rate(thanos_objstore_bucket_operations_total{%s}[$interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'operation']), thanos.store.dashboard.selector], '{{job}} {{operation}}' ) + g.stack @@ -53,7 +60,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Errors', 'Shows ratio of errors compared to the total number of executed operations against the bucket.') + g.queryPanel( - 'sum by (job, operation) (rate(thanos_objstore_bucket_operation_failures_total{namespace="$namespace",job=~"$job"}[$interval])) / sum by (job, operation) (rate(thanos_objstore_bucket_operations_total{namespace="$namespace",job=~"$job"}[$interval]))', + 'sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[$interval])) / sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[$interval]))' % thanos.store.dashboard { dimensions: utils.joinLabels([thanos.store.dashboard.dimensions, 'operation']) }, '{{job}} {{operation}}' ) + { yaxes: g.yaxes({ format: 'percentunit' }) } + @@ -61,7 +68,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; ) .addPanel( g.panel('Duration', 'Shows how long has it taken to execute operations against the bucket, in quantiles.') + - $.latencyByOperationPanel('thanos_objstore_bucket_operation_duration_seconds', 'namespace="$namespace",job=~"$job"') + $.latencyByOperationPanel('thanos_objstore_bucket_operation_duration_seconds', thanos.store.dashboard.selector, thanos.store.dashboard.dimensions) ) ) .addRow( @@ -69,7 +76,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Block Load Rate', 'Shows rate of block loads from the bucket.') + g.queryPanel( - 'sum(rate(thanos_bucket_store_block_loads_total{namespace="$namespace",job=~"$job"}[$interval]))', + 'sum by (%s) (rate(thanos_bucket_store_block_loads_total{%s}[$interval]))' % [thanos.store.dashboard.dimensions, thanos.store.dashboard.selector], 'block loads' ) + g.stack @@ -77,14 +84,15 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Block Load Errors', 'Shows ratio of errors compared to the total number of block loads from the bucket.') + g.qpsErrTotalPanel( - 'thanos_bucket_store_block_load_failures_total{namespace="$namespace",job=~"$job"}', - 'thanos_bucket_store_block_loads_total{namespace="$namespace",job=~"$job"}', + 'thanos_bucket_store_block_load_failures_total{%s}' % thanos.store.dashboard.selector, + 'thanos_bucket_store_block_loads_total{%s}' % thanos.store.dashboard.selector, + thanos.store.dashboard.dimensions ) ) .addPanel( g.panel('Block Drop Rate', 'Shows rate of block drops.') + g.queryPanel( - 'sum(rate(thanos_bucket_store_block_drops_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, operation)', + 'sum by (%s) (rate(thanos_bucket_store_block_drops_total{%s}[$interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'operation']), thanos.store.dashboard.selector], 'block 
drops {{job}}' ) + g.stack @@ -92,8 +100,9 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Block Drop Errors', 'Shows ratio of errors compared to the total number of block drops.') + g.qpsErrTotalPanel( - 'thanos_bucket_store_block_drop_failures_total{namespace="$namespace",job=~"$job"}', - 'thanos_bucket_store_block_drops_total{namespace="$namespace",job=~"$job"}', + 'thanos_bucket_store_block_drop_failures_total{%s}' % thanos.store.dashboard.selector, + 'thanos_bucket_store_block_drops_total{%s}' % thanos.store.dashboard.selector, + thanos.store.dashboard.dimensions ) ) ) @@ -102,7 +111,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Requests', 'Show rate of cache requests.') + g.queryPanel( - 'sum(rate(thanos_store_index_cache_requests_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, item_type)', + 'sum by (%s) (rate(thanos_store_index_cache_requests_total{%s}[$interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'item_type']), thanos.store.dashboard.selector], '{{job}} {{item_type}}', ) + g.stack @@ -110,7 +119,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Hits', 'Shows ratio of errors compared to the total number of cache hits.') + g.queryPanel( - 'sum(rate(thanos_store_index_cache_hits_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, item_type)', + 'sum by (%s) (rate(thanos_store_index_cache_hits_total{%s}[$interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'item_type']), thanos.store.dashboard.selector], '{{job}} {{item_type}}', ) + g.stack @@ -118,7 +127,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Added', 'Show rate of added items to cache.') + g.queryPanel( - 'sum(rate(thanos_store_index_cache_items_added_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, item_type)', + 'sum by (%s) (rate(thanos_store_index_cache_items_added_total{%s}[$interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'item_type']), thanos.store.dashboard.selector], '{{job}} {{item_type}}', ) + g.stack @@ -126,7 +135,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; .addPanel( g.panel('Evicted', 'Show rate of evicted items from cache.') + g.queryPanel( - 'sum(rate(thanos_store_index_cache_items_evicted_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, item_type)', + 'sum by (%s) (rate(thanos_store_index_cache_items_evicted_total{%s}[$interval]))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'item_type']), thanos.store.dashboard.selector], '{{job}} {{item_type}}', ) + g.stack @@ -138,9 +147,9 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; g.panel('Chunk Size', 'Shows size of chunks that have sent to the bucket.') + g.queryPanel( [ - 'histogram_quantile(0.99, sum(rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace="$namespace",job=~"$job"}[$interval])) by (job, le))', - 'sum(rate(thanos_bucket_store_sent_chunk_size_bytes_sum{namespace="$namespace",job=~"$job"}[$interval])) by (job) / sum(rate(thanos_bucket_store_sent_chunk_size_bytes_count{namespace="$namespace",job=~"$job"}[$interval])) by (job)', - 'histogram_quantile(0.99, sum(rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace="$namespace",job=~"$job"}[$interval])) by (job, le))', + 'histogram_quantile(0.99, sum by (%s) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{%s}[$interval])))' 
% [utils.joinLabels([thanos.store.dashboard.dimensions, 'le']), thanos.store.dashboard.selector], + 'sum by (%(dimensions)s) (rate(thanos_bucket_store_sent_chunk_size_bytes_sum{%(selector)s}[$interval])) / sum by (%(dimensions)s) (rate(thanos_bucket_store_sent_chunk_size_bytes_count{%(selector)s}[$interval]))' % thanos.store.dashboard, + 'histogram_quantile(0.99, sum by (%s) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{%s}[$interval])))' % [utils.joinLabels([thanos.store.dashboard.dimensions, 'le']), thanos.store.dashboard.selector], ], [ 'P99', @@ -157,9 +166,9 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; g.panel('Block queried') + g.queryPanel( [ - 'thanos_bucket_store_series_blocks_queried{namespace="$namespace",job=~"$job",quantile="0.99"}', - 'sum(rate(thanos_bucket_store_series_blocks_queried_sum{namespace="$namespace",job=~"$job"}[$interval])) by (job) / sum(rate(thanos_bucket_store_series_blocks_queried_count{namespace="$namespace",job=~"$job"}[$interval])) by (job)', - 'thanos_bucket_store_series_blocks_queried{namespace="$namespace",job=~"$job",quantile="0.50"}', + 'thanos_bucket_store_series_blocks_queried{%s, quantile="0.99"}' % thanos.store.dashboard.selector, + 'sum by (%(dimensions)s) (rate(thanos_bucket_store_series_blocks_queried_sum{%(selector)s}[$interval])) / sum by (%(dimensions)s) (rate(thanos_bucket_store_series_blocks_queried_count{%(selector)s}[$interval]))' % thanos.store.dashboard, + 'thanos_bucket_store_series_blocks_queried{%s, quantile="0.50"}' % thanos.store.dashboard.selector, ], [ 'P99', 'mean {{job}}', @@ -171,9 +180,9 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; g.panel('Data Fetched', 'Show the size of data fetched') + g.queryPanel( [ - 'thanos_bucket_store_series_data_fetched{namespace="$namespace",job=~"$job",quantile="0.99"}', - 'sum(rate(thanos_bucket_store_series_data_fetched_sum{namespace="$namespace",job=~"$job"}[$interval])) by (job) / sum(rate(thanos_bucket_store_series_data_fetched_count{namespace="$namespace",job=~"$job"}[$interval])) by (job)', - 'thanos_bucket_store_series_data_fetched{namespace="$namespace",job=~"$job",quantile="0.50"}', + 'thanos_bucket_store_series_data_fetched{%s, quantile="0.99"}' % thanos.store.dashboard.selector, + 'sum by (%(dimensions)s) (rate(thanos_bucket_store_series_data_fetched_sum{%(selector)s}[$interval])) / sum by (%(dimensions)s) (rate(thanos_bucket_store_series_data_fetched_count{%(selector)s}[$interval]))' % thanos.store.dashboard, + 'thanos_bucket_store_series_data_fetched{%s, quantile="0.50"}' % thanos.store.dashboard.selector, ], [ 'P99', 'mean {{job}}', @@ -186,9 +195,9 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; g.panel('Result series') + g.queryPanel( [ - 'thanos_bucket_store_series_result_series{namespace="$namespace",job=~"$job",quantile="0.99"}', - 'sum(rate(thanos_bucket_store_series_result_series_sum{namespace="$namespace",job=~"$job"}[$interval])) by (job) / sum(rate(thanos_bucket_store_series_result_series_count{namespace="$namespace",job=~"$job"}[$interval])) by (job)', - 'thanos_bucket_store_series_result_series{namespace="$namespace",job=~"$job",quantile="0.50"}', + 'thanos_bucket_store_series_result_series{%s,quantile="0.99"}' % thanos.store.dashboard.selector, + 'sum by (%(dimensions)s) (rate(thanos_bucket_store_series_result_series_sum{%(selector)s}[$interval])) / sum by (%(dimensions)s) (rate(thanos_bucket_store_series_result_series_count{%(selector)s}[$interval]))' % thanos.store.dashboard, + 
'thanos_bucket_store_series_result_series{%s,quantile="0.50"}' % thanos.store.dashboard.selector, ], [ 'P99', 'mean {{job}}', @@ -201,40 +210,39 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; g.row('Series Operation Durations') .addPanel( g.panel('Get All', 'Shows how long has it taken to get all series.') + - g.latencyPanel('thanos_bucket_store_series_get_all_duration_seconds', 'namespace="$namespace",job=~"$job"') + g.latencyPanel('thanos_bucket_store_series_get_all_duration_seconds', thanos.store.dashboard.selector, thanos.store.dashboard.dimensions) ) .addPanel( g.panel('Merge', 'Shows how long has it taken to merge series.') + - g.latencyPanel('thanos_bucket_store_series_merge_duration_seconds', 'namespace="$namespace",job=~"$job"') + g.latencyPanel('thanos_bucket_store_series_merge_duration_seconds', thanos.store.dashboard.selector, thanos.store.dashboard.dimensions) ) .addPanel( g.panel('Gate', 'Shows how long has it taken for a series to wait at the gate.') + - g.latencyPanel('thanos_bucket_store_series_gate_duration_seconds', 'namespace="$namespace",job=~"$job"') + g.latencyPanel('thanos_bucket_store_series_gate_duration_seconds', thanos.store.dashboard.selector, thanos.store.dashboard.dimensions) ) ) .addRow( - g.resourceUtilizationRow() - ) + - g.template('namespace', thanos.dashboard.namespaceQuery) + - g.template('job', 'up', 'namespace="$namespace", %(selector)s' % thanos.store, true, '%(jobPrefix)s.*' % thanos.store), + g.resourceUtilizationRow(thanos.store.dashboard.selector, thanos.store.dashboard.dimensions) + ), __overviewRows__+:: [ g.row('Store') .addPanel( g.panel('gPRC (Unary) Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + - g.grpcQpsPanel('server', 'namespace="$namespace",%(selector)s,grpc_type="unary"' % thanos.store) + + g.grpcRequestsPanel('grpc_server_handled_total', utils.joinLabels([thanos.dashboard.overview.selector, 'grpc_type="unary"']), thanos.dashboard.overview.dimensions) + g.addDashboardLink(thanos.store.title) ) .addPanel( g.panel('gPRC (Unary) Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + - g.grpcErrorsPanel('server', 'namespace="$namespace",%(selector)s,grpc_type="unary"' % thanos.store) + + g.grpcErrorsPanel('grpc_server_handled_total', utils.joinLabels([thanos.dashboard.overview.selector, 'grpc_type="unary"']), thanos.dashboard.overview.dimensions) + g.addDashboardLink(thanos.store.title) ) .addPanel( g.sloLatency( 'gRPC Latency 99th Percentile', 'Shows how long has it taken to handle requests from queriers.', - 'grpc_server_handling_seconds_bucket{grpc_type="unary",namespace="$namespace",%(selector)s}' % thanos.store, + 'grpc_server_handling_seconds_bucket{%s}' % utils.joinLabels([thanos.dashboard.overview.selector, 'grpc_type="unary"']), + thanos.dashboard.overview.dimensions, 0.99, 0.5, 1 @@ -244,11 +252,13 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; ], }, - latencyByOperationPanel(metricName, selector, multiplier='1'):: { + latencyByOperationPanel(metricName, selector, dimensions, multiplier='1'):: { + local params = { metricName: metricName, selector: selector, multiplier: multiplier, dimensions: dimensions }, + nullPointMode: 'null as zero', targets: [ { - expr: 'histogram_quantile(0.99, sum(rate(%s_bucket{%s}[$interval])) by (job, operation, le)) * %s' % [metricName, selector, multiplier], + expr: 'histogram_quantile(0.99, sum by (%(dimensions)s, operation, le) (rate(%(metricName)s_bucket{%(selector)s}[$interval]))) 
* %(multiplier)s' % params, format: 'time_series', intervalFactor: 2, legendFormat: 'P99 {{job}}', @@ -256,7 +266,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; step: 10, }, { - expr: 'sum(rate(%s_sum{%s}[$interval])) by (job, operation) * %s / sum(rate(%s_count{%s}[$interval])) by (job, operation)' % [metricName, selector, multiplier, metricName, selector], + expr: 'sum by (%(dimensions)s, operation) (rate(%(metricName)s_sum{%(selector)s}[$interval])) * %(multiplier)s / sum by (%(dimensions)s, operation) (rate(%(metricName)s_count{%(selector)s}[$interval]))' % params, format: 'time_series', intervalFactor: 2, legendFormat: 'mean {{job}}', @@ -264,7 +274,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; step: 10, }, { - expr: 'histogram_quantile(0.50, sum(rate(%s_bucket{%s}[$interval])) by (job, operation, le)) * %s' % [metricName, selector, multiplier], + expr: 'histogram_quantile(0.50, sum by (%(dimensions)s, operation, le) (rate(%(metricName)s_bucket{%(selector)s}[$interval]))) * %(multiplier)s' % params, format: 'time_series', intervalFactor: 2, legendFormat: 'P50 {{job}}', diff --git a/mixin/jsonnetfile.lock.json b/mixin/jsonnetfile.lock.json index 7e871c8033..de1b098e19 100644 --- a/mixin/jsonnetfile.lock.json +++ b/mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafonnet" } }, - "version": "8fb95bd89990e493a8534205ee636bfcb8db67bd", - "sum": "tDuuSKE9f4Ew2bjBM33Rs6behLEAzkmKkShSt+jpAak=" + "version": "daad85cf3fad3580e58029414630e29956aefe21", + "sum": "zkOBVXtNSGlOdbm5TRCbEik7c/Jk+btbJqaE9qW8j3Y=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "grafana-builder" } }, - "version": "f4c59f64f80442f871a06c91edf74d014b82acaf", - "sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE=" + "version": "4d4b5b1ce01003547a110f93cc86b8b7afb282a6", + "sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8=" } ], "legacyImports": false diff --git a/mixin/lib/thanos-grafana-builder/builder.libsonnet b/mixin/lib/thanos-grafana-builder/builder.libsonnet index 4039ecf7d4..5b879a23f3 100644 --- a/mixin/lib/thanos-grafana-builder/builder.libsonnet +++ b/mixin/lib/thanos-grafana-builder/builder.libsonnet @@ -1,5 +1,6 @@ local grafana = import 'grafonnet/grafana.libsonnet'; local template = grafana.template; +local utils = import '../utils.libsonnet'; (import 'grafana-builder/grafana.libsonnet') + { @@ -22,37 +23,6 @@ local template = grafana.template; ], }, - template(name, metricName, selector='', includeAll=false, allValues=''):: - local t = if includeAll then - template.new( - name, - '$datasource', - 'label_values(%s{%s}, %s)' % [metricName, selector, name], - label=name, - refresh=1, - sort=2, - current='all', - allValues=allValues, - includeAll=true - ) - else - template.new( - name, - '$datasource', - 'label_values(%s{%s}, %s)' % [metricName, selector, name], - label=name, - refresh=1, - sort=2, - ); - - { - templating+: { - list+: [ - t, - ], - }, - }, - spanSize(size):: { span: size, }, @@ -69,39 +39,50 @@ local template = grafana.template; }, }, - latencyPanel(metricName, selector, multiplier='1'):: { + latencyPanel(metricName, selector, dimensions, multiplier='1'):: { + local aggregatedLabels = std.split(dimensions, ','), + local dimensionsTemplate = std.join(' ', ['{{%s}}' % std.stripChars(label, ' ') for label in aggregatedLabels]), + nullPointMode: 'null as zero', targets: [ { - expr: 'histogram_quantile(0.99, sum(rate(%s_bucket{%s}[$interval])) by (job, le)) * %s' % [metricName, selector, multiplier], + expr: 'histogram_quantile(%.2f, sum 
by (%s) (rate(%s_bucket{%s}[$interval]))) * %s' % [percentile, utils.joinLabels([dimensions, 'le']), metricName, selector, multiplier], format: 'time_series', intervalFactor: 2, - legendFormat: 'P99 {{job}}', + legendFormat: 'p%d %s' % [100 * percentile, dimensionsTemplate], + logBase: 10, + min: null, + max: null, refId: 'A', step: 10, + } + for percentile in [0.5, 0.9, 0.99] + ], + yaxes: $.yaxes('s'), + seriesOverrides: [ + { + alias: 'p99', + color: '#FA6400', + fill: 1, + fillGradient: 1, }, { - expr: 'sum(rate(%s_sum{%s}[$interval])) by (job) * %s / sum(rate(%s_count{%s}[$interval])) by (job)' % [metricName, selector, multiplier, metricName, selector], - format: 'time_series', - intervalFactor: 2, - legendFormat: 'mean {{job}}', - refId: 'B', - step: 10, + alias: 'p90', + color: '#E0B400', + fill: 1, + fillGradient: 1, }, { - expr: 'histogram_quantile(0.50, sum(rate(%s_bucket{%s}[$interval])) by (job, le)) * %s' % [metricName, selector, multiplier], - format: 'time_series', - intervalFactor: 2, - legendFormat: 'P50 {{job}}', - refId: 'C', - step: 10, + alias: 'p50', + color: '#37872D', + fill: 10, + fillGradient: 0, }, ], - yaxes: $.yaxes('s'), }, - qpsErrTotalPanel(selectorErr, selectorTotal):: { - local expr(selector) = 'sum(rate(' + selector + '[$interval]))', // {{job}} + qpsErrTotalPanel(selectorErr, selectorTotal, dimensions):: { + local expr(selector) = 'sum by (%s) (rate(%s[$interval]))' % [dimensions, selector], aliasColors: { 'error': '#E24D42', @@ -119,8 +100,8 @@ local template = grafana.template; yaxes: $.yaxes({ format: 'percentunit' }), } + $.stack, - qpsSuccErrRatePanel(selectorErr, selectorTotal):: { - local expr(selector) = 'sum(rate(' + selector + '[$interval]))', // {{job}} + qpsSuccErrRatePanel(selectorErr, selectorTotal, dimensions):: { + local expr(selector) = 'sum by (%s) (rate(%s[$interval]))' % [dimensions, selector], aliasColors: { success: '#7EB26D', @@ -147,26 +128,26 @@ local template = grafana.template; yaxes: $.yaxes({ format: 'percentunit', max: 1 }), } + $.stack, - resourceUtilizationRow():: + resourceUtilizationRow(selector, dimensions):: $.row('Resources') .addPanel( $.panel('Memory Used') + $.queryPanel( [ - 'go_memstats_alloc_bytes{namespace="$namespace",job=~"$job"}', - 'go_memstats_heap_alloc_bytes{namespace="$namespace",job=~"$job"}', - 'rate(go_memstats_alloc_bytes_total{namespace="$namespace",job=~"$job"}[30s])', - 'rate(go_memstats_heap_alloc_bytes{namespace="$namespace",job=~"$job"}[30s])', - 'go_memstats_stack_inuse_bytes{namespace="$namespace",job=~"$job"}', - 'go_memstats_heap_inuse_bytes{namespace="$namespace",job=~"$job"}', + 'go_memstats_alloc_bytes{%s}' % selector, + 'go_memstats_heap_alloc_bytes{%s}' % selector, + 'rate(go_memstats_alloc_bytes_total{%s})[30s]' % selector, + 'rate(go_memstats_heap_alloc_bytes{%s})[30s]' % selector, + 'go_memstats_stack_inuse_bytes{%s}' % selector, + 'go_memstats_heap_inuse_bytes{%s}' % selector, ], [ 'alloc all {{instance}}', 'alloc heap {{instance}}', 'alloc rate all {{instance}}', 'alloc rate heap {{instance}}', - 'inuse stack {{instance}}', 'inuse heap {{instance}}', + 'inuse stack {{instance}}', ] ) + { yaxes: $.yaxes('bytes') }, @@ -174,14 +155,14 @@ local template = grafana.template; .addPanel( $.panel('Goroutines') + $.queryPanel( - 'go_goroutines{namespace="$namespace",job=~"$job"}', + 'go_goroutines{%s}' % selector, '{{instance}}' ) ) .addPanel( $.panel('GC Time Quantiles') + $.queryPanel( - 'go_gc_duration_seconds{namespace="$namespace",job=~"$job"}', + 'go_gc_duration_seconds{%s}' % 
selector, '{{quantile}} {{instance}}' ) ) + diff --git a/mixin/lib/thanos-grafana-builder/grpc.libsonnet b/mixin/lib/thanos-grafana-builder/grpc.libsonnet index c7eb4be517..12f25ea345 100644 --- a/mixin/lib/thanos-grafana-builder/grpc.libsonnet +++ b/mixin/lib/thanos-grafana-builder/grpc.libsonnet @@ -1,63 +1,46 @@ +local utils = import '../utils.libsonnet'; + { - grpcQpsPanel(type, selector):: { - local prefix = if type == 'client' then 'grpc_client' else 'grpc_server', + grpcRequestsPanel(metric, selector, dimensions):: { + local aggregatedLabels = std.split(dimensions, ','), + local dimensionsTemplate = std.join(' ', ['{{%s}}' % std.stripChars(label, ' ') for label in aggregatedLabels]), - aliasColors: { - Aborted: '#EAB839', - AlreadyExists: '#7EB26D', - FailedPrecondition: '#6ED0E0', - Unimplemented: '#6ED0E0', - InvalidArgument: '#EF843C', - NotFound: '#EF843C', - PermissionDenied: '#EF843C', - Unauthenticated: '#EF843C', - Canceled: '#E24D42', - DataLoss: '#E24D42', - DeadlineExceeded: '#E24D42', - Internal: '#E24D42', - OutOfRange: '#E24D42', - ResourceExhausted: '#E24D42', - Unavailable: '#E24D42', - Unknown: '#E24D42', - OK: '#7EB26D', - 'error': '#E24D42', - }, + seriesOverrides: [ + { alias: '/Aborted/', color: '#EAB839' }, + { alias: '/AlreadyExists/', color: '#37872D' }, + { alias: '/FailedPrecondition/', color: '#E0B400' }, + { alias: '/Unimplemented/', color: '#E0B400' }, + { alias: '/InvalidArgument/', color: '#1F60C4' }, + { alias: '/NotFound/', color: '#1F60C4' }, + { alias: '/PermissionDenied/', color: '#1F60C4' }, + { alias: '/Unauthenticated/', color: '#1F60C4' }, + { alias: '/Canceled/', color: '#C4162A' }, + { alias: '/DataLoss/', color: '#C4162A' }, + { alias: '/DeadlineExceeded/', color: '#C4162A' }, + { alias: '/Internal/', color: '#C4162A' }, + { alias: '/OutOfRange/', color: '#C4162A' }, + { alias: '/ResourceExhausted/', color: '#C4162A' }, + { alias: '/Unavailable/', color: '#C4162A' }, + { alias: '/Unknown/', color: '#C4162A' }, + { alias: '/OK/', color: '#37872D' }, + { alias: 'error', color: '#C4162A' }, + ], targets: [ { - expr: 'sum(rate(%s_handled_total{%s}[$interval])) by (job, grpc_method, grpc_code)' % [prefix, selector], + expr: 'sum by (%s) (rate(%s{%s}[$interval]))' % [utils.joinLabels(aggregatedLabels + ['grpc_method', 'grpc_code']), metric, selector], format: 'time_series', intervalFactor: 2, - legendFormat: '{{job}} {{grpc_method}} {{grpc_code}}', + legendFormat: dimensionsTemplate + ' {{grpc_method}} {{grpc_code}}', refId: 'A', step: 10, }, ], } + $.stack, - grpcErrorsPanel(type, selector):: - local prefix = if type == 'client' then 'grpc_client' else 'grpc_server'; + grpcErrorsPanel(metric, selector, dimensions):: $.qpsErrTotalPanel( - '%s_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss",%s}' % [prefix, selector], - '%s_started_total{%s}' % [prefix, selector], + '%s{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss",%s}' % [metric, selector], + '%s{%s}' % [metric, selector], + dimensions ), - - grpcLatencyPanel(type, selector, multiplier='1'):: - local prefix = if type == 'client' then 'grpc_client' else 'grpc_server'; - $.queryPanel( - [ - 'histogram_quantile(0.99, sum(rate(%s_handling_seconds_bucket{%s}[$interval])) by (job, grpc_method, le)) * %s' % [prefix, selector, multiplier], - ||| - sum(rate(%s_handling_seconds_sum{%s}[$interval])) by (job) * %s - / - sum(rate(%s_handling_seconds_count{%s}[$interval])) by (job) - ||| % [prefix, selector, multiplier, prefix, selector], - 
'histogram_quantile(0.50, sum(rate(%s_handling_seconds_bucket{%s}[$interval])) by (job, grpc_method, le)) * %s' % [prefix, selector, multiplier], - ], - [ - 'P99 {{job}} {{grpc_method}}', - 'mean {{job}} {{grpc_method}}', - 'P50 {{job}} {{grpc_method}}', - ] - ) + - { yaxes: $.yaxes('s') }, } diff --git a/mixin/lib/thanos-grafana-builder/http.libsonnet b/mixin/lib/thanos-grafana-builder/http.libsonnet index 3d1720cbf0..d1962cccc3 100644 --- a/mixin/lib/thanos-grafana-builder/http.libsonnet +++ b/mixin/lib/thanos-grafana-builder/http.libsonnet @@ -1,29 +1,34 @@ +local utils = import '../utils.libsonnet'; + { - httpQpsPanel(metricName, selector):: { - aliasColors: { - '1xx': '#EAB839', - '2xx': '#7EB26D', - '3xx': '#6ED0E0', - '4xx': '#EF843C', - '5xx': '#E24D42', - success: '#7EB26D', - 'error': '#E24D42', - }, + httpQpsPanel(metric, selector, dimensions):: { + local aggregatedLabels = std.split(dimensions, ','), + local dimensionsTemplate = std.join(' ', ['{{%s}}' % std.stripChars(label, ' ') for label in aggregatedLabels]), + + seriesOverrides: [ + { alias: '/1../', color: '#EAB839' }, + { alias: '/2../', color: '#37872D' }, + { alias: '/3../', color: '#E0B400' }, + { alias: '/4../', color: '#1F60C4' }, + { alias: '/5../', color: '#C4162A' }, + ], + targets: [ { - expr: 'sum(label_replace(rate(%s{%s}[$interval]),"status_code", "${1}xx", "code", "([0-9])..")) by (job, handler, status_code)' % [metricName, selector], + expr: 'sum by (%s) (rate(%s{%s}[$interval]))' % [utils.joinLabels(aggregatedLabels + ['handler', 'code']), metric, selector], format: 'time_series', intervalFactor: 2, - legendFormat: '{{job}} {{handler}} {{status_code}}', + legendFormat: dimensionsTemplate + ' {{handler}} {{code}}', refId: 'A', step: 10, }, ], } + $.stack, - httpErrPanel(metricName, selector):: + httpErrPanel(metric, selector, dimensions):: $.qpsErrTotalPanel( - '%s{%s,code=~"5.."}' % [metricName, selector], - '%s{%s}' % [metricName, selector], + '%s{%s,code=~"5.."}' % [metric, selector], + '%s{%s}' % [metric, selector], + dimensions ), } diff --git a/mixin/lib/thanos-grafana-builder/slo.libsonnet b/mixin/lib/thanos-grafana-builder/slo.libsonnet index bee885037a..80acca3cfc 100644 --- a/mixin/lib/thanos-grafana-builder/slo.libsonnet +++ b/mixin/lib/thanos-grafana-builder/slo.libsonnet @@ -1,9 +1,14 @@ +local utils = import '../utils.libsonnet'; + { - sloLatency(title, description, selector, quantile, warning, critical):: + sloLatency(title, description, selector, dimensions, quantile, warning, critical):: + local aggregatedLabels = std.split(dimensions, ','); + local dimensionsTemplate = std.join(' ', ['{{%s}}' % std.stripChars(label, ' ') for label in aggregatedLabels]); + $.panel(title, description) + $.queryPanel( - 'histogram_quantile(%.2f, sum(rate(%s[$interval])) by (job, le))' % [quantile, selector], - '{{job}} P' + quantile * 100 + 'histogram_quantile(%.2f, sum by (%s) (rate(%s[$interval])))' % [quantile, utils.joinLabels(aggregatedLabels + ['le']), selector], + dimensionsTemplate + ' P' + quantile * 100 ) + { yaxes: $.yaxes('s'), diff --git a/mixin/lib/utils.libsonnet b/mixin/lib/utils.libsonnet index 700ada95a6..f5700196ea 100644 --- a/mixin/lib/utils.libsonnet +++ b/mixin/lib/utils.libsonnet @@ -10,4 +10,6 @@ for group in super.groups ], }, + + joinLabels(labels): std.join(', ', std.filter(function(x) std.length(std.stripChars(x, ' ')) > 0, labels)), } diff --git a/mixin/rules/bucket_replicate.libsonnet b/mixin/rules/bucket_replicate.libsonnet index 14eb5c945d..a25c9f838b 100644 --- 
a/mixin/rules/bucket_replicate.libsonnet +++ b/mixin/rules/bucket_replicate.libsonnet @@ -7,8 +7,7 @@ groups+: [ { name: 'thanos-bucket-replicate.rules', - rules: [ - ], + rules: [], }, ], }, diff --git a/mixin/rules/query.libsonnet b/mixin/rules/query.libsonnet index 062af2a676..44f7d13bde 100644 --- a/mixin/rules/query.libsonnet +++ b/mixin/rules/query.libsonnet @@ -2,6 +2,7 @@ local thanos = self, query+:: { selector: error 'must provide selector for Thanos Query recording rules', + dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']), }, prometheusRules+:: { groups+: [ @@ -12,9 +13,9 @@ record: ':grpc_client_failures_per_unary:sum_rate', expr: ||| ( - sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="unary"}[5m])) + sum by (%(dimensions)s) (rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="unary"}[5m])) / - sum(rate(grpc_client_started_total{%(selector)s, grpc_type="unary"}[5m])) + sum by (%(dimensions)s) (rate(grpc_client_started_total{%(selector)s, grpc_type="unary"}[5m])) ) ||| % thanos.query, }, @@ -22,9 +23,9 @@ record: ':grpc_client_failures_per_stream:sum_rate', expr: ||| ( - sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="server_stream"}[5m])) + sum by (%(dimensions)s) (rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="server_stream"}[5m])) / - sum(rate(grpc_client_started_total{%(selector)s, grpc_type="server_stream"}[5m])) + sum by (%(dimensions)s) (rate(grpc_client_started_total{%(selector)s, grpc_type="server_stream"}[5m])) ) ||| % thanos.query, }, @@ -32,9 +33,9 @@ record: ':thanos_query_store_apis_dns_failures_per_lookup:sum_rate', expr: ||| ( - sum(rate(thanos_query_store_apis_dns_failures_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_query_store_apis_dns_failures_total{%(selector)s}[5m])) / - sum(rate(thanos_query_store_apis_dns_lookups_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_query_store_apis_dns_lookups_total{%(selector)s}[5m])) ) ||| % thanos.query, }, @@ -42,7 +43,7 @@ record: ':query_duration_seconds:histogram_quantile', expr: ||| histogram_quantile(0.99, - sum(rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m])) by (le) + sum by (%(dimensions)s, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m])) ) ||| % thanos.query, labels: { @@ -53,7 +54,7 @@ record: ':api_range_query_duration_seconds:histogram_quantile', expr: ||| histogram_quantile(0.99, - sum(rate(http_request_duration_seconds_bucket{%(selector)s, handler="query_range"}[5m])) by (le) + sum by (%(dimensions)s, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query_range"}[5m])) ) ||| % thanos.query, labels: { diff --git a/mixin/rules/receive.libsonnet b/mixin/rules/receive.libsonnet index 1017ce4fa1..fcf3b6429c 100644 --- a/mixin/rules/receive.libsonnet +++ b/mixin/rules/receive.libsonnet @@ -2,6 +2,7 @@ local thanos = self, receive+:: { selector: error 'must provide selector for Thanos Receive recording rules', + dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']), }, prometheusRules+:: { groups+: [ @@ -11,30 +12,30 @@ { record: 
':grpc_server_failures_per_unary:sum_rate', expr: ||| - sum( - rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="unary"}[5m]) + ( + sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="unary"}[5m])) / - rate(grpc_server_started_total{%(selector)s, grpc_type="unary"}[5m]) + sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s, grpc_type="unary"}[5m])) ) ||| % thanos.receive, }, { record: ':grpc_server_failures_per_stream:sum_rate', expr: ||| - sum( - rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="server_stream"}[5m]) + ( + sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="server_stream"}[5m])) / - rate(grpc_server_started_total{%(selector)s, grpc_type="server_stream"}[5m]) + sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s, grpc_type="server_stream"}[5m])) ) ||| % thanos.receive, }, { record: ':http_failure_per_request:sum_rate', expr: ||| - sum( - rate(http_requests_total{handler="receive", %(selector)s, code!~"5.."}[5m]) + ( + sum by (%(dimensions)s) (rate(http_requests_total{handler="receive", %(selector)s, code!~"5.."}[5m])) / - rate(http_requests_total{handler="receive", %(selector)s}[5m]) + sum by (%(dimensions)s) (rate(http_requests_total{handler="receive", %(selector)s}[5m])) ) ||| % thanos.receive, }, @@ -42,7 +43,7 @@ record: ':http_request_duration_seconds:histogram_quantile', expr: ||| histogram_quantile(0.99, - sum(rate(http_request_duration_seconds_bucket{handler="receive", %(selector)s}[5m])) by (le) + sum by (%(dimensions)s, le) (rate(http_request_duration_seconds_bucket{handler="receive", %(selector)s}[5m])) ) ||| % thanos.receive, labels: { @@ -53,9 +54,9 @@ record: ':thanos_receive_replication_failure_per_requests:sum_rate', expr: ||| ( - sum(rate(thanos_receive_replications_total{result="error", %(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_receive_replications_total{result="error", %(selector)s}[5m])) / - sum(rate(thanos_receive_replications_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_receive_replications_total{%(selector)s}[5m])) ) ||| % thanos.receive, }, @@ -63,9 +64,9 @@ record: ':thanos_receive_forward_failure_per_requests:sum_rate', expr: ||| ( - sum(rate(thanos_receive_forward_requests_total{result="error", %(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_receive_forward_requests_total{result="error", %(selector)s}[5m])) / - sum(rate(thanos_receive_forward_requests_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_receive_forward_requests_total{%(selector)s}[5m])) ) ||| % thanos.receive, }, @@ -73,9 +74,9 @@ record: ':thanos_receive_hashring_file_failure_per_refresh:sum_rate', expr: ||| ( - sum(rate(thanos_receive_hashrings_file_errors_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_receive_hashrings_file_errors_total{%(selector)s}[5m])) / - sum(rate(thanos_receive_hashrings_file_refreshes_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_receive_hashrings_file_refreshes_total{%(selector)s}[5m])) ) ||| % thanos.receive, }, diff --git a/mixin/rules/store.libsonnet b/mixin/rules/store.libsonnet index 14574d1ef3..249c5923cb 
100644 --- a/mixin/rules/store.libsonnet +++ b/mixin/rules/store.libsonnet @@ -2,6 +2,7 @@ local thanos = self, store+:: { selector: error 'must provide selector for Thanos Store recording rules', + dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']), }, prometheusRules+:: { groups+: [ @@ -12,9 +13,9 @@ record: ':grpc_server_failures_per_unary:sum_rate', expr: ||| ( - sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="unary"}[5m])) + sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="unary"}[5m])) / - sum(rate(grpc_server_started_total{%(selector)s, grpc_type="unary"}[5m])) + sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s, grpc_type="unary"}[5m])) ) ||| % thanos.store, }, @@ -22,9 +23,9 @@ record: ':grpc_server_failures_per_stream:sum_rate', expr: ||| ( - sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="server_stream"}[5m])) + sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="server_stream"}[5m])) / - sum(rate(grpc_server_started_total{%(selector)s, grpc_type="server_stream"}[5m])) + sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s, grpc_type="server_stream"}[5m])) ) ||| % thanos.store, }, @@ -32,9 +33,9 @@ record: ':thanos_objstore_bucket_failures_per_operation:sum_rate', expr: ||| ( - sum(rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m])) / - sum(rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m])) + sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m])) ) ||| % thanos.store, }, @@ -42,7 +43,7 @@ record: ':thanos_objstore_bucket_operation_duration_seconds:histogram_quantile', expr: ||| histogram_quantile(0.99, - sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{%(selector)s}[5m])) by (le) + sum by (%(dimensions)s, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{%(selector)s}[5m])) ) ||| % thanos.store, labels: { diff --git a/mixin/runbook.md b/mixin/runbook.md index 78da401bcd..c871810c80 100755 --- a/mixin/runbook.md +++ b/mixin/runbook.md @@ -15,85 +15,84 @@ |Name|Summary|Description|Severity|Runbook| |---|---|---|---|---| -|ThanosBucketReplicateIsDown|Thanos Replicate has disappeared from Prometheus target discovery.|Thanos Replicate has disappeared from Prometheus target discovery.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateisdown)| -|ThanosBucketReplicateErrorRate|Thanose Replicate is failing to run.|Thanos Replicate failing to run, {{ $value humanize }}% of attempts failed.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateerrorrate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateerrorrate)| -|ThanosBucketReplicateRunLatency|Thanos Replicate has a high latency for replicate operations.|Thanos Replicate {{$labels.job}} has a 99th 
percentile latency of {{ $value }} seconds for the replicate operations.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicaterunlatency](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicaterunlatency)| +|ThanosBucketReplicateErrorRate|Thanose Replicate is failing to run in .|Thanos Replicate is failing to run , {{$value humanize}}% of attempts failed.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateerrorrate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateerrorrate)| +|ThanosBucketReplicateRunLatency|Thanos Replicate has a high latency for replicate operations.|Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicaterunlatency](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicaterunlatency)| ## thanos-compact |Name|Summary|Description|Severity|Runbook| |---|---|---|---|---| -|ThanosCompactMultipleRunning|Thanos Compact has multiple instances running.|No more than one Thanos Compact instance should be running at once. There are {{ $value }}|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactmultiplerunning](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactmultiplerunning)| -|ThanosCompactHalted|Thanos Compact has failed to run ans is now halted.|Thanos Compact {{$labels.job}} has failed to run and now is halted.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthalted](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthalted)| -|ThanosCompactHighCompactionFailures|Thanos Compact is failing to execute compactions.|Thanos Compact {{$labels.job}} is failing to execute {{ $value humanize }}% of compactions.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthighcompactionfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthighcompactionfailures)| -|ThanosCompactBucketHighOperationFailures|Thanos Compact Bucket is having a high number of operation failures.|Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value humanize }}% of operations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactbuckethighoperationfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactbuckethighoperationfailures)| -|ThanosCompactHasNotRun|Thanos Compact has not uploaded anything for last 24 hours.|Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthasnotrun](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthasnotrun)| +|ThanosCompactMultipleRunning|Thanos Compact has multiple instances running.|No more than one Thanos Compact instance should be running at once. 
There are {{$value}} |warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactmultiplerunning](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactmultiplerunning)| +|ThanosCompactHalted|Thanos Compact has failed to run ans is now halted.|Thanos Compact {{$labels.job}} has failed to run and now is halted.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthalted](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthalted)| +|ThanosCompactHighCompactionFailures|Thanos Compact is failing to execute compactions.|Thanos Compact {{$labels.job}} , is failing to execute {{$value humanize}}% of compactions.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthighcompactionfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthighcompactionfailures)| +|ThanosCompactBucketHighOperationFailures|Thanos Compact Bucket is having a high number of operation failures.|Thanos Compact {{$labels.job}} , Bucket is failing to execute {{$value humanize}}% of operations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactbuckethighoperationfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactbuckethighoperationfailures)| +|ThanosCompactHasNotRun|Thanos Compact has not uploaded anything for last 24 hours.|Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthasnotrun](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthasnotrun)| ## thanos-component-absent |Name|Summary|Description|Severity|Runbook| |---|---|---|---|---| -|ThanosBucketReplicateIsDown|thanos component has disappeared from Prometheus target discovery.|ThanosBucketReplicate has disappeared from Prometheus target discovery.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateisdown)| -|ThanosCompactIsDown|thanos component has disappeared from Prometheus target discovery.|ThanosCompact has disappeared from Prometheus target discovery.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactisdown)| -|ThanosQueryIsDown|thanos component has disappeared from Prometheus target discovery.|ThanosQuery has disappeared from Prometheus target discovery.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryisdown)| -|ThanosReceiveIsDown|thanos component has disappeared from Prometheus target discovery.|ThanosReceive has disappeared from Prometheus target discovery.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveisdown)| -|ThanosRuleIsDown|thanos component has disappeared from Prometheus target discovery.|ThanosRule has disappeared from Prometheus target 
discovery.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleisdown)| -|ThanosSidecarIsDown|thanos component has disappeared from Prometheus target discovery.|ThanosSidecar has disappeared from Prometheus target discovery.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarisdown)| -|ThanosStoreIsDown|thanos component has disappeared from Prometheus target discovery.|ThanosStore has disappeared from Prometheus target discovery.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreisdown)| +|ThanosBucketReplicateIsDown|Thanos component has disappeared.|ThanosBucketReplicate has disappeared. Prometheus target for the component cannot be discovered.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateisdown)| +|ThanosCompactIsDown|Thanos component has disappeared.|ThanosCompact has disappeared. Prometheus target for the component cannot be discovered.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactisdown)| +|ThanosQueryIsDown|Thanos component has disappeared.|ThanosQuery has disappeared. Prometheus target for the component cannot be discovered.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryisdown)| +|ThanosReceiveIsDown|Thanos component has disappeared.|ThanosReceive has disappeared. Prometheus target for the component cannot be discovered.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveisdown)| +|ThanosRuleIsDown|Thanos component has disappeared.|ThanosRule has disappeared. Prometheus target for the component cannot be discovered.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleisdown)| +|ThanosSidecarIsDown|Thanos component has disappeared.|ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarisdown)| +|ThanosStoreIsDown|Thanos component has disappeared.|ThanosStore has disappeared. 
Prometheus target for the component cannot be discovered.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreisdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreisdown)| ## thanos-query |Name|Summary|Description|Severity|Runbook| |---|---|---|---|---| -|ThanosQueryHttpRequestQueryErrorRateHigh|Thanos Query is failing to handle requests.|Thanos Query {{$labels.job}} is failing to handle {{ $value humanize }}% of "query" requests.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryerrorratehigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryerrorratehigh)| -|ThanosQueryHttpRequestQueryRangeErrorRateHigh|Thanos Query is failing to handle requests.|Thanos Query {{$labels.job}} is failing to handle {{ $value humanize }}% of "query_range" requests.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryrangeerrorratehigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryrangeerrorratehigh)| -|ThanosQueryGrpcServerErrorRate|Thanos Query is failing to handle requests.|Thanos Query {{$labels.job}} is failing to handle {{ $value humanize }}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcservererrorrate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcservererrorrate)| -|ThanosQueryGrpcClientErrorRate|Thanos Query is failing to send requests.|Thanos Query {{$labels.job}} is failing to send {{ $value humanize }}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcclienterrorrate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcclienterrorrate)| -|ThanosQueryHighDNSFailures|Thanos Query is having high number of DNS failures.|Thanos Query {{$labels.job}} have {{ $value humanize }}% of failing DNS queries for store endpoints.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures)| -|ThanosQueryInstantLatencyHigh|Thanos Query has high latency for queries.|Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh)| -|ThanosQueryRangeLatencyHigh|Thanos Query has high latency for queries.|Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh)| +|ThanosQueryHttpRequestQueryErrorRateHigh|Thanos Query is failing to handle requests.|Thanos Query {{$labels.job}} is failing to handle {{$value humanize}}% of "query" 
requests.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryerrorratehigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryerrorratehigh)| +|ThanosQueryHttpRequestQueryRangeErrorRateHigh|Thanos Query is failing to handle requests.|Thanos Query {{$labels.job}} is failing to handle {{$value humanize}}% of "query_range" requests.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryrangeerrorratehigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryrangeerrorratehigh)| +|ThanosQueryGrpcServerErrorRate|Thanos Query is failing to handle requests.|Thanos Query {{$labels.job}} is failing to handle {{$value humanize}}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcservererrorrate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcservererrorrate)| +|ThanosQueryGrpcClientErrorRate|Thanos Query is failing to send requests.|Thanos Query {{$labels.job}} is failing to send {{$value humanize}}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcclienterrorrate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcclienterrorrate)| +|ThanosQueryHighDNSFailures|Thanos Query is having high number of DNS failures.|Thanos Query {{$labels.job}} have {{$value humanize}}% of failing DNS queries for store endpoints.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures)| +|ThanosQueryInstantLatencyHigh|Thanos Query has high latency for queries.|Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh)| +|ThanosQueryRangeLatencyHigh|Thanos Query has high latency for queries.|Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh)| ## thanos-receive |Name|Summary|Description|Severity|Runbook| |---|---|---|---|---| -|ThanosReceiveHttpRequestErrorRateHigh|Thanos Receive is failing to handle requests.|Thanos Receive {{$labels.job}} is failing to handle {{ $value humanize }}% of requests.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequesterrorratehigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequesterrorratehigh)| +|ThanosReceiveHttpRequestErrorRateHigh|Thanos Receive is failing to handle requests.|Thanos Receive {{$labels.job}} is failing to handle {{$value humanize}}% of requests.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequesterrorratehigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequesterrorratehigh)| 
|ThanosReceiveHttpRequestLatencyHigh|Thanos Receive has high HTTP requests latency.|Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequestlatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequestlatencyhigh)| -|ThanosReceiveHighReplicationFailures|Thanos Receive is having high number of replication failures.|Thanos Receive {{$labels.job}} is failing to replicate {{ $value humanize }}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighreplicationfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighreplicationfailures)| -|ThanosReceiveHighForwardRequestFailures|Thanos Receive is failing to forward requests.|Thanos Receive {{$labels.job}} is failing to forward {{ $value humanize }}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighforwardrequestfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighforwardrequestfailures)| -|ThanosReceiveHighHashringFileRefreshFailures|Thanos Receive is failing to refresh hasring file.|Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value humanize }} of attempts failed.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures)| +|ThanosReceiveHighReplicationFailures|Thanos Receive is having high number of replication failures.|Thanos Receive {{$labels.job}} is failing to replicate {{$value humanize}}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighreplicationfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighreplicationfailures)| +|ThanosReceiveHighForwardRequestFailures|Thanos Receive is failing to forward requests.|Thanos Receive {{$labels.job}} is failing to forward {{$value humanize}}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighforwardrequestfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighforwardrequestfailures)| +|ThanosReceiveHighHashringFileRefreshFailures|Thanos Receive is failing to refresh hasring file.|Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value humanize}} of attempts failed.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures)| |ThanosReceiveConfigReloadFailure|Thanos Receive has not been able to reload configuration.|Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure)| -|ThanosReceiveNoUpload|Thanos Receive has not uploaded latest data to object storage.|Thanos Receive {{ $labels.instance }} of 
{{$labels.job}} has not uploaded latest data to object storage.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload)| +|ThanosReceiveNoUpload|Thanos Receive has not uploaded latest data to object storage.|Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload)| ## thanos-rule |Name|Summary|Description|Severity|Runbook| |---|---|---|---|---| -|ThanosRuleQueueIsDroppingAlerts|Thanos Rule is failing to queue alerts.|Thanos Rule {{$labels.job}} is failing to queue alerts.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeueisdroppingalerts](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeueisdroppingalerts)| -|ThanosRuleSenderIsFailingAlerts|Thanos Rule is failing to send alerts to alertmanager.|Thanos Rule {{$labels.job}} is failing to send alerts to alertmanager.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulesenderisfailingalerts](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulesenderisfailingalerts)| -|ThanosRuleHighRuleEvaluationFailures|Thanos Rule is failing to evaluate rules.|Thanos Rule {{$labels.job}} is failing to evaluate rules.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationfailures)| -|ThanosRuleHighRuleEvaluationWarnings|Thanos Rule has high number of evaluation warnings.|Thanos Rule {{$labels.job}} has high number of evaluation warnings.|info|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationwarnings](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationwarnings)| -|ThanosRuleRuleEvaluationLatencyHigh|Thanos Rule has high rule evaluation latency.|Thanos Rule {{$labels.job}}/{{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleruleevaluationlatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleruleevaluationlatencyhigh)| -|ThanosRuleGrpcErrorRate|Thanos Rule is failing to handle grpc requests.|Thanos Rule {{$labels.job}} is failing to handle {{ $value humanize }}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulegrpcerrorrate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulegrpcerrorrate)| +|ThanosRuleQueueIsDroppingAlerts|Thanos Rule is failing to queue alerts.|Thanos Rule {{$labels.instance}} is failing to queue alerts.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeueisdroppingalerts](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeueisdroppingalerts)| +|ThanosRuleSenderIsFailingAlerts|Thanos Rule is failing to send alerts to alertmanager.|Thanos Rule {{$labels.instance}} is failing to send alerts to 
alertmanager.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulesenderisfailingalerts](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulesenderisfailingalerts)| +|ThanosRuleHighRuleEvaluationFailures|Thanos Rule is failing to evaluate rules.|Thanos Rule {{$labels.instance}} is failing to evaluate rules.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationfailures)| +|ThanosRuleHighRuleEvaluationWarnings|Thanos Rule has high number of evaluation warnings.|Thanos Rule {{$labels.instance}} has high number of evaluation warnings.|info|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationwarnings](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationwarnings)| +|ThanosRuleRuleEvaluationLatencyHigh|Thanos Rule has high rule evaluation latency.|Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleruleevaluationlatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleruleevaluationlatencyhigh)| +|ThanosRuleGrpcErrorRate|Thanos Rule is failing to handle grpc requests.|Thanos Rule {{$labels.job}} is failing to handle {{$value humanize}}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulegrpcerrorrate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulegrpcerrorrate)| |ThanosRuleConfigReloadFailure|Thanos Rule has not been able to reload configuration.|Thanos Rule {{$labels.job}} has not been able to reload its configuration.|info|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleconfigreloadfailure](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleconfigreloadfailure)| -|ThanosRuleQueryHighDNSFailures|Thanos Rule is having high number of DNS failures.|Thanos Rule {{$labels.job}} has {{ $value humanize }}% of failing DNS queries for query endpoints.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeryhighdnsfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeryhighdnsfailures)| -|ThanosRuleAlertmanagerHighDNSFailures|Thanos Rule is having high number of DNS failures.|Thanos Rule {{$labels.job}} has {{ $value humanize }}% of failing DNS queries for Alertmanager endpoints.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulealertmanagerhighdnsfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulealertmanagerhighdnsfailures)| -|ThanosRuleNoEvaluationFor10Intervals|Thanos Rule has rule groups that did not evaluate for 10 intervals.|Thanos Rule {{$labels.job}} has {{ $value humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.|info|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulenoevaluationfor10intervals](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulenoevaluationfor10intervals)| -|ThanosNoRuleEvaluations|Thanos Rule did not perform any rule 
evaluations.|Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosnoruleevaluations](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosnoruleevaluations)| +|ThanosRuleQueryHighDNSFailures|Thanos Rule is having high number of DNS failures.|Thanos Rule {{$labels.job}} has {{$value humanize}}% of failing DNS queries for query endpoints.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeryhighdnsfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeryhighdnsfailures)| +|ThanosRuleAlertmanagerHighDNSFailures|Thanos Rule is having high number of DNS failures.|Thanos Rule {{$labels.instance}} has {{$value humanize}}% of failing DNS queries for Alertmanager endpoints.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulealertmanagerhighdnsfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulealertmanagerhighdnsfailures)| +|ThanosRuleNoEvaluationFor10Intervals|Thanos Rule has rule groups that did not evaluate for 10 intervals.|Thanos Rule {{$labels.job}} has {{$value humanize}}% rule groups that did not evaluate for at least 10x of their expected interval.|info|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulenoevaluationfor10intervals](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulenoevaluationfor10intervals)| +|ThanosNoRuleEvaluations|Thanos Rule did not perform any rule evaluations.|Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 2 minutes.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosnoruleevaluations](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosnoruleevaluations)| ## thanos-sidecar |Name|Summary|Description|Severity|Runbook| |---|---|---|---|---| -|ThanosSidecarPrometheusDown|Thanos Sidecar cannot connect to Prometheus|Thanos Sidecar {{$labels.job}} {{$labels.instance}} cannot connect to Prometheus.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown)| -|ThanosSidecarBucketOperationsFailed|Thanos Sidecar bucket operations are failing|Thanos Sidecar {{$labels.job}} {{$labels.instance}} bucket operations are failing|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed)| -|ThanosSidecarUnhealthy|Thanos Sidecar is unhealthy.|Thanos Sidecar {{$labels.job}} {{$labels.instance}} is unhealthy for more than {{$value}} seconds.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy)| +|ThanosSidecarPrometheusDown|Thanos Sidecar cannot connect to Prometheus|Thanos Sidecar {{$labels.instance}} cannot connect to 
Prometheus.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown)| +|ThanosSidecarBucketOperationsFailed|Thanos Sidecar bucket operations are failing|Thanos Sidecar {{$labels.instance}} bucket operations are failing|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed)| +|ThanosSidecarUnhealthy|Thanos Sidecar is unhealthy.|Thanos Sidecar {{$labels.instance}} is unhealthy for {{$value}} seconds.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy)| ## thanos-store |Name|Summary|Description|Severity|Runbook| |---|---|---|---|---| -|ThanosStoreGrpcErrorRate|Thanos Store is failing to handle qrpcd requests.|Thanos Store {{$labels.job}} is failing to handle {{ $value humanize }}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate)| -|ThanosStoreSeriesGateLatencyHigh|Thanos Store has high latency for store series gate requests.|Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh)| -|ThanosStoreBucketHighOperationFailures|Thanos Store Bucket is failing to execute operations.|Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value humanize }}% of operations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures)| -|ThanosStoreObjstoreOperationLatencyHigh|Thanos Store is having high latency for bucket operations.|Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh)| +|ThanosStoreGrpcErrorRate|Thanos Store is failing to handle qrpcd requests.|Thanos Store {{$labels.job}} is failing to handle {{$value humanize}}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate)| +|ThanosStoreSeriesGateLatencyHigh|Thanos Store has high latency for store series gate requests.|Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh)| 
+|ThanosStoreBucketHighOperationFailures|Thanos Store Bucket is failing to execute operations.|Thanos Store {{$labels.job}} Bucket is failing to execute {{$value humanize}}% of operations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures)| +|ThanosStoreObjstoreOperationLatencyHigh|Thanos Store is having high latency for bucket operations.|Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh)| diff --git a/pkg/rules/rules_test.go b/pkg/rules/rules_test.go index ada7bf8ebf..0f0c99499a 100644 --- a/pkg/rules/rules_test.go +++ b/pkg/rules/rules_test.go @@ -36,7 +36,7 @@ func testRulesAgainstExamples(t *testing.T, dir string, server rulespb.RulesServ { Name: "thanos-bucket-replicate", File: filepath.Join(dir, "alerts.yaml"), - Rules: []*rulespb.Rule{someAlert, someAlert, someAlert}, + Rules: []*rulespb.Rule{someAlert, someAlert}, Interval: 60, PartialResponseStrategy: storepb.PartialResponseStrategy_ABORT, },
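
To make the multi-cluster refactor above easier to follow, the sketches below are illustrative only and are not part of the diff; every selector value, target-group name, and label value in them is hypothetical.

The `joinLabels` helper added to `mixin/lib/utils.libsonnet` is the piece most of the reworked panels lean on: it drops blank entries so that an empty selector or dimensions string never leaves a dangling comma behind. A minimal sketch of its behaviour:

```jsonnet
// Sketch only (not part of the diff): the joinLabels helper from
// mixin/lib/utils.libsonnet, exercised with example label matchers.
local utils = {
  joinLabels(labels): std.join(', ', std.filter(function(x) std.length(std.stripChars(x, ' ')) > 0, labels)),
};

{
  // -> 'job=~"thanos-store.*", grpc_type="unary"'
  selector: utils.joinLabels(['job=~"thanos-store.*"', 'grpc_type="unary"']),
  // -> 'le' (the empty dimensions entry is filtered out)
  grouping: utils.joinLabels(['', 'le']),
}
```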
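
The `dimensions` field added to `mixin/rules/query.libsonnet` (and its receive/store counterparts) is built from the configured target groups plus `job`, and is then spliced into every `sum by (...)` in the recording rules. A hedged sketch of how it is expected to expand, assuming a hypothetical `targetGroups` value:

```jsonnet
// Sketch only: how the new 'dimensions' field composes with a selector.
// 'targetGroups' and its value are assumptions for the example; a real
// deployment supplies them via its mixin config.
local targetGroups = { namespace: 'observability' };  // hypothetical value
local query = {
  selector: 'job=~"thanos-query.*"',
  dimensions: std.join(', ', std.objectFields(targetGroups) + ['job']),
};

{
  // -> 'sum by (namespace, job) (rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="unary"}[5m]))'
  expr: 'sum by (%(dimensions)s) (rate(grpc_client_started_total{%(selector)s, grpc_type="unary"}[5m]))' % query,
}
```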
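
The reworked dashboard panels in `mixin/lib/thanos-grafana-builder` derive their Grafana legends from the same comma-separated `dimensions` string. A small sketch of that legend-template logic, using an example `dimensions` value:

```jsonnet
// Sketch only: the legend-template construction shared by the reworked
// panels (e.g. the new latencyPanel(metricName, selector, dimensions)).
local dimensions = 'namespace, job';  // example value
local aggregatedLabels = std.split(dimensions, ',');
local dimensionsTemplate = std.join(' ', ['{{%s}}' % std.stripChars(label, ' ') for label in aggregatedLabels]);

{
  // -> 'p99 {{namespace}} {{job}}', which Grafana fills in per series
  legendFormat: 'p99 ' + dimensionsTemplate,
}
```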