Introduce agent-flow-mixin (#2014)

This introduces an agent-flow-mixin for users to visualize and alert on the behavior of Grafana Agent Flow. This first commit introduces a controller dashboard that can be used for monitoring Flow controller information.
grafana · Aug 9, 2022 · 064a2a3 · 064a2a3
1 parent 84c3c3d
commit 064a2a3
Show file tree

Hide file tree

Showing 7 changed files with 464 additions and 0 deletions.
diff --git a/operations/agent-flow-mixin/dashboards.libsonnet b/operations/agent-flow-mixin/dashboards.libsonnet
@@ -0,0 +1,4 @@
+// Mixin entry point for dashboards: merges the controller dashboard object
+// into the grafanaDashboards field.
+{
+  grafanaDashboards+: import './dashboards/controller.libsonnet',
+}
diff --git a/operations/agent-flow-mixin/dashboards/controller.libsonnet b/operations/agent-flow-mixin/dashboards/controller.libsonnet
@@ -0,0 +1,246 @@
+local dashboard = import './utils/dashboard.jsonnet';
+local panel = import './utils/panel.jsonnet';
+local filename = 'agent-flow-controller.json';
+
+{
+  [filename]:
+    dashboard.new(name='Grafana Agent Flow / Controller') +
+    dashboard.withUID(std.md5(filename)) +
+    // Dashboard-level template variables. 'cluster' is populated from the
+    // cluster label values of agent_component_controller_running_components_total;
+    // 'namespace' is populated from the namespace label values of the same
+    // metric, restricted to the currently selected $cluster.
+    dashboard.withTemplateVariablesMixin([
+      dashboard.newTemplateVariable('cluster', |||
+        label_values(agent_component_controller_running_components_total, cluster)
+      |||),
+      dashboard.newTemplateVariable('namespace', |||
+        label_values(agent_component_controller_running_components_total{cluster="$cluster"}, namespace)
+      |||),
+    ]) +
+    dashboard.withPanelsMixin([
+      // Running agents: single-stat panel showing the count of
+      // agent_component_controller_evaluating series for the selected cluster
+      // and namespace. Placed at x=0, y=0 with w=10, h=4.
+      (
+        panel.newSingleStat('Running agents') +
+        panel.withUnit('agents') +
+        panel.withDescription(|||
+          The number of Grafana Agent Flow instances whose metrics are being sent and reported.
+        |||) +
+        panel.withPosition({ x: 0, y: 0, w: 10, h: 4 }) +
+        panel.withQueries([
+          panel.newQuery(
+            expr='count(agent_component_controller_evaluating{cluster="$cluster", namespace="$namespace"})',
+          ),
+        ])
+      ),
+
+      // Running components: single-stat panel showing the sum of
+      // agent_component_controller_running_components_total for the selected
+      // cluster and namespace. Placed directly below the "Running agents"
+      // panel (x=0, y=4, w=10, h=4).
+      (
+        panel.newSingleStat('Running components') +
+        panel.withUnit('components') +
+        panel.withDescription(|||
+          The number of running components across all running agents.
+        |||) +
+        panel.withPosition({ x: 0, y: 4, w: 10, h: 4 }) +
+        panel.withQueries([
+          panel.newQuery(
+            expr='sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace"})',
+          ),
+        ])
+      ),
+
+      // Overall component health: graphed single-stat panel showing the ratio
+      // of components with health_type="healthy" to all components for the
+      // selected cluster and namespace, displayed with the 'percentunit' unit
+      // on a 0..1 scale. Placed at x=0, y=8, w=10, h=4.
+      (
+        // NOTE(review): fieldConfig is set here without `+:`, so it replaces
+        // (rather than merges with) any fieldConfig produced by
+        // panel.newGraphedSingleStat — confirm that this is intended.
+        panel.newGraphedSingleStat('Overall component health') {
+          fieldConfig: {
+            defaults: {
+              min: 0,
+              max: 1,
+              // Text shown when the query yields no value.
+              noValue: 'No components',
+            },
+          },
+        } +
+        panel.withUnit('percentunit') +
+        panel.withDescription(|||
+          The percentage of components which are in a healthy state.
+        |||) +
+        panel.withPosition({ x: 0, y: 8, w: 10, h: 4 }) +
+        panel.withQueries([
+          panel.newQuery(
+            expr=|||
+              sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace",health_type="healthy"}) /
+              sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace"})
+            |||,
+          ),
+        ])
+      ),
+
+      // Components by health: vertical bar gauge with one bar per health type
+      // (Healthy, Unhealthy, Unknown, Exited), each fed by its own instant
+      // query. Placed to the right of the single-stat panels
+      // (x=10, y=0, w=14, h=12).
+      (
+        panel.new(title='Components by health', type='bargauge') {
+          options: {
+            orientation: 'vertical',
+            showUnfilled: true,
+          },
+          fieldConfig: {
+            defaults: {
+              min: 0,
+              // Default threshold: bars are green at every value.
+              thresholds: {
+                mode: 'absolute',
+                steps: [{ color: 'green', value: null }],
+              },
+            },
+            // Per-series overrides, matched by the series name (the
+            // legendFormat of the queries below): each bar switches from green
+            // to its own color once its value reaches 1.
+            overrides: [
+              {
+                matcher: { id: 'byName', options: 'Unhealthy' },
+                properties: [{
+                  id: 'thresholds',
+                  value: {
+                    mode: 'absolute',
+                    steps: [
+                      { color: 'green', value: null },
+                      { color: 'red', value: 1 },
+                    ],
+                  },
+                }],
+              },
+              {
+                matcher: { id: 'byName', options: 'Unknown' },
+                properties: [{
+                  id: 'thresholds',
+                  value: {
+                    mode: 'absolute',
+                    steps: [
+                      { color: 'green', value: null },
+                      { color: 'blue', value: 1 },
+                    ],
+                  },
+                }],
+              },
+              {
+                matcher: { id: 'byName', options: 'Exited' },
+                properties: [{
+                  id: 'thresholds',
+                  value: {
+                    mode: 'absolute',
+                    steps: [
+                      { color: 'green', value: null },
+                      { color: 'orange', value: 1 },
+                    ],
+                  },
+                }],
+              },
+            ],
+          },
+        } +
+        panel.withDescription(|||
+          Breakdown of components by health across all running agents.
+
+          * Healthy: components have been evaluated completely and are reporting themselves as healthy.
+          * Unhealthy: Components either could not be evaluated or are reporting themselves as unhealthy.
+          * Unknown: A component has been created but has not yet been started.
+          * Exited: A component has exited. It will not return to the running state.
+
+          More information on a component's health state can be retrieved at
+          the /debug/config?debug=1 HTTP endpoint of the Grafana Agent process.
+
+          Note that components may be in a degraded state even if they report
+          themselves as healthy. Use component-specific dashboards and alerts
+          to observe detailed information about the behavior of a component.
+        |||) +
+        panel.withPosition({ x: 10, y: 0, w: 14, h: 12 }) +
+        // One instant query per health_type. `or vector(0)` makes each query
+        // return 0 instead of an empty result when no series match.
+        panel.withQueries([
+          panel.newInstantQuery(
+            legendFormat='Healthy',
+            expr='sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace", health_type="healthy"}) or vector(0)',
+          ),
+          panel.newInstantQuery(
+            legendFormat='Unhealthy',
+            expr='sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace", health_type="unhealthy"}) or vector(0)',
+          ),
+          panel.newInstantQuery(
+            legendFormat='Unknown',
+            expr='sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace", health_type="unknown"}) or vector(0)',
+          ),
+          panel.newInstantQuery(
+            legendFormat='Exited',
+            expr='sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace", health_type="exited"}) or vector(0)',
+          ),
+        ])
+      ),
+
+      // Graph evaluation rate: time series panel plotting, per instance, the
+      // per-second rate of agent_component_evaluation_seconds_count over
+      // $__rate_interval for the selected cluster and namespace. Placed at
+      // x=0, y=12, w=8, h=10.
+      (
+        panel.new(title='Graph evaluation rate', type='timeseries') {
+          fieldConfig: {
+            defaults: {
+              custom: {
+                // Render samples as points of size 3.
+                drawStyle: 'points',
+                pointSize: 3,
+              },
+            },
+          },
+        } +
+        panel.withUnit('ops') +
+        panel.withDescription(|||
+          The frequency at which the component graph gets updated.
+        |||) +
+        panel.withPosition({ x: 0, y: 12, w: 8, h: 10 }) +
+        panel.withMultiTooltip() +
+        panel.withQueries([
+          panel.newQuery(
+            expr='sum by (instance) (rate(agent_component_evaluation_seconds_count{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))',
+          ),
+        ])
+      ),
+
+      // Graph evaluation time: time series panel (unit: seconds) with three
+      // series derived from the agent_component_evaluation_seconds histogram
+      // for the selected cluster and namespace: the 99th percentile, the 50th
+      // percentile, and the average. Placed at x=8, y=12, w=8, h=10.
+      (
+        panel.new(title='Graph evaluation time', type='timeseries') +
+        panel.withUnit('s') +
+        panel.withDescription(|||
+          The percentiles for how long it takes to complete a graph evaluation.
+
+          Graph evaluations must complete for components to have the latest
+          arguments. The longer graph evaluations take, the slower it will be to
+          reconcile the state of components.
+
+          If evaluation is taking too long, consider sharding your components to
+          deal with smaller amounts of data and reuse data as much as possible.
+        |||) +
+        panel.withPosition({ x: 8, y: 12, w: 8, h: 10 }) +
+        panel.withQueries([
+          panel.newQuery(
+            expr='histogram_quantile(0.99, sum by (le) (rate(agent_component_evaluation_seconds_bucket{cluster="$cluster",namespace="$namespace"}[$__rate_interval])))',
+            legendFormat='99th percentile',
+          ),
+          panel.newQuery(
+            expr='histogram_quantile(0.50, sum by (le) (rate(agent_component_evaluation_seconds_bucket{cluster="$cluster",namespace="$namespace"}[$__rate_interval])))',
+            legendFormat='50th percentile',
+          ),
+          // Average = rate of total evaluation seconds divided by rate of
+          // evaluation count.
+          panel.newQuery(
+            expr=|||
+              sum(rate(agent_component_evaluation_seconds_sum{cluster="$cluster",namespace="$namespace"}[$__rate_interval])) /
+              sum(rate(agent_component_evaluation_seconds_count{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))
+            |||,
+            legendFormat='Average',
+          ),
+        ])
+      ),
+
+      // Graph evaluation histogram: heatmap panel fed by the per-bucket (le)
+      // increase of agent_component_evaluation_seconds_bucket over
+      // $__rate_interval for the selected cluster and namespace. Placed at
+      // x=16, y=12, w=8, h=10.
+      (
+        panel.newHeatmap('Graph evaluation histogram') +
+        panel.withDescription(|||
+          Detailed histogram view of how long graph evaluations take.
+
+          The goal is to design your config so that evaluations take as little
+          time as possible; under 100ms is a good goal.
+        |||) +
+        panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) +
+        panel.withQueries([
+          panel.newQuery(
+            expr='sum by (le) (increase(agent_component_evaluation_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))',
+            // The query result is requested in heatmap format, with one
+            // series per bucket boundary.
+            format='heatmap',
+            legendFormat='{{le}}',
+          ),
+        ])
+      ),
+    ]),
+}
diff --git a/operations/agent-flow-mixin/dashboards/utils/dashboard.jsonnet b/operations/agent-flow-mixin/dashboards/utils/dashboard.jsonnet
@@ -0,0 +1,75 @@
+// dashboard.jsonnet defines utilities to create dashboards using the
+// schemaVersion present in Grafana 9.
+
+{
+  // new returns the base dashboard object. `name` becomes the dashboard
+  // title. The result uses UTC, a 10s refresh, a default time range of the
+  // last hour, and a single 'datasource' template variable that queries for
+  // 'prometheus' data sources.
+  new(name=''):: {
+    title: name,
+    tags: ['grafana-agent-flow-mixin'],
+    schemaVersion: 36,
+    timezone: 'utc',
+    refresh: '10s',
+    time: { from: 'now-1h', to: 'now' },
+    timepicker: {
+      refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
+      time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d', '90d'],
+    },
+    templating: {
+      list: [
+        {
+          type: 'datasource',
+          name: 'datasource',
+          label: 'Data Source',
+          query: 'prometheus',
+          refresh: 1,
+          sort: 2,
+        },
+      ],
+    },
+  },
+
+  // withUID returns a mixin object that sets the dashboard's uid field.
+  withUID(uid):: {
+    uid: uid,
+  },
+
+  // withTemplateVariablesMixin returns a mixin object that appends `vars` to
+  // the existing templating.list of the dashboard it is added to.
+  withTemplateVariablesMixin(vars):: { templating+: { list+: vars } },
+
+  // newTemplateVariable returns a 'query'-type template variable. `name` is
+  // used as the variable name, its label, and the query's refId; `query` is
+  // the query text. The variable reads from the '${datasource}' data source.
+  newTemplateVariable(name, query):: {
+    type: 'query',
+    name: name,
+    label: name,
+    datasource: '${datasource}',
+    query: { refId: name, query: query },
+    refresh: 2,
+    sort: 2,
+  },
+
+  // withPanelsMixin returns a mixin object that appends `panels` to the
+  // panels list of the dashboard it is added to.
+  withPanelsMixin(panels):: {
+    panels+: panels,
+  },
+}