Skip to content

Commit

Permalink
Introduce agent-flow-mixin (#2014)
Browse files Browse the repository at this point in the history
This introduces an agent-flow-mixin for users to visualize and alert on
the behavior of Grafana Agent Flow.

This first commit introduces a controller dashboard that can be used for
monitoring Flow controller information.
  • Loading branch information
rfratto committed Aug 9, 2022
1 parent 84c3c3d commit 064a2a3
Show file tree
Hide file tree
Showing 7 changed files with 464 additions and 0 deletions.
4 changes: 4 additions & 0 deletions operations/agent-flow-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Adds the Flow controller dashboard to the mixin. `+:` extends any
// grafanaDashboards field already present on the object this is mixed into.
{
  grafanaDashboards+: import './dashboards/controller.libsonnet',
}
246 changes: 246 additions & 0 deletions operations/agent-flow-mixin/dashboards/controller.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
local dashboard = import './utils/dashboard.jsonnet';
local panel = import './utils/panel.jsonnet';
local filename = 'agent-flow-controller.json';

{
[filename]:
// Template variables, chained so that the namespace choices are limited to
// the cluster currently selected.
local templateVariables = [
  dashboard.newTemplateVariable(
    name='cluster',
    query=|||
      label_values(agent_component_controller_running_components_total, cluster)
    |||,
  ),
  dashboard.newTemplateVariable(
    name='namespace',
    query=|||
      label_values(agent_component_controller_running_components_total{cluster="$cluster"}, namespace)
    |||,
  ),
];

// Dashboard shell; the UID is the MD5 of the output filename.
dashboard.new(name='Grafana Agent Flow / Controller') +
dashboard.withUID(std.md5(filename)) +
dashboard.withTemplateVariablesMixin(templateVariables) +
dashboard.withPanelsMixin([
// Running agents
(
  // Single stat: counts the agent_component_controller_evaluating series for
  // the selected cluster and namespace.
  local position = { x: 0, y: 0, w: 10, h: 4 };
  local agentCount = panel.newQuery(
    expr='count(agent_component_controller_evaluating{cluster="$cluster", namespace="$namespace"})',
  );

  panel.newSingleStat('Running agents') +
  panel.withUnit('agents') +
  panel.withDescription(|||
    The number of Grafana Agent Flow instances whose metrics are being sent and reported.
  |||) +
  panel.withPosition(position) +
  panel.withQueries([agentCount])
),

// Running components
(
  // Single stat: sums the running-components metric over every series in the
  // selected cluster and namespace.
  local position = { x: 0, y: 4, w: 10, h: 4 };
  local componentCount = panel.newQuery(
    expr='sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace"})',
  );

  panel.newSingleStat('Running components') +
  panel.withUnit('components') +
  panel.withDescription(|||
    The number of running components across all running agents.
  |||) +
  panel.withPosition(position) +
  panel.withQueries([componentCount])
),

// Overall component health
(
  // Stat built by panel.newGraphedSingleStat showing healthy components as a
  // fraction of all components. fieldConfig pins the value range to 0..1
  // (displayed with the 'percentunit' unit) and sets the text shown when the
  // query returns no data.
  panel.newGraphedSingleStat('Overall component health') {
    fieldConfig: {
      defaults: {
        min: 0,
        max: 1,
        noValue: 'No components',
      },
    },
  } +
  panel.withUnit('percentunit') +
  panel.withDescription(|||
    The percentage of components which are in a healthy state.
  |||) +
  panel.withPosition({ x: 0, y: 8, w: 10, h: 4 }) +
  panel.withQueries([
    panel.newQuery(
      // The numerator falls back to vector(0), matching the per-state queries
      // of the "Components by health" panel. Without it, a cluster where no
      // series has health_type="healthy" yields an empty result and the panel
      // shows "No components" instead of 0%. The denominator is left
      // unguarded on purpose: with no components at all the result stays
      // empty and the noValue text applies.
      expr=|||
        (sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace",health_type="healthy"}) or vector(0)) /
        sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace"})
      |||,
    ),
  ])
),

// Components by health
(
  // Bar gauge with one bar per health state.

  // Base thresholds: every bar is green unless an override says otherwise.
  local greenStep = { color: 'green', value: null };

  // Field override that turns the bar named `series` to `color` once its
  // value reaches 1.
  local alertAt1(series, color) = {
    matcher: { id: 'byName', options: series },
    properties: [{
      id: 'thresholds',
      value: {
        mode: 'absolute',
        steps: [
          greenStep,
          { color: color, value: 1 },
        ],
      },
    }],
  };

  // Legend label and health_type label value for each bar, in display order.
  local healthStates = [
    { legend: 'Healthy', healthType: 'healthy' },
    { legend: 'Unhealthy', healthType: 'unhealthy' },
    { legend: 'Unknown', healthType: 'unknown' },
    { legend: 'Exited', healthType: 'exited' },
  ];

  // Instant query per state; `or vector(0)` yields 0 when no series carries
  // that health_type, so every bar is always drawn.
  local queryTemplate = 'sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace", health_type="%s"}) or vector(0)';

  panel.new(title='Components by health', type='bargauge') {
    options: {
      orientation: 'vertical',
      showUnfilled: true,
    },
    fieldConfig: {
      defaults: {
        min: 0,
        thresholds: {
          mode: 'absolute',
          steps: [greenStep],
        },
      },
      overrides: [
        alertAt1('Unhealthy', 'red'),
        alertAt1('Unknown', 'blue'),
        alertAt1('Exited', 'orange'),
      ],
    },
  } +
  panel.withDescription(|||
    Breakdown of components by health across all running agents.
    * Healthy: components have been evaluated completely and are reporting themselves as healthy.
    * Unhealthy: Components either could not be evaluated or are reporting themselves as unhealthy.
    * Unknown: A component has been created but has not yet been started.
    * Exited: A component has exited. It will not return to the running state.
    More information on a component's health state can be retrieved at
    the /debug/config?debug=1 HTTP endpoint of the Grafana Agent process.
    Note that components may be in a degraded state even if they report
    themselves as healthy. Use component-specific dashboards and alerts
    to observe detailed information about the behavior of a component.
  |||) +
  panel.withPosition({ x: 10, y: 0, w: 14, h: 12 }) +
  panel.withQueries([
    panel.newInstantQuery(
      legendFormat=state.legend,
      expr=queryTemplate % state.healthType,
    )
    for state in healthStates
  ])
),

// Graph evaluation rate
(
  // Time series of evaluations per second, one series per instance, drawn as
  // individual points rather than connected lines.
  local pointStyle = {
    fieldConfig: {
      defaults: {
        custom: {
          drawStyle: 'points',
          pointSize: 3,
        },
      },
    },
  };
  local evaluationRate = panel.newQuery(
    expr='sum by (instance) (rate(agent_component_evaluation_seconds_count{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))',
  );

  panel.new(title='Graph evaluation rate', type='timeseries') + pointStyle +
  panel.withUnit('ops') +
  panel.withDescription(|||
    The frequency in which the component graph gets updated.
  |||) +
  panel.withPosition({ x: 0, y: 12, w: 8, h: 10 }) +
  panel.withMultiTooltip() +
  panel.withQueries([evaluationRate])
),

// Graph evaluation time
(
  // Time series of evaluation latency: 99th percentile, median and mean.

  // Query for one quantile of the evaluation-duration histogram. `quantile`
  // is passed as a string so it is substituted into the expression verbatim.
  local quantileQuery(quantile, legend) = panel.newQuery(
    expr='histogram_quantile(%s, sum by (le) (rate(agent_component_evaluation_seconds_bucket{cluster="$cluster",namespace="$namespace"}[$__rate_interval])))' % quantile,
    legendFormat=legend,
  );

  // Mean duration: rate of total seconds divided by rate of evaluations.
  local averageQuery = panel.newQuery(
    expr=|||
      sum(rate(agent_component_evaluation_seconds_sum{cluster="$cluster",namespace="$namespace"}[$__rate_interval])) /
      sum(rate(agent_component_evaluation_seconds_count{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))
    |||,
    legendFormat='Average',
  );

  panel.new(title='Graph evaluation time', type='timeseries') +
  panel.withUnit('s') +
  panel.withDescription(|||
    The percentiles for how long it takes to complete a graph evaluation.
    Graph evaluations must complete for components to have the latest
    arguments. The longer graph evaluations take, the slower it will be to
    reconcile the state of components.
    If evaluation is taking too long, consider sharding your components to
    deal with smaller amounts of data and reuse data as much as possible.
  |||) +
  panel.withPosition({ x: 8, y: 12, w: 8, h: 10 }) +
  panel.withQueries([
    quantileQuery('0.99', '99th percentile'),
    quantileQuery('0.50', '50th percentile'),
    averageQuery,
  ])
),

// Graph evaluation histogram
(
  // Heatmap of evaluation durations: per-bucket increase over the rate
  // interval, with each series labelled by its `le` bucket bound.
  local position = { x: 16, y: 12, w: 8, h: 10 };
  local bucketIncrease = panel.newQuery(
    expr='sum by (le) (increase(agent_component_evaluation_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))',
    format='heatmap',
    legendFormat='{{le}}',
  );

  panel.newHeatmap('Graph evaluation histogram') +
  panel.withDescription(|||
    Detailed histogram view of how long graph evaluations take.
    The goal is to design your config so that evaluations take as little
    time as possible; under 100ms is a good goal.
  |||) +
  panel.withPosition(position) +
  panel.withQueries([bucketIncrease])
),
]),
}
75 changes: 75 additions & 0 deletions operations/agent-flow-mixin/dashboards/utils/dashboard.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// dashboard.jsonnet defines utilities to create dashboards using the
// schemaVersion present in Grafana 9.

// Auto-refresh choices offered by the dashboard's time picker.
local refreshIntervals = ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'];

// Relative time ranges offered by the dashboard's time picker.
local timeOptions = ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d', '90d'];

// Variable of type 'datasource' restricted to 'prometheus'; it is always the
// first entry in a new dashboard's templating list.
local datasourceVariable = {
  name: 'datasource',
  label: 'Data Source',
  type: 'datasource',
  query: 'prometheus',
  refresh: 1,
  sort: 2,
};

{
  // new returns a base dashboard titled `name`: UTC timezone, 10s refresh,
  // a default window of the last hour, and the datasource variable.
  new(name=''):: {
    title: name,
    timezone: 'utc',
    refresh: '10s',
    schemaVersion: 36,
    tags: ['grafana-agent-flow-mixin'],
    templating: { list: [datasourceVariable] },
    time: { from: 'now-1h', to: 'now' },
    timepicker: {
      refresh_intervals: refreshIntervals,
      time_options: timeOptions,
    },
  },

  // withUID returns a mixin that sets the dashboard's uid.
  withUID(uid):: {
    uid: uid,
  },

  // withTemplateVariablesMixin returns a mixin that appends `vars` to the
  // dashboard's existing template variable list.
  withTemplateVariablesMixin(vars):: {
    templating+: { list+: vars },
  },

  // newTemplateVariable returns a query-type variable called `name`, used as
  // both its name and label, whose values come from running `query` against
  // the data source chosen through the ${datasource} variable.
  newTemplateVariable(name, query):: {
    name: name,
    label: name,
    type: 'query',
    query: { query: query, refId: name },
    datasource: '${datasource}',
    refresh: 2,
    sort: 2,
  },

  // withPanelsMixin returns a mixin that appends `panels` to the dashboard's
  // panel list.
  withPanelsMixin(panels):: {
    panels+: panels,
  },
}
Loading

0 comments on commit 064a2a3

Please sign in to comment.