-
Notifications
You must be signed in to change notification settings - Fork 486
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This introduces an agent-flow-mixin for users to visualize and alert on the behavior of Grafana Agent Flow. This first commit introduces a controller dashboard that can be used for monitoring Flow controller information.
- Loading branch information
Showing
7 changed files
with
464 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
// Mixin entry point: merges the dashboards produced by
// dashboards/controller.libsonnet into the grafanaDashboards field.
{
  grafanaDashboards+: import './dashboards/controller.libsonnet',
}
246 changes: 246 additions & 0 deletions
246
operations/agent-flow-mixin/dashboards/controller.libsonnet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,246 @@ | ||
// Builds the "Grafana Agent Flow / Controller" dashboard. The result is an
// object with a single field, keyed by the dashboard's file name, whose value
// is the dashboard definition assembled from the dashboard/panel helpers.
local dashboard = import './utils/dashboard.jsonnet';
local panel = import './utils/panel.jsonnet';
local filename = 'agent-flow-controller.json';

{
  [filename]:
    dashboard.new(name='Grafana Agent Flow / Controller') +
    // The UID is derived from the file name, so it stays the same for as long
    // as the file name does.
    dashboard.withUID(std.md5(filename)) +
    // Every query below is scoped by $cluster and $namespace; the namespace
    // choices are narrowed by the selected cluster.
    dashboard.withTemplateVariablesMixin([
      dashboard.newTemplateVariable('cluster', |||
        label_values(agent_component_controller_running_components_total, cluster)
      |||),
      dashboard.newTemplateVariable('namespace', |||
        label_values(agent_component_controller_running_components_total{cluster="$cluster"}, namespace)
      |||),
    ]) +
    // Layout: the first row (y 0-11) has three stacked stats on the left
    // (w=10) next to one tall bar gauge on the right (w=14). The second row
    // (y=12) has three equally wide panels (w=8 each).
    dashboard.withPanelsMixin([
      // Running agents
      (
        panel.newSingleStat('Running agents') +
        panel.withUnit('agents') +
        panel.withDescription(|||
          The number of Grafana Agent Flow instances whose metrics are being sent and reported.
        |||) +
        panel.withPosition({ x: 0, y: 0, w: 10, h: 4 }) +
        panel.withQueries([
          panel.newQuery(
            // Counts the series of this metric that match the selection.
            expr='count(agent_component_controller_evaluating{cluster="$cluster", namespace="$namespace"})',
          ),
        ])
      ),

      // Running components
      (
        panel.newSingleStat('Running components') +
        panel.withUnit('components') +
        panel.withDescription(|||
          The number of running components across all running agents.
        |||) +
        panel.withPosition({ x: 0, y: 4, w: 10, h: 4 }) +
        panel.withQueries([
          panel.newQuery(
            expr='sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace"})',
          ),
        ])
      ),

      // Overall component health
      (
        panel.newGraphedSingleStat('Overall component health') {
          fieldConfig: {
            defaults: {
              min: 0,
              max: 1,
              // Shown when the query returns no data at all.
              noValue: 'No components',
            },
          },
        } +
        panel.withUnit('percentunit') +
        panel.withDescription(|||
          The percentage of components which are in a healthy state.
        |||) +
        panel.withPosition({ x: 0, y: 8, w: 10, h: 4 }) +
        panel.withQueries([
          panel.newQuery(
            // The healthy sum falls back to 0, matching the per-health
            // queries in "Components by health". Without the fallback, a
            // selection with components but no healthy ones returns no data
            // and the panel shows "No components" instead of 0%.
            // NOTE(review): assumes a health_type series can be absent when
            // its count is zero -- confirm against the exporter.
            // The denominator is left unguarded on purpose so that a
            // selection with no components still yields noValue.
            expr=|||
              (sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace",health_type="healthy"}) or vector(0)) /
              sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace"})
            |||,
          ),
        ])
      ),

      // Components by health
      (
        panel.new(title='Components by health', type='bargauge') {
          options: {
            orientation: 'vertical',
            showUnfilled: true,
          },
          fieldConfig: {
            defaults: {
              min: 0,
              // Default colouring: always green.
              thresholds: {
                mode: 'absolute',
                steps: [{ color: 'green', value: null }],
              },
            },
            // The non-healthy series (matched by legend name) switch away
            // from green as soon as their value reaches 1.
            overrides: [
              {
                matcher: { id: 'byName', options: 'Unhealthy' },
                properties: [{
                  id: 'thresholds',
                  value: {
                    mode: 'absolute',
                    steps: [
                      { color: 'green', value: null },
                      { color: 'red', value: 1 },
                    ],
                  },
                }],
              },
              {
                matcher: { id: 'byName', options: 'Unknown' },
                properties: [{
                  id: 'thresholds',
                  value: {
                    mode: 'absolute',
                    steps: [
                      { color: 'green', value: null },
                      { color: 'blue', value: 1 },
                    ],
                  },
                }],
              },
              {
                matcher: { id: 'byName', options: 'Exited' },
                properties: [{
                  id: 'thresholds',
                  value: {
                    mode: 'absolute',
                    steps: [
                      { color: 'green', value: null },
                      { color: 'orange', value: 1 },
                    ],
                  },
                }],
              },
            ],
          },
        } +
        panel.withDescription(|||
          Breakdown of components by health across all running agents.
          * Healthy: components have been evaluated completely and are reporting themselves as healthy.
          * Unhealthy: Components either could not be evaluated or are reporting themselves as unhealthy.
          * Unknown: A component has been created but has not yet been started.
          * Exited: A component has exited. It will not return to the running state.
          More information on a component's health state can be retrieved at
          the /debug/config?debug=1 HTTP endpoint of the Grafana Agent process.
          Note that components may be in a degraded state even if they report
          themselves as healthy. Use component-specific dashboards and alerts
          to observe detailed information about the behavior of a component.
        |||) +
        panel.withPosition({ x: 10, y: 0, w: 14, h: 12 }) +
        // One instant query per health type. `or vector(0)` makes each query
        // return 0 rather than nothing when no series match. The legend names
        // must stay in sync with the byName overrides above.
        panel.withQueries([
          panel.newInstantQuery(
            legendFormat='Healthy',
            expr='sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace", health_type="healthy"}) or vector(0)',
          ),
          panel.newInstantQuery(
            legendFormat='Unhealthy',
            expr='sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace", health_type="unhealthy"}) or vector(0)',
          ),
          panel.newInstantQuery(
            legendFormat='Unknown',
            expr='sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace", health_type="unknown"}) or vector(0)',
          ),
          panel.newInstantQuery(
            legendFormat='Exited',
            expr='sum(agent_component_controller_running_components_total{cluster="$cluster", namespace="$namespace", health_type="exited"}) or vector(0)',
          ),
        ])
      ),

      // Graph evaluation rate
      (
        panel.new(title='Graph evaluation rate', type='timeseries') {
          fieldConfig: {
            defaults: {
              custom: {
                drawStyle: 'points',
                pointSize: 3,
              },
            },
          },
        } +
        panel.withUnit('ops') +
        panel.withDescription(|||
          The frequency in which the component graph gets updated.
        |||) +
        panel.withPosition({ x: 0, y: 12, w: 8, h: 10 }) +
        panel.withMultiTooltip() +
        panel.withQueries([
          panel.newQuery(
            // One series per instance.
            expr='sum by (instance) (rate(agent_component_evaluation_seconds_count{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))',
          ),
        ])
      ),

      // Graph evaluation time
      (
        panel.new(title='Graph evaluation time', type='timeseries') +
        panel.withUnit('s') +
        panel.withDescription(|||
          The percentiles for how long it takes to complete a graph evaluation.
          Graph evaluations must complete for components to have the latest
          arguments. The longer graph evaluations take, the slower it will be to
          reconcile the state of components.
          If evaluation is taking too long, consider sharding your components to
          deal with smaller amounts of data and reuse data as much as possible.
        |||) +
        panel.withPosition({ x: 8, y: 12, w: 8, h: 10 }) +
        panel.withQueries([
          panel.newQuery(
            expr='histogram_quantile(0.99, sum by (le) (rate(agent_component_evaluation_seconds_bucket{cluster="$cluster",namespace="$namespace"}[$__rate_interval])))',
            legendFormat='99th percentile',
          ),
          panel.newQuery(
            expr='histogram_quantile(0.50, sum by (le) (rate(agent_component_evaluation_seconds_bucket{cluster="$cluster",namespace="$namespace"}[$__rate_interval])))',
            legendFormat='50th percentile',
          ),
          panel.newQuery(
            // Average = rate of total seconds / rate of evaluation count.
            expr=|||
              sum(rate(agent_component_evaluation_seconds_sum{cluster="$cluster",namespace="$namespace"}[$__rate_interval])) /
              sum(rate(agent_component_evaluation_seconds_count{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))
            |||,
            legendFormat='Average',
          ),
        ])
      ),

      // Graph evaluation histogram
      (
        panel.newHeatmap('Graph evaluation histogram') +
        panel.withDescription(|||
          Detailed histogram view of how long graph evaluations take.
          The goal is to design your config so that evaluations take as little
          time as possible; under 100ms is a good goal.
        |||) +
        panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) +
        panel.withQueries([
          panel.newQuery(
            // Per-bucket increases, returned in heatmap format and labelled
            // by bucket boundary.
            expr='sum by (le) (increase(agent_component_evaluation_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))',
            format='heatmap',
            legendFormat='{{le}}',
          ),
        ])
      ),
    ]),
}
75 changes: 75 additions & 0 deletions
75
operations/agent-flow-mixin/dashboards/utils/dashboard.jsonnet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
// dashboard.jsonnet: constructors and mixins for assembling dashboard objects
// that use the schemaVersion present in Grafana 9.

// Auto-refresh choices placed in the dashboard's timepicker.
local refreshIntervals = ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'];

// Time range choices placed in the dashboard's timepicker.
local timeOptions = ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d', '90d'];

// Template variable present on every dashboard created by new(). Query
// variables created by newTemplateVariable() refer to it as '${datasource}'.
local datasourceVariable = {
  name: 'datasource',
  label: 'Data Source',
  type: 'datasource',
  query: 'prometheus',
  refresh: 1,
  sort: 2,
};

{
  // new returns a base dashboard titled `name`: UTC timezone, 10s refresh,
  // a default range of the last hour, and the datasource variable as the
  // only template variable.
  new(name=''):: {
    title: name,
    tags: ['grafana-agent-flow-mixin'],
    schemaVersion: 36,
    timezone: 'utc',
    refresh: '10s',
    time: { from: 'now-1h', to: 'now' },
    timepicker: {
      refresh_intervals: refreshIntervals,
      time_options: timeOptions,
    },
    templating: {
      list: [datasourceVariable],
    },
  },

  // withUID returns a mixin that sets the dashboard's uid.
  withUID(uid):: {
    uid: uid,
  },

  // withTemplateVariablesMixin returns a mixin that appends `vars` to the
  // dashboard's existing template variable list.
  withTemplateVariablesMixin(vars):: {
    templating+: { list+: vars },
  },

  // newTemplateVariable returns a query-type template variable. `name` is
  // used as the variable's name, its label and the refId of its query;
  // `query` is the query text itself.
  newTemplateVariable(name, query):: {
    type: 'query',
    name: name,
    label: name,
    datasource: '${datasource}',
    query: {
      refId: name,
      query: query,
    },
    refresh: 2,
    sort: 2,
  },

  // withPanelsMixin returns a mixin that appends `panels` to the dashboard's
  // panel list.
  withPanelsMixin(panels):: {
    panels+: panels,
  },
}
Oops, something went wrong.