Skip to content

Commit

Permalink
Add aggregated graphs to master dashboard
Browse files Browse the repository at this point in the history
  • Loading branch information
jprzychodzen committed Apr 23, 2020
1 parent a82fef5 commit 49998fe
Show file tree
Hide file tree
Showing 2 changed files with 521 additions and 63 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import namedtuple
from grafanalib import core as g
import defaults as d


def api_call_latency(title, verb, scope, threshold):
def api_call_latency(title, verb, scope, threshold, expression):
return d.Graph(
title=title,
targets=[
g.Target(expr=str(threshold), legendFormat="threshold"),
g.Target(
expr=d.one_line(
"""
apiserver:apiserver_request_latency_1m:histogram_quantile{
quantile="0.99",
verb=~"%(verb)s",
scope=~"%(scope)s",
resource=~"${resource:regex}s*",
}"""
% {"verb": verb, "scope": scope}
expr=d.one_line(expression % {"verb": verb, "scope": scope}
),
# TODO(github.com/grafana/grafana/issues/19410): uncomment once fixed
# legendFormat="{{verb}} {{scope}}/{{resource}}",
Expand All @@ -41,34 +34,52 @@ def api_call_latency(title, verb, scope, threshold):
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
)

ApiCallLatencyPanelArgs = namedtuple('ApiCallLatencyPanelArgs', ['title', 'verb', 'scope', 'threshold'])

CLUSTERLOADER_PANELS = [
api_call_latency(
API_CALL_LATENCY_PANELS_TEMPLATE = [
ApiCallLatencyPanelArgs(
title="Read-only API call latency (percentaile=99, scope=resource, threshold=1s)",
verb="GET",
scope="namespace",
threshold=1,
threshold=1
),
api_call_latency(
ApiCallLatencyPanelArgs(
title="Read-only API call latency (percentaile=99, scope=namespace, threshold=5s)",
verb="LIST",
scope="namespace",
threshold=5,
),
api_call_latency(
ApiCallLatencyPanelArgs(
title="Read-only API call latency (percentaile=99, scope=cluster, threshold=30s)",
verb="LIST",
scope="cluster",
threshold=30,
),
api_call_latency(
ApiCallLatencyPanelArgs(
title="Mutating API call latency (threshold=1s)",
verb=d.any_of("CREATE", "DELETE", "PATCH", "POST", "PUT"),
scope=d.any_of("namespace", "cluster"),
threshold=1,
),
]

# One latency graph per entry of API_CALL_LATENCY_PANELS_TEMPLATE, plotting
# the quantile="0.99" series of the
# apiserver:apiserver_request_latency_1m:histogram_quantile metric directly.
# The %(verb)s / %(scope)s placeholders are filled in by api_call_latency;
# ${resource:regex} is left as-is in the expression for the dashboard.
API_CALL_LATENCY_PANELS = [
    api_call_latency(
        title=panel.title,
        verb=panel.verb,
        scope=panel.scope,
        threshold=panel.threshold,
        expression="""
apiserver:apiserver_request_latency_1m:histogram_quantile{
quantile="0.99",
verb=~"%(verb)s",
scope=~"%(scope)s",
resource=~"${resource:regex}s*",
}""",
    )
    for panel in API_CALL_LATENCY_PANELS_TEMPLATE
]

# Same panels as the template describes, but each expression wraps the
# quantile="0.99" latency series in quantile_over_time(0.99, ...[5d]), i.e.
# the series is aggregated over a 5-day range instead of being plotted raw.
# The %(verb)s / %(scope)s placeholders are filled in by api_call_latency.
QUANTILE_API_CALL_LATENCY_PANELS = [
    api_call_latency(
        title=panel.title,
        verb=panel.verb,
        scope=panel.scope,
        threshold=panel.threshold,
        expression="""
quantile_over_time(0.99,
apiserver:apiserver_request_latency_1m:histogram_quantile{
quantile="0.99",
verb=~"%(verb)s",
scope=~"%(scope)s",
resource=~"${resource:regex}s*",
}[5d])""",
    )
    for panel in API_CALL_LATENCY_PANELS_TEMPLATE
]

HEALTH_PANELS = [
d.simple_graph(
"Unhealthy nodes",
Expand Down Expand Up @@ -485,7 +496,8 @@ def api_call_latency(title, verb, scope, threshold):
title="Master dashboard",
refresh="",
rows=[
d.Row(title="Clusterloader", panels=CLUSTERLOADER_PANELS),
d.Row(title="API call latency", panels=API_CALL_LATENCY_PANELS),
d.Row(title="API call latency aggregated with quantile", panels=QUANTILE_API_CALL_LATENCY_PANELS, collapse=True),
d.Row(title="Overall cluster health", panels=HEALTH_PANELS, collapse=True),
d.Row(title="etcd", panels=ETCD_PANELS, collapse=True),
d.Row(title="kube-apiserver", panels=APISERVER_PANELS, collapse=True),
Expand Down
Loading

0 comments on commit 49998fe

Please sign in to comment.