Add aggregated graphs to master dashboard

kubernetes · Apr 23, 2020 · 49998fe · 49998fe
1 parent a82fef5
commit 49998fe
Show file tree

Hide file tree

Showing 2 changed files with 521 additions and 63 deletions.
diff --git a/clusterloader2/pkg/prometheus/manifests/dashboards/master-dashboard.dashboard.py b/clusterloader2/pkg/prometheus/manifests/dashboards/master-dashboard.dashboard.py
@@ -14,25 +14,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections import namedtuple
 from grafanalib import core as g
 import defaults as d
 
 
-def api_call_latency(title, verb, scope, threshold):
+def api_call_latency(title, verb, scope, threshold, expression):
     return d.Graph(
         title=title,
         targets=[
             g.Target(expr=str(threshold), legendFormat="threshold"),
             g.Target(
-                expr=d.one_line(
-                    """
-apiserver:apiserver_request_latency_1m:histogram_quantile{
-  quantile="0.99",
-  verb=~"%(verb)s",
-  scope=~"%(scope)s",
-  resource=~"${resource:regex}s*",
-}"""
-                    % {"verb": verb, "scope": scope}
+                expr=d.one_line(expression % {"verb": verb, "scope": scope}
                 ),
                 # TODO(github.com/grafana/grafana/issues/19410): uncomment once fixed
                 # legendFormat="{{verb}} {{scope}}/{{resource}}",
@@ -41,34 +34,52 @@ def api_call_latency(title, verb, scope, threshold):
         yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
     )
 
+# Arguments for a single API call latency panel. The query expression is not
+# part of this record; it is passed to api_call_latency separately.
+ApiCallLatencyPanelArgs = namedtuple('ApiCallLatencyPanelArgs', ['title', 'verb', 'scope', 'threshold'])
 
-CLUSTERLOADER_PANELS = [
-    api_call_latency(
+# Panel descriptions (title, verb, scope, threshold) with no query attached.
+# Each entry is turned into a graph by api_call_latency, once per expression.
+# NOTE(review): the first entry's title says scope=resource while its scope is
+# "namespace" -- confirm which one is intended.
+API_CALL_LATENCY_PANELS_TEMPLATE = [
+    ApiCallLatencyPanelArgs(
         title="Read-only API call latency (percentaile=99, scope=resource, threshold=1s)",
         verb="GET",
         scope="namespace",
-        threshold=1,
+        threshold=1
     ),
-    api_call_latency(
+    ApiCallLatencyPanelArgs(
         title="Read-only API call latency (percentaile=99, scope=namespace, threshold=5s)",
         verb="LIST",
         scope="namespace",
         threshold=5,
     ),
-    api_call_latency(
+    ApiCallLatencyPanelArgs(
         title="Read-only API call latency (percentaile=99, scope=cluster, threshold=30s)",
         verb="LIST",
         scope="cluster",
         threshold=30,
     ),
-    api_call_latency(
+    ApiCallLatencyPanelArgs(
         title="Mutating API call latency (threshold=1s)",
         verb=d.any_of("CREATE", "DELETE", "PATCH", "POST", "PUT"),
         scope=d.any_of("namespace", "cluster"),
         threshold=1,
     ),
 ]
 
+# One graph per entry of API_CALL_LATENCY_PANELS_TEMPLATE, plotting the
+# quantile="0.99" latency series directly. The %(verb)s and %(scope)s
+# placeholders are filled in by api_call_latency.
+API_CALL_LATENCY_PANELS = [api_call_latency(title=n.title, verb=n.verb, scope=n.scope, threshold=n.threshold, expression="""
+apiserver:apiserver_request_latency_1m:histogram_quantile{
+  quantile="0.99",
+  verb=~"%(verb)s",
+  scope=~"%(scope)s",
+  resource=~"${resource:regex}s*",
+}""") for n in API_CALL_LATENCY_PANELS_TEMPLATE]
+
+# Same template entries as the non-aggregated panels, but the selector is
+# wrapped in quantile_over_time(0.99, ...[5d]), so each graph shows the 0.99
+# quantile of the series' values over a 5-day range. The %(verb)s and
+# %(scope)s placeholders are filled in by api_call_latency.
+QUANTILE_API_CALL_LATENCY_PANELS = [api_call_latency(title=n.title, verb=n.verb, scope=n.scope, threshold=n.threshold, expression="""
+quantile_over_time(0.99,
+apiserver:apiserver_request_latency_1m:histogram_quantile{
+  quantile="0.99",
+  verb=~"%(verb)s",
+  scope=~"%(scope)s",
+  resource=~"${resource:regex}s*",
+}[5d])""") for n in API_CALL_LATENCY_PANELS_TEMPLATE]
+
 HEALTH_PANELS = [
     d.simple_graph(
         "Unhealthy nodes",
@@ -485,7 +496,8 @@ def api_call_latency(title, verb, scope, threshold):
     title="Master dashboard",
     refresh="",
     rows=[
-        d.Row(title="Clusterloader", panels=CLUSTERLOADER_PANELS),
+        d.Row(title="API call latency", panels=API_CALL_LATENCY_PANELS),
+        d.Row(title="API call latency aggregated with quantile", panels=QUANTILE_API_CALL_LATENCY_PANELS, collapse=True),
         d.Row(title="Overall cluster health", panels=HEALTH_PANELS, collapse=True),
         d.Row(title="etcd", panels=ETCD_PANELS, collapse=True),
         d.Row(title="kube-apiserver", panels=APISERVER_PANELS, collapse=True),