executor: add diagnosis rule to check some metrics exceed thresholds #14801

Merged: 13 commits, Feb 18, 2020
118 changes: 118 additions & 0 deletions executor/diagnostics.go
@@ -79,13 +79,17 @@ type (
// criticalErrorInspection is used to check whether some critical errors
// have occurred in the past
criticalErrorInspection struct{ inspectionName }

// thresholdCheckInspection is used to check whether some metrics exceed their thresholds, such as CPU usage or leader count change.
thresholdCheckInspection struct{ inspectionName }
)

var inspectionRules = []inspectionRule{
&configInspection{inspectionName: "config"},
&versionInspection{inspectionName: "version"},
&currentLoadInspection{inspectionName: "current-load"},
&criticalErrorInspection{inspectionName: "critical-error"},
&thresholdCheckInspection{inspectionName: "threshold-check"},
}
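
For orientation, here is a minimal, self-contained sketch of the registry pattern used above: each rule embeds its name, exposes an inspect method, and the executor simply walks the slice. The types below are simplified stand-ins for illustration only, not the real TiDB definitions (which take a context, a session context, and a filter, as seen in the inspect signature later in this diff).

package main

import "fmt"

// Simplified stand-ins for the real TiDB types.
type inspectionResult struct{ item, actual, expected string }

type inspectionRule interface {
	name() string
	inspect() []inspectionResult
}

type inspectionName string

func (n inspectionName) name() string { return string(n) }

type thresholdCheckInspection struct{ inspectionName }

func (thresholdCheckInspection) inspect() []inspectionResult {
	// The real rule issues SQL against metric_schema / inspection_schema;
	// here we just return a canned result.
	return []inspectionResult{{item: "grpc_cpu", actual: "7.21", expected: "< 7.20"}}
}

func main() {
	rules := []inspectionRule{&thresholdCheckInspection{inspectionName: "threshold-check"}}
	for _, r := range rules {
		fmt.Println(r.name(), r.inspect())
	}
}
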

type inspectionRetriever struct {
@@ -363,3 +367,117 @@ func (criticalErrorInspection) inspect(ctx context.Context, sctx sessionctx.Cont
}
return results
}

func (thresholdCheckInspection) inspect(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var rules = []struct {
item string
component string
configKey string
threshold float64
}{
{
item: "coprocessor_normal_cpu",
component: "cop_normal%",
configKey: "readpool.coprocessor.normal-concurrency",
threshold: 0.9,
},
{
item: "coprocessor_high_cpu",
component: "cop_high%",
configKey: "readpool.coprocessor.high-concurrency",
threshold: 0.9,
},
{
item: "coprocessor_low_cpu",
component: "cop_low%",
configKey: "readpool.coprocessor.low-concurrency",
threshold: 0.9,
},
{
item: "grpc_cpu",
component: "grpc%",
configKey: "server.grpc-concurrency",
threshold: 0.9,
},
{
item: "raftstore_cpu",
component: "raftstore_%",
configKey: "raftstore.store-pool-size",
threshold: 0.8,
},
{
item: "apply_cpu",
component: "apply_%",
configKey: "raftstore.apply-pool-size",
threshold: 0.8,
},
{
item: "storage_readpool_normal_cpu",
component: "store_read_norm%",
configKey: "readpool.storage.normal-concurrency",
threshold: 0.9,
},
{
item: "storage_readpool_high_cpu",
component: "store_read_high%",
configKey: "readpool.storage.high-concurrency",
threshold: 0.9,
},
{
item: "storage_readpool_low_cpu",
component: "store_read_low%",
configKey: "readpool.storage.low-concurrency",
threshold: 0.9,
},
{
item: "scheduler_worker_cpu",
component: "sched_%",
configKey: "storage.scheduler-worker-pool-size",
threshold: 0.85,
},
{
item: "split_check_cpu",
component: "split_check",
threshold: 0.9,
},
}
var results []inspectionResult
for _, rule := range rules {
var sql string
if len(rule.configKey) > 0 {
sql = fmt.Sprintf("select t1.instance, t1.cpu, t2.threshold, t2.value from "+
"(select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like '%[1]s' and time=now() group by instance) as t1,"+
"(select value * %[2]f as threshold, value from inspection_schema.cluster_config where type='tikv' and `key` = '%[3]s' limit 1) as t2 "+
"where t1.cpu > t2.threshold;", rule.component, rule.threshold, rule.configKey)
} else {
sql = fmt.Sprintf("select t1.instance, t1.cpu, %[2]f from "+
"(select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like '%[1]s' and time=now() group by instance) as t1 "+
"where t1.cpu > %[2]f;", rule.component, rule.threshold)
}
rows, _, err := sctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQLWithContext(ctx, sql)
if err != nil {
sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("execute '%s' failed: %v", sql, err))
continue
}
for _, row := range rows {
actual := fmt.Sprintf("%.2f", row.GetFloat64(1))
expected := ""
if len(rule.configKey) > 0 {
expected = fmt.Sprintf("< %.2f, config: %v=%v", row.GetFloat64(2), rule.configKey, row.GetString(3))
} else {
expected = fmt.Sprintf("< %.2f", row.GetFloat64(2))
}
detail := fmt.Sprintf("select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like '%[1]s' and time=now() group by instance", rule.component)
result := inspectionResult{
tp: "tikv",
instance: row.GetString(0),
item: rule.item,
actual: actual,
expected: expected,
severity: "warning",
detail: detail,
}
results = append(results, result)
}
}
return results
}
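
For reference, a minimal standalone sketch that prints the SQL the config-keyed branch generates for the grpc_cpu rule; the format string is taken directly from the Sprintf above, with only the three rule fields hard-coded here.

package main

import "fmt"

func main() {
	// Render the config-keyed threshold SQL for the grpc_cpu rule
	// (pattern "grpc%", ratio 0.9, config key "server.grpc-concurrency").
	component, threshold, configKey := "grpc%", 0.9, "server.grpc-concurrency"
	sql := fmt.Sprintf("select t1.instance, t1.cpu, t2.threshold, t2.value from "+
		"(select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like '%[1]s' and time=now() group by instance) as t1,"+
		"(select value * %[2]f as threshold, value from inspection_schema.cluster_config where type='tikv' and `key` = '%[3]s' limit 1) as t2 "+
		"where t1.cpu > t2.threshold;", component, threshold, configKey)
	fmt.Println(sql)
	// With server.grpc-concurrency = 8 the computed threshold is 8 * 0.9 = 7.2
	// CPU cores, so any instance whose summed grpc thread CPU exceeds 7.2 is
	// reported as a warning.
}
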
119 changes: 114 additions & 5 deletions executor/diagnostics_test.go
@@ -15,13 +15,13 @@ package executor_test

import (
"context"

. "github.com/pingcap/check"
"github.com/pingcap/failpoint"
"github.com/pingcap/parser/mysql"
"github.com/pingcap/tidb/domain"
"github.com/pingcap/tidb/infoschema"
"github.com/pingcap/tidb/kv"
"github.com/pingcap/tidb/session"
"github.com/pingcap/tidb/sessionctx/variable"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/testkit"
@@ -166,17 +166,126 @@ func (s *diagnosticsSuite) TestInspectionResult(c *C) {
}
}

func (s *diagnosticsSuite) parseTime(c *C, se session.Session, str string) types.Time {
t, err := types.ParseTime(se.GetSessionVars().StmtCtx, str, mysql.TypeDatetime, types.MaxFsp)
c.Assert(err, IsNil)
return t
}

func (s *diagnosticsSuite) TestThresholdCheckInspection(c *C) {
tk := testkit.NewTestKitWithInit(c, s.store)
// mock tikv configuration.
configurations := map[string]variable.TableSnapshot{}
configurations[infoschema.TableClusterConfig] = variable.TableSnapshot{
Rows: [][]types.Datum{
types.MakeDatums("tikv", "tikv-0", "raftstore.apply-pool-size", "2"),
types.MakeDatums("tikv", "tikv-0", "raftstore.store-pool-size", "2"),
types.MakeDatums("tikv", "tikv-0", "readpool.coprocessor.high-concurrency", "4"),
types.MakeDatums("tikv", "tikv-0", "readpool.coprocessor.low-concurrency", "4"),
types.MakeDatums("tikv", "tikv-0", "readpool.coprocessor.normal-concurrency", "4"),
types.MakeDatums("tikv", "tikv-0", "readpool.storage.high-concurrency", "4"),
types.MakeDatums("tikv", "tikv-0", "readpool.storage.low-concurrency", "4"),
types.MakeDatums("tikv", "tikv-0", "readpool.storage.normal-concurrency", "4"),
types.MakeDatums("tikv", "tikv-0", "server.grpc-concurrency", "8"),
types.MakeDatums("tikv", "tikv-0", "storage.scheduler-worker-pool-size", "6"),
},
}
datetime := func(str string) types.Time {
return s.parseTime(c, tk.Se, str)
}
// construct some mock abnormal data
mockData := map[string][][]types.Datum{
// columns: time, instance, name, value
"tikv_thread_cpu": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_normal0", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_normal1", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_high1", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_low1", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "grpc_1", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "raftstore_1", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "apply_0", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_norm1", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_high2", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_low0", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "sched_2", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "split_check", 10.0),
},
}

fpName := "github.com/pingcap/tidb/executor/mockMergeMockInspectionTables"
c.Assert(failpoint.Enable(fpName, "return"), IsNil)
defer func() { c.Assert(failpoint.Disable(fpName), IsNil) }()

// Mock for metric table data.
fpName2 := "github.com/pingcap/tidb/executor/mockMetricsTableData"
c.Assert(failpoint.Enable(fpName2, "return"), IsNil)
defer func() { c.Assert(failpoint.Disable(fpName2), IsNil) }()

ctx := context.WithValue(context.Background(), "__mockInspectionTables", configurations)
ctx = context.WithValue(ctx, "__mockMetricsTableData", mockData)
ctx = failpoint.WithHook(ctx, func(_ context.Context, fpname2 string) bool {
return fpName2 == fpname2 || fpname2 == fpName
})

rs, err := tk.Se.Execute(ctx, "select item, type, instance, value, reference, details from information_schema.inspection_result where rule='threshold-check' order by item")
c.Assert(err, IsNil)
result := tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("execute inspect SQL failed"))
c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Equals, uint16(0), Commentf("unexpected warnings: %+v", tk.Se.GetSessionVars().StmtCtx.GetWarnings()))
result.Check(testkit.Rows(
"apply_cpu tikv tikv-0 10.00 < 1.60, config: raftstore.apply-pool-size=2 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'apply_%' and time=now() group by instance",
"coprocessor_high_cpu tikv tikv-0 10.00 < 3.60, config: readpool.coprocessor.high-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'cop_high%' and time=now() group by instance",
"coprocessor_low_cpu tikv tikv-0 10.00 < 3.60, config: readpool.coprocessor.low-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'cop_low%' and time=now() group by instance",
"coprocessor_normal_cpu tikv tikv-0 20.00 < 3.60, config: readpool.coprocessor.normal-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'cop_normal%' and time=now() group by instance",
"grpc_cpu tikv tikv-0 10.00 < 7.20, config: server.grpc-concurrency=8 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'grpc%' and time=now() group by instance",
"raftstore_cpu tikv tikv-0 10.00 < 1.60, config: raftstore.store-pool-size=2 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'raftstore_%' and time=now() group by instance",
"scheduler_worker_cpu tikv tikv-0 10.00 < 5.10, config: storage.scheduler-worker-pool-size=6 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'sched_%' and time=now() group by instance",
"split_check_cpu tikv tikv-0 10.00 < 0.00 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'split_check' and time=now() group by instance",
"storage_readpool_high_cpu tikv tikv-0 10.00 < 3.60, config: readpool.storage.high-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'store_read_high%' and time=now() group by instance",
"storage_readpool_low_cpu tikv tikv-0 10.00 < 3.60, config: readpool.storage.low-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'store_read_low%' and time=now() group by instance",
"storage_readpool_normal_cpu tikv tikv-0 10.00 < 3.60, config: readpool.storage.normal-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'store_read_norm%' and time=now() group by instance",
))

// construct some mock normal data
mockData = map[string][][]types.Datum{
// columns: time, instance, name, value
"tikv_thread_cpu": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_normal0", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_high1", 0.1),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_low1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "grpc_1", 7.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "grpc_2", 0.21),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "raftstore_1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "apply_0", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_norm1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_high2", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_low0", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "sched_2", 0.3),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "split_check", 0.5),
},
}

ctx = context.WithValue(context.Background(), "__mockInspectionTables", configurations)
ctx = context.WithValue(ctx, "__mockMetricsTableData", mockData)
ctx = failpoint.WithHook(ctx, func(_ context.Context, fpname2 string) bool {
return fpName2 == fpname2 || fpname2 == fpName
})
rs, err = tk.Se.Execute(ctx, "select item, type, instance, value, reference, details from information_schema.inspection_result where rule='threshold-check' order by item")
c.Assert(err, IsNil)
result = tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("execute inspect SQL failed"))
c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Equals, uint16(0), Commentf("unexpected warnings: %+v", tk.Se.GetSessionVars().StmtCtx.GetWarnings()))
result.Check(testkit.Rows(
"grpc_cpu tikv tikv-0 7.21 < 7.20, config: server.grpc-concurrency=8 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'grpc%' and time=now() group by instance"))
}
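
Only the grpc_cpu row is expected from the second check because every other mocked value stays under its threshold. A quick standalone check of the one exceeding case, with values copied from the mocked config and metric data above:

package main

import "fmt"

func main() {
	// server.grpc-concurrency = 8 and the rule uses a 0.9 ratio, so the
	// threshold is 7.20 cores; the mocked grpc_1 and grpc_2 threads sum to
	// 7.0 + 0.21 = 7.21 cores, which still exceeds it.
	threshold := 8.0 * 0.9
	observed := 7.0 + 0.21
	fmt.Printf("threshold=%.2f observed=%.2f exceeded=%v\n", threshold, observed, observed > threshold)
}
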

func (s *diagnosticsSuite) TestCriticalErrorInspection(c *C) {
tk := testkit.NewTestKitWithInit(c, s.store)

fpName := "github.com/pingcap/tidb/executor/mockMetricsTableData"
c.Assert(failpoint.Enable(fpName, "return"), IsNil)
defer func() { c.Assert(failpoint.Disable(fpName), IsNil) }()

datetime := func(s string) types.Time {
t, err := types.ParseTime(tk.Se.GetSessionVars().StmtCtx, s, mysql.TypeDatetime, types.MaxFsp)
c.Assert(err, IsNil)
return t
datetime := func(str string) types.Time {
return s.parseTime(c, tk.Se, str)
}

// construct some mock data
9 changes: 6 additions & 3 deletions infoschema/metric_table_def.go
@@ -354,17 +354,20 @@ var MetricTableMap = map[string]MetricTableDef{
Quantile: 0.999,
},
"pd_tso_wait_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"wait\"}[$RANGE_DURATION])) by (le))",
PromQL: "histogram_quantile($QUANTILE, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"wait\"}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.999,
Comment: "The quantile duration of a client starting to wait for the TS until received the TS result.",
},
"pd_tso_rpc_duration": {
Comment: "The quantile duration of a client sending TSO request until received the response.",
PromQL: "histogram_quantile($QUANTILE, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[$RANGE_DURATION])) by (le))",
PromQL: "histogram_quantile($QUANTILE, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.999,
},
"pd_start_tso_wait_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_pdclient_ts_future_wait_seconds_bucket[$RANGE_DURATION])) by (le))",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_pdclient_ts_future_wait_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.999,
Comment: "The quantile duration of the waiting time for getting the start timestamp oracle",
},
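
The PromQL changes above add instance to the by clause and declare an instance label, so the quantiles can be broken down per instance instead of cluster-wide. Below is a hedged sketch of how the $QUANTILE and $RANGE_DURATION placeholders might be expanded; the actual substitution logic lives in TiDB's metric-schema code and may differ.

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Illustrative only: a hypothetical expansion of the template for
	// pd_tso_wait_duration, showing the per-instance grouping.
	tmpl := `histogram_quantile($QUANTILE, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type="wait"}[$RANGE_DURATION])) by (le,instance))`
	promQL := strings.NewReplacer("$QUANTILE", "0.999", "$RANGE_DURATION", "1m").Replace(tmpl)
	fmt.Println(promQL)
}
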