diff --git a/executor/diagnostics.go b/executor/diagnostics.go
index 11adbeb60fb06..b292fa020cb76 100644
--- a/executor/diagnostics.go
+++ b/executor/diagnostics.go
@@ -79,6 +79,9 @@ type (
 	// criticalErrorInspection is used to check are there some critical errors
 	// occurred in the past
 	criticalErrorInspection struct{ inspectionName }
+
+	// thresholdCheckInspection is used to check whether some values exceed their thresholds, like CPU usage and leader count change.
+	thresholdCheckInspection struct{ inspectionName }
 )

 var inspectionRules = []inspectionRule{
@@ -86,6 +89,7 @@ var inspectionRules = []inspectionRule{
 	&versionInspection{inspectionName: "version"},
 	&currentLoadInspection{inspectionName: "current-load"},
 	&criticalErrorInspection{inspectionName: "critical-error"},
+	&thresholdCheckInspection{inspectionName: "threshold-check"},
 }

 type inspectionRetriever struct {
@@ -363,3 +367,117 @@ func (criticalErrorInspection) inspect(ctx context.Context, sctx sessionctx.Cont
 	}
 	return results
 }
+
+func (thresholdCheckInspection) inspect(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
+	var rules = []struct {
+		item      string
+		component string
+		configKey string
+		threshold float64
+	}{
+		{
+			item:      "coprocessor_normal_cpu",
+			component: "cop_normal%",
+			configKey: "readpool.coprocessor.normal-concurrency",
+			threshold: 0.9},
+		{
+			item:      "coprocessor_high_cpu",
+			component: "cop_high%",
+			configKey: "readpool.coprocessor.high-concurrency",
+			threshold: 0.9,
+		},
+		{
+			item:      "coprocessor_low_cpu",
+			component: "cop_low%",
+			configKey: "readpool.coprocessor.low-concurrency",
+			threshold: 0.9,
+		},
+		{
+			item:      "grpc_cpu",
+			component: "grpc%",
+			configKey: "server.grpc-concurrency",
+			threshold: 0.9,
+		},
+		{
+			item:      "raftstore_cpu",
+			component: "raftstore_%",
+			configKey: "raftstore.store-pool-size",
+			threshold: 0.8,
+		},
+		{
+			item:      "apply_cpu",
+			component: "apply_%",
+			configKey: "raftstore.apply-pool-size",
+			threshold: 0.8,
+		},
+		{
+			item:      "storage_readpool_normal_cpu",
+			component: "store_read_norm%",
+			configKey: "readpool.storage.normal-concurrency",
+			threshold: 0.9,
+		},
+		{
+			item:      "storage_readpool_high_cpu",
+			component: "store_read_high%",
+			configKey: "readpool.storage.high-concurrency",
+			threshold: 0.9,
+		},
+		{
+			item:      "storage_readpool_low_cpu",
+			component: "store_read_low%",
+			configKey: "readpool.storage.low-concurrency",
+			threshold: 0.9,
+		},
+		{
+			item:      "scheduler_worker_cpu",
+			component: "sched_%",
+			configKey: "storage.scheduler-worker-pool-size",
+			threshold: 0.85,
+		},
+		{
+			item:      "split_check_cpu",
+			component: "split_check",
+			threshold: 0.9,
+		},
+	}
+	var results []inspectionResult
+	for _, rule := range rules {
+		var sql string
+		if len(rule.configKey) > 0 {
+			sql = fmt.Sprintf("select t1.instance, t1.cpu, t2.threshold, t2.value from "+
+				"(select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like '%[1]s' and time=now() group by instance) as t1,"+
+				"(select value * %[2]f as threshold, value from inspection_schema.cluster_config where type='tikv' and `key` = '%[3]s' limit 1) as t2 "+
+				"where t1.cpu > t2.threshold;", rule.component, rule.threshold, rule.configKey)
+		} else {
+			sql = fmt.Sprintf("select t1.instance, t1.cpu, %[2]f from "+
+				"(select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like '%[1]s' and time=now() group by instance) as t1 "+
+				"where t1.cpu > %[2]f;", rule.component, rule.threshold)
+		}
+		rows, _, err := sctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQLWithContext(ctx, sql)
+		if err != nil {
+			sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("execute '%s' failed: %v", sql, err))
+			continue
+		}
+		for _, row := range rows {
+			actual := fmt.Sprintf("%.2f", row.GetFloat64(1))
+			expected := ""
+			if len(rule.configKey) > 0 {
+				expected = fmt.Sprintf("< %.2f, config: %v=%v", row.GetFloat64(2), rule.configKey, row.GetString(3))
+			} else {
+				expected = fmt.Sprintf("< %.2f", row.GetFloat64(2))
+			}
+			detail := fmt.Sprintf("select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like '%[1]s' and time=now() group by instance", rule.component)
+			result := inspectionResult{
+				tp:       "tikv",
+				instance: row.GetString(0),
+				item:     rule.item,
+				actual:   actual,
+				expected: expected,
+				severity: "warning",
+				detail:   detail,
+			}
+			results = append(results, result)
+		}
+	}
+	return results
+}
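Note (not part of the patch): each rule above expands into one SQL statement that joins the live thread-CPU metric against the TiKV configuration, so an instance is reported only when its summed CPU exceeds ratio * configured concurrency. The standalone sketch below renders that statement for the grpc_cpu rule to make the indexed %[n] format verbs easier to follow; the format string is copied verbatim from thresholdCheckInspection.inspect, while the main wrapper and local variable names are illustrative only.

package main

import "fmt"

func main() {
	// Values taken from the "grpc_cpu" rule: 0.9 of server.grpc-concurrency.
	component, threshold, configKey := "grpc%", 0.9, "server.grpc-concurrency"
	sql := fmt.Sprintf("select t1.instance, t1.cpu, t2.threshold, t2.value from "+
		"(select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like '%[1]s' and time=now() group by instance) as t1,"+
		"(select value * %[2]f as threshold, value from inspection_schema.cluster_config where type='tikv' and `key` = '%[3]s' limit 1) as t2 "+
		"where t1.cpu > t2.threshold;", component, threshold, configKey)
	// Prints the query that flags any tikv instance whose grpc thread CPU
	// exceeds 0.9 * server.grpc-concurrency.
	fmt.Println(sql)
}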
diff --git a/executor/diagnostics_test.go b/executor/diagnostics_test.go
index e0e75808df7ef..87a57687cd094 100644
--- a/executor/diagnostics_test.go
+++ b/executor/diagnostics_test.go
@@ -15,13 +15,13 @@ package executor_test

 import (
 	"context"
-
 	. "github.com/pingcap/check"
 	"github.com/pingcap/failpoint"
 	"github.com/pingcap/parser/mysql"
 	"github.com/pingcap/tidb/domain"
 	"github.com/pingcap/tidb/infoschema"
 	"github.com/pingcap/tidb/kv"
+	"github.com/pingcap/tidb/session"
 	"github.com/pingcap/tidb/sessionctx/variable"
 	"github.com/pingcap/tidb/types"
 	"github.com/pingcap/tidb/util/testkit"
@@ -166,6 +166,117 @@ func (s *diagnosticsSuite) TestInspectionResult(c *C) {
 	}
 }

+func (s *diagnosticsSuite) parseTime(c *C, se session.Session, str string) types.Time {
+	t, err := types.ParseTime(se.GetSessionVars().StmtCtx, str, mysql.TypeDatetime, types.MaxFsp)
+	c.Assert(err, IsNil)
+	return t
+}
+
+func (s *diagnosticsSuite) TestThresholdCheckInspection(c *C) {
+	tk := testkit.NewTestKitWithInit(c, s.store)
+	// mock tikv configuration.
+	configurations := map[string]variable.TableSnapshot{}
+	configurations[infoschema.TableClusterConfig] = variable.TableSnapshot{
+		Rows: [][]types.Datum{
+			types.MakeDatums("tikv", "tikv-0", "raftstore.apply-pool-size", "2"),
+			types.MakeDatums("tikv", "tikv-0", "raftstore.store-pool-size", "2"),
+			types.MakeDatums("tikv", "tikv-0", "readpool.coprocessor.high-concurrency", "4"),
+			types.MakeDatums("tikv", "tikv-0", "readpool.coprocessor.low-concurrency", "4"),
+			types.MakeDatums("tikv", "tikv-0", "readpool.coprocessor.normal-concurrency", "4"),
+			types.MakeDatums("tikv", "tikv-0", "readpool.storage.high-concurrency", "4"),
+			types.MakeDatums("tikv", "tikv-0", "readpool.storage.low-concurrency", "4"),
+			types.MakeDatums("tikv", "tikv-0", "readpool.storage.normal-concurrency", "4"),
+			types.MakeDatums("tikv", "tikv-0", "server.grpc-concurrency", "8"),
+			types.MakeDatums("tikv", "tikv-0", "storage.scheduler-worker-pool-size", "6"),
+		},
+	}
+	datetime := func(str string) types.Time {
+		return s.parseTime(c, tk.Se, str)
+	}
+	// construct some mock abnormal data
+	mockData := map[string][][]types.Datum{
+		// columns: time, instance, name, value
+		"tikv_thread_cpu": {
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_normal0", 10.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_normal1", 10.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_high1", 10.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_low1", 10.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "grpc_1", 10.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "raftstore_1", 10.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "apply_0", 10.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_norm1", 10.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_high2", 10.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_low0", 10.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "sched_2", 10.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "split_check", 10.0),
+		},
+	}
+
+	fpName := "github.com/pingcap/tidb/executor/mockMergeMockInspectionTables"
+	c.Assert(failpoint.Enable(fpName, "return"), IsNil)
+	defer func() { c.Assert(failpoint.Disable(fpName), IsNil) }()
+
+	// Mock for metric table data.
+	fpName2 := "github.com/pingcap/tidb/executor/mockMetricsTableData"
+	c.Assert(failpoint.Enable(fpName2, "return"), IsNil)
+	defer func() { c.Assert(failpoint.Disable(fpName2), IsNil) }()
+
+	ctx := context.WithValue(context.Background(), "__mockInspectionTables", configurations)
+	ctx = context.WithValue(ctx, "__mockMetricsTableData", mockData)
+	ctx = failpoint.WithHook(ctx, func(_ context.Context, fpname2 string) bool {
+		return fpName2 == fpname2 || fpname2 == fpName
+	})
+
+	rs, err := tk.Se.Execute(ctx, "select item, type, instance, value, reference, details from information_schema.inspection_result where rule='threshold-check' order by item")
+	c.Assert(err, IsNil)
+	result := tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("execute inspect SQL failed"))
+	c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Equals, uint16(0), Commentf("unexpected warnings: %+v", tk.Se.GetSessionVars().StmtCtx.GetWarnings()))
+	result.Check(testkit.Rows(
+		"apply_cpu tikv tikv-0 10.00 < 1.60, config: raftstore.apply-pool-size=2 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'apply_%' and time=now() group by instance",
+		"coprocessor_high_cpu tikv tikv-0 10.00 < 3.60, config: readpool.coprocessor.high-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'cop_high%' and time=now() group by instance",
+		"coprocessor_low_cpu tikv tikv-0 10.00 < 3.60, config: readpool.coprocessor.low-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'cop_low%' and time=now() group by instance",
+		"coprocessor_normal_cpu tikv tikv-0 20.00 < 3.60, config: readpool.coprocessor.normal-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'cop_normal%' and time=now() group by instance",
+		"grpc_cpu tikv tikv-0 10.00 < 7.20, config: server.grpc-concurrency=8 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'grpc%' and time=now() group by instance",
+		"raftstore_cpu tikv tikv-0 10.00 < 1.60, config: raftstore.store-pool-size=2 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'raftstore_%' and time=now() group by instance",
+		"scheduler_worker_cpu tikv tikv-0 10.00 < 5.10, config: storage.scheduler-worker-pool-size=6 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'sched_%' and time=now() group by instance",
+		"split_check_cpu tikv tikv-0 10.00 < 0.00 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'split_check' and time=now() group by instance",
+		"storage_readpool_high_cpu tikv tikv-0 10.00 < 3.60, config: readpool.storage.high-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'store_read_high%' and time=now() group by instance",
+		"storage_readpool_low_cpu tikv tikv-0 10.00 < 3.60, config: readpool.storage.low-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'store_read_low%' and time=now() group by instance",
+		"storage_readpool_normal_cpu tikv tikv-0 10.00 < 3.60, config: readpool.storage.normal-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'store_read_norm%' and time=now() group by instance",
+	))
+
+	// construct some mock normal data
+	mockData = map[string][][]types.Datum{
+		// columns: time, instance, name, value
+		"tikv_thread_cpu": {
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_normal0", 1.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_high1", 0.1),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_low1", 1.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "grpc_1", 7.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "grpc_2", 0.21),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "raftstore_1", 1.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "apply_0", 1.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_norm1", 1.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_high2", 1.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_low0", 1.0),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "sched_2", 0.3),
+			types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "split_check", 0.5),
+		},
+	}
+
+	ctx = context.WithValue(context.Background(), "__mockInspectionTables", configurations)
+	ctx = context.WithValue(ctx, "__mockMetricsTableData", mockData)
+	ctx = failpoint.WithHook(ctx, func(_ context.Context, fpname2 string) bool {
+		return fpName2 == fpname2 || fpname2 == fpName
+	})
+	rs, err = tk.Se.Execute(ctx, "select item, type, instance, value, reference, details from information_schema.inspection_result where rule='threshold-check' order by item")
+	c.Assert(err, IsNil)
+	result = tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("execute inspect SQL failed"))
+	c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Equals, uint16(0), Commentf("unexpected warnings: %+v", tk.Se.GetSessionVars().StmtCtx.GetWarnings()))
+	result.Check(testkit.Rows(
+		"grpc_cpu tikv tikv-0 7.21 < 7.20, config: server.grpc-concurrency=8 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'grpc%' and time=now() group by instance"))
+}
+
 func (s *diagnosticsSuite) TestCriticalErrorInspection(c *C) {
 	tk := testkit.NewTestKitWithInit(c, s.store)

@@ -173,10 +284,8 @@ func (s *diagnosticsSuite) TestCriticalErrorInspection(c *C) {
 	c.Assert(failpoint.Enable(fpName, "return"), IsNil)
 	defer func() { c.Assert(failpoint.Disable(fpName), IsNil) }()

-	datetime := func(s string) types.Time {
-		t, err := types.ParseTime(tk.Se.GetSessionVars().StmtCtx, s, mysql.TypeDatetime, types.MaxFsp)
-		c.Assert(err, IsNil)
-		return t
+	datetime := func(str string) types.Time {
+		return s.parseTime(c, tk.Se, str)
 	}

 	// construct some mock data
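Note (not part of the patch): the reference values asserted in the test above are simply the mocked config values multiplied by each rule's ratio, e.g. server.grpc-concurrency=8 x 0.9 = 7.20, raftstore.apply-pool-size=2 x 0.8 = 1.60, storage.scheduler-worker-pool-size=6 x 0.85 = 5.10. In the second (normal) data set only grpc_cpu is still reported because grpc_1 + grpc_2 = 7.0 + 0.21 = 7.21 exceeds 7.20. The standalone sketch below reproduces those thresholds; the map literals just restate the mocked config and the rule ratios from the patch.

package main

import "fmt"

func main() {
	// Mocked TiKV config values from the test fixture above.
	config := map[string]float64{
		"server.grpc-concurrency":            8,
		"raftstore.apply-pool-size":          2,
		"raftstore.store-pool-size":          2,
		"storage.scheduler-worker-pool-size": 6,
	}
	// Ratios taken from the corresponding rules in thresholdCheckInspection.inspect.
	ratio := map[string]float64{
		"server.grpc-concurrency":            0.9,
		"raftstore.apply-pool-size":          0.8,
		"raftstore.store-pool-size":          0.8,
		"storage.scheduler-worker-pool-size": 0.85,
	}
	for key, val := range config {
		// e.g. server.grpc-concurrency: < 7.20
		fmt.Printf("%s: < %.2f\n", key, val*ratio[key])
	}
}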
diff --git a/infoschema/metric_table_def.go b/infoschema/metric_table_def.go
index 0cc7786de048a..f8f0f70396929 100644
--- a/infoschema/metric_table_def.go
+++ b/infoschema/metric_table_def.go
@@ -354,17 +354,20 @@ var MetricTableMap = map[string]MetricTableDef{
 		Quantile: 0.999,
 	},
 	"pd_tso_wait_duration": {
-		PromQL:   "histogram_quantile($QUANTILE, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"wait\"}[$RANGE_DURATION])) by (le))",
+		PromQL:   "histogram_quantile($QUANTILE, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"wait\"}[$RANGE_DURATION])) by (le,instance))",
+		Labels:   []string{"instance"},
 		Quantile: 0.999,
 		Comment:  "The quantile duration of a client starting to wait for the TS until received the TS result.",
 	},
 	"pd_tso_rpc_duration": {
 		Comment:  "The quantile duration of a client sending TSO request until received the response.",
-		PromQL:   "histogram_quantile($QUANTILE, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[$RANGE_DURATION])) by (le))",
+		PromQL:   "histogram_quantile($QUANTILE, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[$RANGE_DURATION])) by (le,instance))",
+		Labels:   []string{"instance"},
 		Quantile: 0.999,
 	},
 	"pd_start_tso_wait_duration": {
-		PromQL:   "histogram_quantile($QUANTILE, sum(rate(tidb_pdclient_ts_future_wait_seconds_bucket[$RANGE_DURATION])) by (le))",
+		PromQL:   "histogram_quantile($QUANTILE, sum(rate(tidb_pdclient_ts_future_wait_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
+		Labels:   []string{"instance"},
 		Quantile: 0.999,
 		Comment:  "The quantile duration of the waiting time for getting the start timestamp oracle",
 	},