From f483b39c34b91f2f0260748f40d952e0d9a366f2 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Wed, 4 Jan 2023 10:50:20 +0800 Subject: [PATCH] metrics: add metrics for plan replayer and historical stats (#40271) --- domain/historical_stats.go | 8 ++ domain/plan_replayer.go | 12 +- domain/plan_replayer_dump.go | 9 ++ metrics/grafana/tidb.json | 229 +++++++++++++++++++++++++++++++++++ metrics/metrics.go | 4 + metrics/stats.go | 21 ++++ statistics/handle/dump.go | 16 ++- 7 files changed, 297 insertions(+), 2 deletions(-) diff --git a/domain/historical_stats.go b/domain/historical_stats.go index 04d50608c58c4..ca68319c31ba8 100644 --- a/domain/historical_stats.go +++ b/domain/historical_stats.go @@ -16,10 +16,16 @@ package domain import ( "github.com/pingcap/errors" + "github.com/pingcap/tidb/metrics" "github.com/pingcap/tidb/sessionctx" "github.com/pingcap/tidb/statistics/handle" ) +var ( + generateHistoricalStatsSuccessCounter = metrics.HistoricalStatsCounter.WithLabelValues("generate", "success") + generateHistoricalStatsFailedCounter = metrics.HistoricalStatsCounter.WithLabelValues("generate", "fail") +) + // HistoricalStatsWorker indicates for dump historical stats type HistoricalStatsWorker struct { tblCH chan int64 @@ -52,8 +58,10 @@ func (w *HistoricalStatsWorker) DumpHistoricalStats(tableID int64, statsHandle * return errors.Errorf("cannot get DBInfo by TableID %d", tableID) } if _, err := statsHandle.RecordHistoricalStatsToStorage(dbInfo.Name.O, tblInfo); err != nil { + generateHistoricalStatsFailedCounter.Inc() return errors.Errorf("record table %s.%s's historical stats failed", dbInfo.Name.O, tblInfo.Name.O) } + generateHistoricalStatsSuccessCounter.Inc() return nil } diff --git a/domain/plan_replayer.go b/domain/plan_replayer.go index 54c109cc34dc3..2bbb15772d56c 100644 --- a/domain/plan_replayer.go +++ b/domain/plan_replayer.go @@ -29,6 +29,7 @@ import ( "github.com/pingcap/tidb/bindinfo" "github.com/pingcap/tidb/domain/infosync" "github.com/pingcap/tidb/kv" + "github.com/pingcap/tidb/metrics" "github.com/pingcap/tidb/parser" "github.com/pingcap/tidb/parser/ast" "github.com/pingcap/tidb/parser/terror" @@ -167,6 +168,13 @@ func insertPlanReplayerSuccessStatusRecord(ctx context.Context, sctx sessionctx. } } +var ( + planReplayerCaptureTaskSendCounter = metrics.PlanReplayerTaskCounter.WithLabelValues("capture", "send") + planReplayerCaptureTaskDiscardCounter = metrics.PlanReplayerTaskCounter.WithLabelValues("capture", "discard") + + planReplayerRegisterTaskGauge = metrics.PlanReplayerRegisterTaskGauge +) + type planReplayerHandle struct { *planReplayerTaskCollectorHandle *planReplayerTaskDumpHandle @@ -181,9 +189,10 @@ func (h *planReplayerHandle) SendTask(task *PlanReplayerDumpTask) bool { if !task.IsContinuesCapture { h.planReplayerTaskCollectorHandle.removeTask(task.PlanReplayerTaskKey) } + planReplayerCaptureTaskSendCounter.Inc() return true default: - // TODO: add metrics here + planReplayerCaptureTaskDiscardCounter.Inc() // directly discard the task if the task channel is full in order not to block the query process logutil.BgLogger().Warn("discard one plan replayer dump task", zap.String("sql-digest", task.SQLDigest), zap.String("plan-digest", task.PlanDigest)) @@ -221,6 +230,7 @@ func (h *planReplayerTaskCollectorHandle) CollectPlanReplayerTask() error { } } h.setupTasks(tasks) + planReplayerRegisterTaskGauge.Set(float64(len(tasks))) return nil } diff --git a/domain/plan_replayer_dump.go b/domain/plan_replayer_dump.go index cad0898c81ef2..bd121b26dd388 100644 --- a/domain/plan_replayer_dump.go +++ b/domain/plan_replayer_dump.go @@ -28,6 +28,7 @@ import ( "github.com/pingcap/tidb/bindinfo" "github.com/pingcap/tidb/config" "github.com/pingcap/tidb/infoschema" + "github.com/pingcap/tidb/metrics" "github.com/pingcap/tidb/parser/ast" "github.com/pingcap/tidb/parser/model" "github.com/pingcap/tidb/sessionctx" @@ -145,6 +146,11 @@ func (tne *tableNameExtractor) handleIsView(t *ast.TableName) (bool, error) { return true, nil } +var ( + planReplayerDumpTaskSuccess = metrics.PlanReplayerTaskCounter.WithLabelValues("dump", "success") + planReplayerDumpTaskFailed = metrics.PlanReplayerTaskCounter.WithLabelValues("dump", "fail") +) + // DumpPlanReplayerInfo will dump the information about sqls. // The files will be organized into the following format: /* @@ -212,6 +218,9 @@ func DumpPlanReplayerInfo(ctx context.Context, sctx sessionctx.Context, zap.Strings("sqls", sqls)) } errMsg = err.Error() + planReplayerDumpTaskFailed.Inc() + } else { + planReplayerDumpTaskSuccess.Inc() } err1 := zw.Close() if err1 != nil { diff --git a/metrics/grafana/tidb.json b/metrics/grafana/tidb.json index 0e2ce93934f7c..9637940d4dbd2 100644 --- a/metrics/grafana/tidb.json +++ b/metrics/grafana/tidb.json @@ -14337,6 +14337,235 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 184 + }, + "hiddenSeries": false, + "id": 236, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tidb_plan_replayer_task{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"dump\"}[1m])) by (result)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "dump-task-{{result}}", + "refId": "A", + "step": 30 + }, + { + "exemplar": true, + "expr": "sum(rate(tidb_plan_replayer_task{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"capture\"}[1m])) by (result)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "capture-task-{{result}}", + "refId": "B", + "step": 30 + }, + { + "exemplar": true, + "expr": "avg(tidb_plan_replayer_register_task{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"})", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "register-task", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Plan Replayer Task OPM", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 184 + }, + "hiddenSeries": false, + "id": 237, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tidb_statistics_historical_stats{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"generate\"}[1m])) by (result)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "generate-{{result}}", + "refId": "A", + "step": 30 + }, + { + "exemplar": true, + "expr": "sum(rate(tidb_statistics_historical_stats{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"dump\"}[1m])) by (result)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "dump-{{result}}", + "refId": "B", + "step": 30 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Historical Stats OPM", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "repeat": null, diff --git a/metrics/metrics.go b/metrics/metrics.go index 8f303ba58180e..889f4c5996481 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -217,6 +217,10 @@ func RegisterMetrics() { prometheus.MustRegister(EMACPUUsageGauge) + prometheus.MustRegister(HistoricalStatsCounter) + prometheus.MustRegister(PlanReplayerTaskCounter) + prometheus.MustRegister(PlanReplayerRegisterTaskGauge) + tikvmetrics.InitMetrics(TiDB, TiKVClient) tikvmetrics.RegisterMetrics() tikvmetrics.TiKVPanicCounter = PanicCounter // reset tidb metrics for tikv metrics diff --git a/metrics/stats.go b/metrics/stats.go index 76bd1ec7a936b..5d73753f5669c 100644 --- a/metrics/stats.go +++ b/metrics/stats.go @@ -150,4 +150,25 @@ var ( Name: "stats_healthy", Help: "Gauge of stats healthy", }, []string{LblType}) + + HistoricalStatsCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "tidb", + Subsystem: "statistics", + Name: "historical_stats", + Help: "counter of the historical stats operation", + }, []string{LblType, LblResult}) + + PlanReplayerTaskCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "tidb", + Subsystem: "plan_replayer", + Name: "task", + Help: "counter of plan replayer captured task", + }, []string{LblType, LblResult}) + + PlanReplayerRegisterTaskGauge = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "tidb", + Subsystem: "plan_replayer", + Name: "register_task", + Help: "gauge of plan replayer registered task", + }) ) diff --git a/statistics/handle/dump.go b/statistics/handle/dump.go index daaf28ead7573..75f4ee9ea958a 100644 --- a/statistics/handle/dump.go +++ b/statistics/handle/dump.go @@ -24,6 +24,7 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/tidb/infoschema" + "github.com/pingcap/tidb/metrics" "github.com/pingcap/tidb/parser/model" "github.com/pingcap/tidb/parser/mysql" "github.com/pingcap/tidb/sessionctx" @@ -131,8 +132,21 @@ func (h *Handle) DumpStatsToJSON(dbName string, tableInfo *model.TableInfo, return h.DumpStatsToJSONBySnapshot(dbName, tableInfo, snapshot, dumpPartitionStats) } +var ( + dumpHistoricalStatsSuccessCounter = metrics.HistoricalStatsCounter.WithLabelValues("dump", "success") + dumpHistoricalStatsFailedCounter = metrics.HistoricalStatsCounter.WithLabelValues("dump", "fail") +) + // DumpHistoricalStatsBySnapshot dumped json tables from mysql.stats_meta_history and mysql.stats_history -func (h *Handle) DumpHistoricalStatsBySnapshot(dbName string, tableInfo *model.TableInfo, snapshot uint64) (*JSONTable, error) { +func (h *Handle) DumpHistoricalStatsBySnapshot(dbName string, tableInfo *model.TableInfo, snapshot uint64) (jt *JSONTable, err error) { + defer func() { + if err == nil { + dumpHistoricalStatsSuccessCounter.Inc() + } else { + dumpHistoricalStatsFailedCounter.Inc() + } + }() + pi := tableInfo.GetPartitionInfo() if pi == nil { return h.tableHistoricalStatsToJSON(tableInfo.ID, snapshot)