*: improve row count estimation for columns with NULL

pingcap · Mar 27, 2019 · 829c5c5 · 829c5c5
1 parent 83ff58c
commit 829c5c5
Show file tree

Hide file tree

Showing 9 changed files with 285 additions and 77 deletions.
diff --git a/executor/analyze.go b/executor/analyze.go
@@ -152,8 +152,9 @@ func analyzeIndexPushdown(idxExec *AnalyzeIndexExec) statistics.AnalyzeResult {
 		Cms:             []*statistics.CMSketch{cms},
 		IsIndex:         1,
 	}
+	result.Count = hist.NullCount
 	if hist.Len() > 0 {
-		result.Count = hist.Buckets[hist.Len()-1].Count
+		result.Count += hist.Buckets[hist.Len()-1].Count
 	}
 	return result
 }
@@ -167,12 +168,16 @@ type AnalyzeIndexExec struct {
 	priority        int
 	analyzePB       *tipb.AnalyzeReq
 	result          distsql.SelectResult
+	countNullRes    distsql.SelectResult
 	maxNumBuckets   uint64
 }
 
-func (e *AnalyzeIndexExec) open() error {
+// fetchAnalyzeResult builds and dispatches the `kv.Request` from given ranges, and stores the `SelectResult`
+// in corresponding fields based on the input `isNullRange` argument, which indicates if the range is the
+// special null range for single-column index to get the null count.
+func (e *AnalyzeIndexExec) fetchAnalyzeResult(ranges []*ranger.Range, isNullRange bool) error {
 	var builder distsql.RequestBuilder
-	kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranger.FullRange()).
+	kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranges).
 		SetAnalyzeRequest(e.analyzePB).
 		SetKeepOrder(true).
 		SetConcurrency(e.concurrency).
@@ -181,29 +186,51 @@ func (e *AnalyzeIndexExec) open() error {
 		return errors.Trace(err)
 	}
 	ctx := context.TODO()
-	e.result, err = distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
+	result, err := distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
 	if err != nil {
 		return errors.Trace(err)
 	}
-	e.result.Fetch(ctx)
+	result.Fetch(ctx)
+	if isNullRange {
+		e.countNullRes = result
+	} else {
+		e.result = result
+	}
 	return nil
 }
 
-func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
-	if err = e.open(); err != nil {
-		return nil, nil, errors.Trace(err)
+func (e *AnalyzeIndexExec) open() error {
+	ranges := ranger.FullRange()
+	// For single-column index, we do not load null rows from TiKV, so the built histogram would not include
+	// null values, and its `NullCount` would be set by result of another distsql call to get null rows.
+	// For multi-column index, we cannot define null for the rows, so we still use full range, and the rows
+	// containing null fields would exist in built histograms. Note that, the `NullCount` of histograms for
+	// multi-column index is always 0 then.
+	if len(e.idxInfo.Columns) == 1 {
+		ranges = ranger.FullNotNullRange()
 	}
-	defer func() {
-		if err1 := e.result.Close(); err1 != nil {
-			hist = nil
-			cms = nil
-			err = errors.Trace(err1)
+	err := e.fetchAnalyzeResult(ranges, false)
+	if err != nil {
+		return err
+	}
+	if len(e.idxInfo.Columns) == 1 {
+		ranges = ranger.NullRange()
+		err = e.fetchAnalyzeResult(ranges, true)
+		if err != nil {
+			return err
 		}
-	}()
-	hist = &statistics.Histogram{}
-	cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
+	}
+	return nil
+}
+
+func (e *AnalyzeIndexExec) buildStatsFromResult(result distsql.SelectResult, needCMS bool) (*statistics.Histogram, *statistics.CMSketch, error) {
+	hist := &statistics.Histogram{}
+	var cms *statistics.CMSketch
+	if needCMS {
+		cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
+	}
 	for {
-		data, err := e.result.NextRaw(context.TODO())
+		data, err := result.NextRaw(context.TODO())
 		if err != nil {
 			return nil, nil, errors.Trace(err)
 		}
@@ -217,15 +244,42 @@ func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statis
 		}
 		hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), int(e.maxNumBuckets))
 		if err != nil {
-			return nil, nil, errors.Trace(err)
+			return nil, nil, err
 		}
-		if resp.Cms != nil {
-			err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
-			if err != nil {
-				return nil, nil, errors.Trace(err)
+		if needCMS {
+			if resp.Cms == nil {
+				logutil.Logger(context.TODO()).Warn("nil CMS in response", zap.String("table", e.idxInfo.Table.O), zap.String("index", e.idxInfo.Name.O))
+			} else {
+				err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
+				if err != nil {
+					return nil, nil, errors.Trace(err)
+				}
 			}
 		}
 	}
+	return hist, cms, nil
+}
+
+// buildStats opens the analyze requests for the index, merges the partial
+// responses into one histogram and CM sketch, and closes the underlying
+// results before returning.
+//
+// For a single-column index the NULL rows are fetched by a separate request
+// (e.countNullRes); they are not part of the histogram buckets and are only
+// reported through hist.NullCount.
+//
+// On any failure the returned histogram and sketch are nil. The first error
+// wins: an error from closing the results is reported only if nothing failed
+// earlier.
+func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
+	// Register the cleanup before open(): open() may fail after the first
+	// result has already been fetched, and that result must still be closed.
+	// closeAll is already called with a nil countNullRes for multi-column
+	// indexes, so it is relied on to skip nil results.
+	defer func() {
+		closeErr := closeAll(e.result, e.countNullRes)
+		if closeErr != nil && err == nil {
+			// Do not hand out statistics when the results could not be closed cleanly.
+			hist, cms, err = nil, nil, closeErr
+		}
+	}()
+	if err = e.open(); err != nil {
+		return nil, nil, err
+	}
+	hist, cms, err = e.buildStatsFromResult(e.result, true)
+	if err != nil {
+		return nil, nil, err
+	}
+	if e.countNullRes != nil {
+		// The null range holds only NULL keys, so the cumulative count of the
+		// last bucket is the number of NULL rows. No CM sketch is needed here.
+		nullHist, _, nullErr := e.buildStatsFromResult(e.countNullRes, false)
+		if nullErr != nil {
+			return nil, nil, nullErr
+		}
+		if l := nullHist.Len(); l > 0 {
+			hist.NullCount = nullHist.Buckets[l-1].Count
+		}
+	}
 	hist.ID = e.idxInfo.ID
 	return hist, cms, nil
 }

diff --git a/executor/show_stats_test.go b/executor/show_stats_test.go
@@ -82,7 +82,58 @@ func (s *testSuite1) TestShowStatsHasNullValue(c *C) {
 	tk.MustExec("create table t (a int, index idx(a))")
 	tk.MustExec("insert into t values(NULL)")
 	tk.MustExec("analyze table t")
-	tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t  idx 1 0 1 1 NULL NULL"))
+	// Null values are excluded from histogram for single-column index.
+	tk.MustQuery("show stats_buckets").Check(testkit.Rows())
+	tk.MustExec("insert into t values(1)")
+	tk.MustExec("analyze table t")
+	tk.MustQuery("show stats_buckets").Sort().Check(testkit.Rows(
+		"test t  a 0 0 1 1 1 1",
+		"test t  idx 1 0 1 1 1 1",
+	))
+	tk.MustExec("drop table t")
+	tk.MustExec("create table t (a int, b int, index idx(a, b))")
+	tk.MustExec("insert into t values(NULL, NULL)")
+	tk.MustExec("analyze table t")
+	tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t  idx 1 0 1 1 (NULL, NULL) (NULL, NULL)"))
+
+	tk.MustExec("drop table t")
+	tk.MustExec("create table t(a int, b int, c int, index idx_b(b), index idx_c_a(c, a))")
+	tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
+	res := tk.MustQuery("show stats_histograms where table_name = 't'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	tk.MustExec("analyze table t index idx_b")
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_b'")
+	c.Assert(len(res.Rows()), Equals, 1)
+	c.Assert(res.Rows()[0][7], Equals, "4")
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'b'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	tk.MustExec("analyze table t index idx_c_a")
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_c_a'")
+	c.Assert(len(res.Rows()), Equals, 1)
+	c.Assert(res.Rows()[0][7], Equals, "0")
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'c'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'a'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	tk.MustExec("truncate table t")
+	tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
+	res = tk.MustQuery("show stats_histograms where table_name = 't'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	tk.MustExec("analyze table t index")
+	res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
+	c.Assert(len(res.Rows()), Equals, 2)
+	c.Assert(res.Rows()[0][7], Equals, "4")
+	c.Assert(res.Rows()[1][7], Equals, "0")
+	tk.MustExec("truncate table t")
+	tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
+	tk.MustExec("analyze table t")
+	res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
+	c.Assert(len(res.Rows()), Equals, 5)
+	c.Assert(res.Rows()[0][7], Equals, "1")
+	c.Assert(res.Rows()[1][7], Equals, "4")
+	c.Assert(res.Rows()[2][7], Equals, "1")
+	c.Assert(res.Rows()[3][7], Equals, "4")
+	c.Assert(res.Rows()[4][7], Equals, "0")
 }
 
 func (s *testSuite1) TestShowPartitionStats(c *C) {

diff --git a/planner/core/planbuilder.go b/planner/core/planbuilder.go
@@ -688,24 +688,18 @@ func (b *PlanBuilder) buildCheckIndexSchema(tn *ast.TableName, indexName string)
 // getColsInfo returns the info of index columns, normal columns and primary key.
 func getColsInfo(tn *ast.TableName) (indicesInfo []*model.IndexInfo, colsInfo []*model.ColumnInfo, pkCol *model.ColumnInfo) {
 	tbl := tn.TableInfo
-	if tbl.PKIsHandle {
-		for _, col := range tbl.Columns {
-			if mysql.HasPriKeyFlag(col.Flag) {
-				pkCol = col
-			}
+	for _, col := range tbl.Columns {
+		if tbl.PKIsHandle && mysql.HasPriKeyFlag(col.Flag) {
+			pkCol = col
+		} else {
+			colsInfo = append(colsInfo, col)
 		}
 	}
 	for _, idx := range tn.TableInfo.Indices {
 		if idx.State == model.StatePublic {
 			indicesInfo = append(indicesInfo, idx)
 		}
 	}
-	for _, col := range tbl.Columns {
-		if col == pkCol {
-			continue
-		}
-		colsInfo = append(colsInfo, col)
-	}
 	return
 }
 

diff --git a/statistics/feedback.go b/statistics/feedback.go
@@ -1077,6 +1077,7 @@ func dumpFeedbackForIndex(h *Handle, q *QueryFeedback, t *Table) error {
 }
 
 func (q *QueryFeedback) dumpRangeFeedback(sc *stmtctx.StatementContext, h *Handle, ran *ranger.Range, rangeCount float64) error {
+	lowIsNull := ran.LowVal[0].IsNull()
 	if q.tp == indexType {
 		lower, err := codec.EncodeKey(sc, nil, ran.LowVal[0])
 		if err != nil {
@@ -1102,8 +1103,17 @@ func (q *QueryFeedback) dumpRangeFeedback(sc *stmtctx.StatementContext, h *Handl
 	ranges := q.hist.SplitRange(sc, []*ranger.Range{ran}, q.tp == indexType)
 	counts := make([]float64, 0, len(ranges))
 	sum := 0.0
-	for _, r := range ranges {
+	for i, r := range ranges {
+		// Though after `SplitRange`, we may have ranges like `[l, r]`, we still use
+		// `betweenRowCount` to compute the estimation since the ranges of feedback are all in `[l, r)`
+		// form, that is to say, we ignore the exclusiveness of ranges from `SplitRange` and just use
+		// its result of boundary values.
 		count := q.hist.betweenRowCount(r.LowVal[0], r.HighVal[0])
+		// We have to include `NullCount` of histogram for [l, r) cases where l is null because `betweenRowCount`
+		// does not include null values of lower bound.
+		if i == 0 && lowIsNull {
+			count += float64(q.hist.NullCount)
+		}
 		sum += count
 		counts = append(counts, count)
 	}