From 829c5c5bd9521bf4efdbf40e4b4708c178cc3b3b Mon Sep 17 00:00:00 2001 From: Kenan Yao Date: Mon, 25 Feb 2019 23:22:19 +0800 Subject: [PATCH] *: improve row count estimation for columns with NULL --- executor/analyze.go | 98 ++++++++++++++++++++++++++-------- executor/show_stats_test.go | 53 +++++++++++++++++- planner/core/planbuilder.go | 16 ++---- statistics/feedback.go | 12 ++++- statistics/histogram.go | 80 ++++++++++++++++----------- statistics/selectivity_test.go | 59 ++++++++++++++++++++ statistics/statistics_test.go | 6 --- statistics/table.go | 26 +++++++-- util/ranger/points.go | 12 ++++- 9 files changed, 285 insertions(+), 77 deletions(-) diff --git a/executor/analyze.go b/executor/analyze.go index 1c934b2e368d7..7330bb490f416 100644 --- a/executor/analyze.go +++ b/executor/analyze.go @@ -152,8 +152,9 @@ func analyzeIndexPushdown(idxExec *AnalyzeIndexExec) statistics.AnalyzeResult { Cms: []*statistics.CMSketch{cms}, IsIndex: 1, } + result.Count = hist.NullCount if hist.Len() > 0 { - result.Count = hist.Buckets[hist.Len()-1].Count + result.Count += hist.Buckets[hist.Len()-1].Count } return result } @@ -167,12 +168,16 @@ type AnalyzeIndexExec struct { priority int analyzePB *tipb.AnalyzeReq result distsql.SelectResult + countNullRes distsql.SelectResult maxNumBuckets uint64 } -func (e *AnalyzeIndexExec) open() error { +// fetchAnalyzeResult builds and dispatches the `kv.Request` from given ranges, and stores the `SelectResult` +// in corresponding fields based on the input `isNullRange` argument, which indicates if the range is the +// special null range for single-column index to get the null count. +func (e *AnalyzeIndexExec) fetchAnalyzeResult(ranges []*ranger.Range, isNullRange bool) error { var builder distsql.RequestBuilder - kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranger.FullRange()). 
+ kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranges). SetAnalyzeRequest(e.analyzePB). SetKeepOrder(true). SetConcurrency(e.concurrency). @@ -181,29 +186,51 @@ func (e *AnalyzeIndexExec) open() error { return errors.Trace(err) } ctx := context.TODO() - e.result, err = distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL) + result, err := distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL) if err != nil { return errors.Trace(err) } - e.result.Fetch(ctx) + result.Fetch(ctx) + if isNullRange { + e.countNullRes = result + } else { + e.result = result + } return nil } -func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) { - if err = e.open(); err != nil { - return nil, nil, errors.Trace(err) +func (e *AnalyzeIndexExec) open() error { + ranges := ranger.FullRange() + // For single-column index, we do not load null rows from TiKV, so the built histogram would not include + // null values, and its `NullCount` would be set by result of another distsql call to get null rows. + // For multi-column index, we cannot define null for the rows, so we still use full range, and the rows + // containing null fields would exist in built histograms. Note that, the `NullCount` of histograms for + // multi-column index is always 0 then. 
+ if len(e.idxInfo.Columns) == 1 { + ranges = ranger.FullNotNullRange() } - defer func() { - if err1 := e.result.Close(); err1 != nil { - hist = nil - cms = nil - err = errors.Trace(err1) + err := e.fetchAnalyzeResult(ranges, false) + if err != nil { + return err + } + if len(e.idxInfo.Columns) == 1 { + ranges = ranger.NullRange() + err = e.fetchAnalyzeResult(ranges, true) + if err != nil { + return err } - }() - hist = &statistics.Histogram{} - cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth) + } + return nil +} + +func (e *AnalyzeIndexExec) buildStatsFromResult(result distsql.SelectResult, needCMS bool) (*statistics.Histogram, *statistics.CMSketch, error) { + hist := &statistics.Histogram{} + var cms *statistics.CMSketch + if needCMS { + cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth) + } for { - data, err := e.result.NextRaw(context.TODO()) + data, err := result.NextRaw(context.TODO()) if err != nil { return nil, nil, errors.Trace(err) } @@ -217,15 +244,42 @@ func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statis } hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), int(e.maxNumBuckets)) if err != nil { - return nil, nil, errors.Trace(err) + return nil, nil, err } - if resp.Cms != nil { - err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms)) - if err != nil { - return nil, nil, errors.Trace(err) + if needCMS { + if resp.Cms == nil { + logutil.Logger(context.TODO()).Warn("nil CMS in response", zap.String("table", e.idxInfo.Table.O), zap.String("index", e.idxInfo.Name.O)) + } else { + err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms)) + if err != nil { + return nil, nil, errors.Trace(err) + } } } } + return hist, cms, nil +} + +func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) { + if err = e.open(); err != nil { + return nil, nil, err + } + 
defer func() { + err = closeAll(e.result, e.countNullRes) + }() + hist, cms, err = e.buildStatsFromResult(e.result, true) + if err != nil { + return nil, nil, err + } + if e.countNullRes != nil { + nullHist, _, err := e.buildStatsFromResult(e.countNullRes, false) + if err != nil { + return nil, nil, err + } + if l := nullHist.Len(); l > 0 { + hist.NullCount = nullHist.Buckets[l-1].Count + } + } hist.ID = e.idxInfo.ID return hist, cms, nil } diff --git a/executor/show_stats_test.go b/executor/show_stats_test.go index 2786ac65498ef..af01376d9f798 100644 --- a/executor/show_stats_test.go +++ b/executor/show_stats_test.go @@ -82,7 +82,58 @@ func (s *testSuite1) TestShowStatsHasNullValue(c *C) { tk.MustExec("create table t (a int, index idx(a))") tk.MustExec("insert into t values(NULL)") tk.MustExec("analyze table t") - tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 NULL NULL")) + // Null values are excluded from histogram for single-column index. + tk.MustQuery("show stats_buckets").Check(testkit.Rows()) + tk.MustExec("insert into t values(1)") + tk.MustExec("analyze table t") + tk.MustQuery("show stats_buckets").Sort().Check(testkit.Rows( + "test t a 0 0 1 1 1 1", + "test t idx 1 0 1 1 1 1", + )) + tk.MustExec("drop table t") + tk.MustExec("create table t (a int, b int, index idx(a, b))") + tk.MustExec("insert into t values(NULL, NULL)") + tk.MustExec("analyze table t") + tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 (NULL, NULL) (NULL, NULL)")) + + tk.MustExec("drop table t") + tk.MustExec("create table t(a int, b int, c int, index idx_b(b), index idx_c_a(c, a))") + tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)") + res := tk.MustQuery("show stats_histograms where table_name = 't'") + c.Assert(len(res.Rows()), Equals, 0) + tk.MustExec("analyze table t index idx_b") + res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_b'") + 
c.Assert(len(res.Rows()), Equals, 1) + c.Assert(res.Rows()[0][7], Equals, "4") + res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'b'") + c.Assert(len(res.Rows()), Equals, 0) + tk.MustExec("analyze table t index idx_c_a") + res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_c_a'") + c.Assert(len(res.Rows()), Equals, 1) + c.Assert(res.Rows()[0][7], Equals, "0") + res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'c'") + c.Assert(len(res.Rows()), Equals, 0) + res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'a'") + c.Assert(len(res.Rows()), Equals, 0) + tk.MustExec("truncate table t") + tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)") + res = tk.MustQuery("show stats_histograms where table_name = 't'") + c.Assert(len(res.Rows()), Equals, 0) + tk.MustExec("analyze table t index") + res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort() + c.Assert(len(res.Rows()), Equals, 2) + c.Assert(res.Rows()[0][7], Equals, "4") + c.Assert(res.Rows()[1][7], Equals, "0") + tk.MustExec("truncate table t") + tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)") + tk.MustExec("analyze table t") + res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort() + c.Assert(len(res.Rows()), Equals, 5) + c.Assert(res.Rows()[0][7], Equals, "1") + c.Assert(res.Rows()[1][7], Equals, "4") + c.Assert(res.Rows()[2][7], Equals, "1") + c.Assert(res.Rows()[3][7], Equals, "4") + c.Assert(res.Rows()[4][7], Equals, "0") } func (s *testSuite1) TestShowPartitionStats(c *C) { diff --git a/planner/core/planbuilder.go b/planner/core/planbuilder.go index 1a327bd3bd880..a8c2468687331 100644 --- a/planner/core/planbuilder.go +++ b/planner/core/planbuilder.go @@ -688,11 +688,11 @@ func (b *PlanBuilder) buildCheckIndexSchema(tn *ast.TableName, indexName string) // 
getColsInfo returns the info of index columns, normal columns and primary key. func getColsInfo(tn *ast.TableName) (indicesInfo []*model.IndexInfo, colsInfo []*model.ColumnInfo, pkCol *model.ColumnInfo) { tbl := tn.TableInfo - if tbl.PKIsHandle { - for _, col := range tbl.Columns { - if mysql.HasPriKeyFlag(col.Flag) { - pkCol = col - } + for _, col := range tbl.Columns { + if tbl.PKIsHandle && mysql.HasPriKeyFlag(col.Flag) { + pkCol = col + } else { + colsInfo = append(colsInfo, col) } } for _, idx := range tn.TableInfo.Indices { @@ -700,12 +700,6 @@ func getColsInfo(tn *ast.TableName) (indicesInfo []*model.IndexInfo, colsInfo [] indicesInfo = append(indicesInfo, idx) } } - for _, col := range tbl.Columns { - if col == pkCol { - continue - } - colsInfo = append(colsInfo, col) - } return } diff --git a/statistics/feedback.go b/statistics/feedback.go index be8b5bac1af69..d5b26a0d7b12b 100644 --- a/statistics/feedback.go +++ b/statistics/feedback.go @@ -1077,6 +1077,7 @@ func dumpFeedbackForIndex(h *Handle, q *QueryFeedback, t *Table) error { } func (q *QueryFeedback) dumpRangeFeedback(sc *stmtctx.StatementContext, h *Handle, ran *ranger.Range, rangeCount float64) error { + lowIsNull := ran.LowVal[0].IsNull() if q.tp == indexType { lower, err := codec.EncodeKey(sc, nil, ran.LowVal[0]) if err != nil { @@ -1102,8 +1103,17 @@ func (q *QueryFeedback) dumpRangeFeedback(sc *stmtctx.StatementContext, h *Handl ranges := q.hist.SplitRange(sc, []*ranger.Range{ran}, q.tp == indexType) counts := make([]float64, 0, len(ranges)) sum := 0.0 - for _, r := range ranges { + for i, r := range ranges { + // Though after `SplitRange`, we may have ranges like `[l, r]`, we still use + // `betweenRowCount` to compute the estimation since the ranges of feedback are all in `[l, r)` + // form, that is to say, we ignore the exclusiveness of ranges from `SplitRange` and just use + // its result of boundary values. 
count := q.hist.betweenRowCount(r.LowVal[0], r.HighVal[0]) + // We have to include `NullCount` of histogram for [l, r) cases where l is null because `betweenRowCount` + // does not include null values of lower bound. + if i == 0 && lowIsNull { + count += float64(q.hist.NullCount) + } sum += count counts = append(counts, count) } diff --git a/statistics/histogram.go b/statistics/histogram.go index 09ab5e1f99861..ce47f988ea471 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -412,27 +412,22 @@ func (hg *Histogram) equalRowCount(value types.Datum) float64 { // greaterRowCount estimates the row count where the column greater than value. func (hg *Histogram) greaterRowCount(value types.Datum) float64 { - gtCount := hg.totalRowCount() - hg.lessRowCount(value) - hg.equalRowCount(value) + gtCount := hg.notNullCount() - hg.lessRowCount(value) - hg.equalRowCount(value) if gtCount < 0 { gtCount = 0 } return gtCount } -// greaterAndEqRowCount estimates the row count where the column greater than or equal to value. -func (hg *Histogram) greaterAndEqRowCount(value types.Datum) float64 { - return hg.totalRowCount() - hg.lessRowCount(value) -} - // lessRowCount estimates the row count where the column less than value. func (hg *Histogram) lessRowCountWithBktIdx(value types.Datum) (float64, int) { - // all the values is null + // All the values are null. if hg.Bounds.NumRows() == 0 { return 0, 0 } index, match := hg.Bounds.LowerBound(0, &value) if index == hg.Bounds.NumRows() { - return hg.totalRowCount(), hg.Len() - 1 + return hg.notNullCount(), hg.Len() - 1 } // Since we store the lower and upper bound together, so dividing the index by 2 will get the bucket index. bucketIdx := index / 2 @@ -455,21 +450,16 @@ func (hg *Histogram) lessRowCount(value types.Datum) float64 { return result } -// lessAndEqRowCount estimates the row count where the column less than or equal to value. 
-func (hg *Histogram) lessAndEqRowCount(value types.Datum) float64 { - return hg.lessRowCount(value) + hg.equalRowCount(value) -} - // betweenRowCount estimates the row count where column greater or equal to a and less than b. func (hg *Histogram) betweenRowCount(a, b types.Datum) float64 { lessCountA := hg.lessRowCount(a) lessCountB := hg.lessRowCount(b) // If lessCountA is not less than lessCountB, it may be that they fall to the same bucket and we cannot estimate // the fraction, so we use `totalCount / NDV` to estimate the row count, but the result should not greater than - // lessCountB or totalRowCount-lessCountA. + // lessCountB or notNullCount-lessCountA. if lessCountA >= lessCountB && hg.NDV > 0 { - result := math.Min(lessCountB, hg.totalRowCount()-lessCountA) - return math.Min(result, hg.totalRowCount()/float64(hg.NDV)) + result := math.Min(lessCountB, hg.notNullCount()-lessCountA) + return math.Min(result, hg.notNullCount()/float64(hg.NDV)) } return lessCountB - lessCountA } @@ -478,6 +468,9 @@ func (hg *Histogram) totalRowCount() float64 { return hg.notNullCount() + float64(hg.NullCount) } +// notNullCount indicates the count of non-null values in column histogram and single-column index histogram, +// for multi-column index histogram, since we cannot define null for the row, we treat all rows as non-null, that means, +// notNullCount would return same value as totalRowCount for multi-column index histograms. func (hg *Histogram) notNullCount() float64 { if hg.Len() == 0 { return 0 @@ -780,7 +773,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo if val.IsNull() { return float64(c.NullCount), nil } - // all the values is null + // All the values are null. if c.Histogram.Bounds.NumRows() == 0 { return 0.0, nil } @@ -814,18 +807,24 @@ func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*range } continue } - // the interval case. + // The interval case. 
cnt := c.betweenRowCount(rg.LowVal[0], rg.HighVal[0]) - if c.outOfRange(rg.LowVal[0]) || c.outOfRange(rg.HighVal[0]) { + if (c.outOfRange(rg.LowVal[0]) && !rg.LowVal[0].IsNull()) || c.outOfRange(rg.HighVal[0]) { cnt += float64(modifyCount) / outOfRangeBetweenRate } - if rg.LowExclude { + // `betweenRowCount` returns count for [l, h) range, we adjust cnt for boundaries here. + // Note that, `cnt` does not include null values, we need to specially handle cases + // where null is the lower bound. + if rg.LowExclude && !rg.LowVal[0].IsNull() { lowCnt, err := c.equalRowCount(sc, rg.LowVal[0], modifyCount) if err != nil { return 0, errors.Trace(err) } cnt -= lowCnt } + if !rg.LowExclude && rg.LowVal[0].IsNull() { + cnt += float64(c.NullCount) + } if !rg.HighExclude { highCnt, err := c.equalRowCount(sc, rg.HighVal[0], modifyCount) if err != nil { @@ -861,34 +860,49 @@ func (idx *Index) IsInvalid(collPseudo bool) bool { return (collPseudo && idx.NotAccurate()) || idx.totalRowCount() == 0 } -func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCount int64) float64 { +var nullKeyBytes, _ = codec.EncodeKey(nil, nil, types.NewDatum(nil)) + +func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCount int64) (float64, error) { + if len(idx.Info.Columns) == 1 { + if bytes.Equal(b, nullKeyBytes) { + return float64(idx.NullCount), nil + } + } val := types.NewBytesDatum(b) if idx.NDV > 0 && idx.outOfRange(val) { - return float64(modifyCount) / (float64(idx.NDV)) + return float64(modifyCount) / (float64(idx.NDV)), nil } if idx.CMSketch != nil { - return float64(idx.CMSketch.QueryBytes(b)) + return float64(idx.CMSketch.QueryBytes(b)), nil } - return idx.Histogram.equalRowCount(val) + return idx.Histogram.equalRowCount(val), nil } func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.Range, modifyCount int64) (float64, error) { totalCount := float64(0) + isSingleCol := len(idx.Info.Columns) == 1 for _, indexRange 
:= range indexRanges { lb, err := codec.EncodeKey(sc, nil, indexRange.LowVal...) if err != nil { - return 0, errors.Trace(err) + return 0, err } rb, err := codec.EncodeKey(sc, nil, indexRange.HighVal...) if err != nil { - return 0, errors.Trace(err) + return 0, err } fullLen := len(indexRange.LowVal) == len(indexRange.HighVal) && len(indexRange.LowVal) == len(idx.Info.Columns) - if fullLen && bytes.Equal(lb, rb) { - if !indexRange.LowExclude && !indexRange.HighExclude { - totalCount += idx.equalRowCount(sc, lb, modifyCount) + if bytes.Equal(lb, rb) { + if indexRange.LowExclude || indexRange.HighExclude { + continue + } + if fullLen { + count, err := idx.equalRowCount(sc, lb, modifyCount) + if err != nil { + return 0, err + } + totalCount += count + continue } - continue } if indexRange.LowExclude { lb = kv.Key(lb).PrefixNext() @@ -899,9 +913,13 @@ func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*range l := types.NewBytesDatum(lb) r := types.NewBytesDatum(rb) totalCount += idx.betweenRowCount(l, r) - if idx.outOfRange(l) || idx.outOfRange(r) { + lowIsNull := bytes.Equal(lb, nullKeyBytes) + if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) { totalCount += float64(modifyCount) / outOfRangeBetweenRate } + if isSingleCol && lowIsNull { + totalCount += float64(idx.NullCount) + } } if totalCount > idx.totalRowCount() { totalCount = idx.totalRowCount() diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index 429165b6283a4..338b4e3039c6d 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -360,3 +360,62 @@ func BenchmarkSelectivity(b *testing.B) { }) pprof.StopCPUProfile() } + +func (s *testStatsSuite) TestColumnIndexNullEstimation(c *C) { + defer cleanEnv(c, s.store, s.do) + testKit := testkit.NewTestKit(c, s.store) + testKit.MustExec("use test") + testKit.MustExec("drop table if exists t") + testKit.MustExec("create table t(a int, b int, c int, index idx_b(b), 
index idx_c_a(c, a))") + testKit.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null);") + h := s.do.StatsHandle() + c.Assert(h.DumpStatsDeltaToKV(statistics.DumpAll), IsNil) + testKit.MustExec("analyze table t") + testKit.MustQuery(`explain select b from t where b is null`).Check(testkit.Rows( + "IndexReader_6 4.00 root index:IndexScan_5", + "└─IndexScan_5 4.00 cop table:t, index:b, range:[NULL,NULL], keep order:false", + )) + testKit.MustQuery(`explain select b from t where b is not null`).Check(testkit.Rows( + "IndexReader_6 1.00 root index:IndexScan_5", + "└─IndexScan_5 1.00 cop table:t, index:b, range:[-inf,+inf], keep order:false", + )) + testKit.MustQuery(`explain select b from t where b is null or b > 3`).Check(testkit.Rows( + "IndexReader_6 4.00 root index:IndexScan_5", + "└─IndexScan_5 4.00 cop table:t, index:b, range:[NULL,NULL], (3,+inf], keep order:false", + )) + testKit.MustQuery(`explain select b from t use index(idx_b)`).Check(testkit.Rows( + "IndexReader_5 5.00 root index:IndexScan_4", + "└─IndexScan_4 5.00 cop table:t, index:b, range:[NULL,+inf], keep order:false", + )) + testKit.MustQuery(`explain select b from t where b < 4`).Check(testkit.Rows( + "IndexReader_6 1.00 root index:IndexScan_5", + "└─IndexScan_5 1.00 cop table:t, index:b, range:[-inf,4), keep order:false", + )) + // Make sure column stats has been loaded. 
+ testKit.MustExec(`explain select * from t where a is null`) + c.Assert(h.LoadNeededHistograms(), IsNil) + testKit.MustQuery(`explain select * from t where a is null`).Check(testkit.Rows( + "TableReader_7 1.00 root data:Selection_6", + "└─Selection_6 1.00 cop isnull(test.t.a)", + " └─TableScan_5 5.00 cop table:t, range:[-inf,+inf], keep order:false", + )) + testKit.MustQuery(`explain select * from t where a is not null`).Check(testkit.Rows( + "TableReader_7 4.00 root data:Selection_6", + "└─Selection_6 4.00 cop not(isnull(test.t.a))", + " └─TableScan_5 5.00 cop table:t, range:[-inf,+inf], keep order:false", + )) + testKit.MustQuery(`explain select * from t where a is null or a > 3`).Check(testkit.Rows( + "TableReader_7 2.00 root data:Selection_6", + "└─Selection_6 2.00 cop or(isnull(test.t.a), gt(test.t.a, 3))", + " └─TableScan_5 5.00 cop table:t, range:[-inf,+inf], keep order:false", + )) + testKit.MustQuery(`explain select * from t`).Check(testkit.Rows( + "TableReader_5 5.00 root data:TableScan_4", + "└─TableScan_4 5.00 cop table:t, range:[-inf,+inf], keep order:false", + )) + testKit.MustQuery(`explain select * from t where a < 4`).Check(testkit.Rows( + "TableReader_7 3.00 root data:Selection_6", + "└─Selection_6 3.00 cop lt(test.t.a, 4)", + " └─TableScan_5 5.00 cop table:t, range:[-inf,+inf], keep order:false", + )) +} diff --git a/statistics/statistics_test.go b/statistics/statistics_test.go index cfd2f4eb51a12..316dbd90f24fd 100644 --- a/statistics/statistics_test.go +++ b/statistics/statistics_test.go @@ -314,12 +314,6 @@ func (s *testStatisticsSuite) TestBuild(c *C) { c.Check(int(count), Equals, 20000) count = col.betweenRowCount(types.NewIntDatum(30000), types.NewIntDatum(35000)) c.Check(int(count), Equals, 5000) - count = col.greaterAndEqRowCount(types.NewIntDatum(1001)) - c.Check(int(count), Equals, 98999) - count = col.lessAndEqRowCount(types.NewIntDatum(99999)) - c.Check(int(count), Equals, 100000) - count = col.lessAndEqRowCount(types.Datum{}) - 
c.Check(int(count), Equals, 0) count = col.greaterRowCount(types.NewIntDatum(1001)) c.Check(int(count), Equals, 98998) count = col.lessRowCount(types.NewIntDatum(99999)) diff --git a/statistics/table.go b/statistics/table.go index eea3c5c43fb8b..9f9b3bb3977e1 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -355,7 +355,7 @@ func (t *Table) ColumnGreaterRowCount(sc *stmtctx.StatementContext, value types. return c.greaterRowCount(value) * c.getIncreaseFactor(t.Count) } -// ColumnLessRowCount estimates the row count where the column less than value. +// ColumnLessRowCount estimates the row count where the column less than value. Note that null values are not counted. func (t *Table) ColumnLessRowCount(sc *stmtctx.StatementContext, value types.Datum, colID int64) float64 { c, ok := t.Columns[colID] if !ok || c.IsInvalid(sc, t.Pseudo) { @@ -370,7 +370,11 @@ func (t *Table) ColumnBetweenRowCount(sc *stmtctx.StatementContext, a, b types.D if !ok || c.IsInvalid(sc, t.Pseudo) { return float64(t.Count) / pseudoBetweenRate } - return c.betweenRowCount(a, b) * c.getIncreaseFactor(t.Count) + count := c.betweenRowCount(a, b) + if a.IsNull() { + count += float64(c.NullCount) + } + return count * c.getIncreaseFactor(t.Count) } // ColumnEqualRowCount estimates the row count where the column equals to value. @@ -509,13 +513,27 @@ func (coll *HistColl) GenerateHistCollFromColumnInfo(infos []*model.ColumnInfo, return newColl } +// isSingleColIdxNullRange checks if a range is [NULL, NULL] on a single-column index. 
+func isSingleColIdxNullRange(idx *Index, ran *ranger.Range) bool { + if len(idx.Info.Columns) > 1 { + return false + } + l, h := ran.LowVal[0], ran.HighVal[0] + if l.IsNull() && h.IsNull() { + return true + } + return false +} + func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idxID int64, indexRanges []*ranger.Range) (float64, error) { idx := coll.Indices[idxID] totalCount := float64(0) for _, ran := range indexRanges { rangePosition := getOrdinalOfRangeCond(sc, ran) - // first one is range, just use the previous way to estimate - if rangePosition == 0 { + // If first one is range, just use the previous way to estimate; if it is [NULL, NULL] range + // on single-column index, use previous way as well, because CMSketch does not contain null + // values in this case. + if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) { count, err := idx.getRowCount(sc, []*ranger.Range{ran}, coll.ModifyCount) if err != nil { return 0, errors.Trace(err) diff --git a/util/ranger/points.go b/util/ranger/points.go index db07588a861f6..9e73643068394 100644 --- a/util/ranger/points.go +++ b/util/ranger/points.go @@ -134,11 +134,21 @@ func FullIntRange(isUnsigned bool) []*Range { return []*Range{{LowVal: []types.Datum{types.NewIntDatum(math.MinInt64)}, HighVal: []types.Datum{types.NewIntDatum(math.MaxInt64)}}} } -// FullRange is (-∞, +∞) for Range. +// FullRange is [null, +∞) for Range. func FullRange() []*Range { return []*Range{{LowVal: []types.Datum{{}}, HighVal: []types.Datum{types.MaxValueDatum()}}} } +// FullNotNullRange is (-∞, +∞) for Range. +func FullNotNullRange() []*Range { + return []*Range{{LowVal: []types.Datum{types.MinNotNullDatum()}, HighVal: []types.Datum{types.MaxValueDatum()}}} +} + +// NullRange is [null, null] for Range. +func NullRange() []*Range { + return []*Range{{LowVal: []types.Datum{{}}, HighVal: []types.Datum{{}}}} +} + // builder is the range builder struct. type builder struct { err error