Skip to content

Commit

Permalink
*: improve row count estimation for columns with NULL
Browse files Browse the repository at this point in the history
  • Loading branch information
eurekaka committed Mar 27, 2019
1 parent 83ff58c commit 829c5c5
Show file tree
Hide file tree
Showing 9 changed files with 285 additions and 77 deletions.
98 changes: 76 additions & 22 deletions executor/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,9 @@ func analyzeIndexPushdown(idxExec *AnalyzeIndexExec) statistics.AnalyzeResult {
Cms: []*statistics.CMSketch{cms},
IsIndex: 1,
}
result.Count = hist.NullCount
if hist.Len() > 0 {
result.Count = hist.Buckets[hist.Len()-1].Count
result.Count += hist.Buckets[hist.Len()-1].Count
}
return result
}
Expand All @@ -167,12 +168,16 @@ type AnalyzeIndexExec struct {
priority int
analyzePB *tipb.AnalyzeReq
result distsql.SelectResult
countNullRes distsql.SelectResult
maxNumBuckets uint64
}

func (e *AnalyzeIndexExec) open() error {
// fetchAnalyzeResult builds and dispatches the `kv.Request` from given ranges, and stores the `SelectResult`
// in corresponding fields based on the input `isNullRange` argument, which indicates if the range is the
// special null range for single-column index to get the null count.
func (e *AnalyzeIndexExec) fetchAnalyzeResult(ranges []*ranger.Range, isNullRange bool) error {
var builder distsql.RequestBuilder
kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranger.FullRange()).
kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranges).
SetAnalyzeRequest(e.analyzePB).
SetKeepOrder(true).
SetConcurrency(e.concurrency).
Expand All @@ -181,29 +186,51 @@ func (e *AnalyzeIndexExec) open() error {
return errors.Trace(err)
}
ctx := context.TODO()
e.result, err = distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
result, err := distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
if err != nil {
return errors.Trace(err)
}
e.result.Fetch(ctx)
result.Fetch(ctx)
if isNullRange {
e.countNullRes = result
} else {
e.result = result
}
return nil
}

func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
if err = e.open(); err != nil {
return nil, nil, errors.Trace(err)
func (e *AnalyzeIndexExec) open() error {
ranges := ranger.FullRange()
// For single-column index, we do not load null rows from TiKV, so the built histogram would not include
// null values, and its `NullCount` would be set by result of another distsql call to get null rows.
// For multi-column index, we cannot define null for the rows, so we still use full range, and the rows
// containing null fields would exist in built histograms. Note that, the `NullCount` of histograms for
// multi-column index is always 0 then.
if len(e.idxInfo.Columns) == 1 {
ranges = ranger.FullNotNullRange()
}
defer func() {
if err1 := e.result.Close(); err1 != nil {
hist = nil
cms = nil
err = errors.Trace(err1)
err := e.fetchAnalyzeResult(ranges, false)
if err != nil {
return err
}
if len(e.idxInfo.Columns) == 1 {
ranges = ranger.NullRange()
err = e.fetchAnalyzeResult(ranges, true)
if err != nil {
return err
}
}()
hist = &statistics.Histogram{}
cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
}
return nil
}

func (e *AnalyzeIndexExec) buildStatsFromResult(result distsql.SelectResult, needCMS bool) (*statistics.Histogram, *statistics.CMSketch, error) {
hist := &statistics.Histogram{}
var cms *statistics.CMSketch
if needCMS {
cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
}
for {
data, err := e.result.NextRaw(context.TODO())
data, err := result.NextRaw(context.TODO())
if err != nil {
return nil, nil, errors.Trace(err)
}
Expand All @@ -217,15 +244,42 @@ func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statis
}
hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), int(e.maxNumBuckets))
if err != nil {
return nil, nil, errors.Trace(err)
return nil, nil, err
}
if resp.Cms != nil {
err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
if err != nil {
return nil, nil, errors.Trace(err)
if needCMS {
if resp.Cms == nil {
logutil.Logger(context.TODO()).Warn("nil CMS in response", zap.String("table", e.idxInfo.Table.O), zap.String("index", e.idxInfo.Name.O))
} else {
err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
if err != nil {
return nil, nil, errors.Trace(err)
}
}
}
}
return hist, cms, nil
}

// buildStats opens the analyze requests for the index and merges their
// responses into a single histogram and CM sketch.
//
// When open also issued the null-range request (e.countNullRes is non-nil),
// the Count of the last bucket built from that result is stored as
// hist.NullCount. The returned histogram's ID is set to the index ID.
//
// On any error, including a failure to close the underlying results, the
// returned hist and cms are nil.
func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
	// Registered before open so that a result stored by a partially successful
	// open (main request dispatched, null-range request failed) is still closed.
	// NOTE(review): closeAll is handed nil results here (e.countNullRes stays
	// nil for multi-column indexes), so it is assumed to skip them — confirm.
	defer func() {
		// Only surface the close error when nothing failed earlier; a clean
		// close must never overwrite an error produced by the body.
		if closeErr := closeAll(e.result, e.countNullRes); closeErr != nil && err == nil {
			hist, cms, err = nil, nil, closeErr
		}
	}()
	if err = e.open(); err != nil {
		return nil, nil, err
	}
	hist, cms, err = e.buildStatsFromResult(e.result, true)
	if err != nil {
		return nil, nil, err
	}
	if e.countNullRes != nil {
		// No CM sketch is needed for the null range; only the row count is used.
		var nullHist *statistics.Histogram
		nullHist, _, err = e.buildStatsFromResult(e.countNullRes, false)
		if err != nil {
			return nil, nil, err
		}
		if l := nullHist.Len(); l > 0 {
			hist.NullCount = nullHist.Buckets[l-1].Count
		}
	}
	hist.ID = e.idxInfo.ID
	return hist, cms, nil
}
Expand Down
53 changes: 52 additions & 1 deletion executor/show_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,58 @@ func (s *testSuite1) TestShowStatsHasNullValue(c *C) {
tk.MustExec("create table t (a int, index idx(a))")
tk.MustExec("insert into t values(NULL)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 NULL NULL"))
// Null values are excluded from histogram for single-column index.
tk.MustQuery("show stats_buckets").Check(testkit.Rows())
tk.MustExec("insert into t values(1)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Sort().Check(testkit.Rows(
"test t a 0 0 1 1 1 1",
"test t idx 1 0 1 1 1 1",
))
tk.MustExec("drop table t")
tk.MustExec("create table t (a int, b int, index idx(a, b))")
tk.MustExec("insert into t values(NULL, NULL)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 (NULL, NULL) (NULL, NULL)"))

tk.MustExec("drop table t")
tk.MustExec("create table t(a int, b int, c int, index idx_b(b), index idx_c_a(c, a))")
tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
res := tk.MustQuery("show stats_histograms where table_name = 't'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("analyze table t index idx_b")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_b'")
c.Assert(len(res.Rows()), Equals, 1)
c.Assert(res.Rows()[0][7], Equals, "4")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'b'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("analyze table t index idx_c_a")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_c_a'")
c.Assert(len(res.Rows()), Equals, 1)
c.Assert(res.Rows()[0][7], Equals, "0")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'c'")
c.Assert(len(res.Rows()), Equals, 0)
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'a'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("truncate table t")
tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
res = tk.MustQuery("show stats_histograms where table_name = 't'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("analyze table t index")
res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
c.Assert(len(res.Rows()), Equals, 2)
c.Assert(res.Rows()[0][7], Equals, "4")
c.Assert(res.Rows()[1][7], Equals, "0")
tk.MustExec("truncate table t")
tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
tk.MustExec("analyze table t")
res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
c.Assert(len(res.Rows()), Equals, 5)
c.Assert(res.Rows()[0][7], Equals, "1")
c.Assert(res.Rows()[1][7], Equals, "4")
c.Assert(res.Rows()[2][7], Equals, "1")
c.Assert(res.Rows()[3][7], Equals, "4")
c.Assert(res.Rows()[4][7], Equals, "0")
}

func (s *testSuite1) TestShowPartitionStats(c *C) {
Expand Down
16 changes: 5 additions & 11 deletions planner/core/planbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -688,24 +688,18 @@ func (b *PlanBuilder) buildCheckIndexSchema(tn *ast.TableName, indexName string)
// getColsInfo returns the info of index columns, normal columns and primary key.
func getColsInfo(tn *ast.TableName) (indicesInfo []*model.IndexInfo, colsInfo []*model.ColumnInfo, pkCol *model.ColumnInfo) {
tbl := tn.TableInfo
if tbl.PKIsHandle {
for _, col := range tbl.Columns {
if mysql.HasPriKeyFlag(col.Flag) {
pkCol = col
}
for _, col := range tbl.Columns {
if tbl.PKIsHandle && mysql.HasPriKeyFlag(col.Flag) {
pkCol = col
} else {
colsInfo = append(colsInfo, col)
}
}
for _, idx := range tn.TableInfo.Indices {
if idx.State == model.StatePublic {
indicesInfo = append(indicesInfo, idx)
}
}
for _, col := range tbl.Columns {
if col == pkCol {
continue
}
colsInfo = append(colsInfo, col)
}
return
}

Expand Down
12 changes: 11 additions & 1 deletion statistics/feedback.go
Original file line number Diff line number Diff line change
Expand Up @@ -1077,6 +1077,7 @@ func dumpFeedbackForIndex(h *Handle, q *QueryFeedback, t *Table) error {
}

func (q *QueryFeedback) dumpRangeFeedback(sc *stmtctx.StatementContext, h *Handle, ran *ranger.Range, rangeCount float64) error {
lowIsNull := ran.LowVal[0].IsNull()
if q.tp == indexType {
lower, err := codec.EncodeKey(sc, nil, ran.LowVal[0])
if err != nil {
Expand All @@ -1102,8 +1103,17 @@ func (q *QueryFeedback) dumpRangeFeedback(sc *stmtctx.StatementContext, h *Handl
ranges := q.hist.SplitRange(sc, []*ranger.Range{ran}, q.tp == indexType)
counts := make([]float64, 0, len(ranges))
sum := 0.0
for _, r := range ranges {
for i, r := range ranges {
// Though after `SplitRange`, we may have ranges like `[l, r]`, we still use
// `betweenRowCount` to compute the estimation since the ranges of feedback are all in `[l, r)`
// form, that is to say, we ignore the exclusiveness of ranges from `SplitRange` and just use
// its result of boundary values.
count := q.hist.betweenRowCount(r.LowVal[0], r.HighVal[0])
// We have to include `NullCount` of histogram for [l, r) cases where l is null because `betweenRowCount`
// does not include null values of lower bound.
if i == 0 && lowIsNull {
count += float64(q.hist.NullCount)
}
sum += count
counts = append(counts, count)
}
Expand Down
Loading

0 comments on commit 829c5c5

Please sign in to comment.