Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

*: improve NULL count estimation for single column index #9474

Merged
merged 2 commits into from
Mar 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 76 additions & 22 deletions executor/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,9 @@ func analyzeIndexPushdown(idxExec *AnalyzeIndexExec) statistics.AnalyzeResult {
Cms: []*statistics.CMSketch{cms},
IsIndex: 1,
}
result.Count = hist.NullCount
if hist.Len() > 0 {
result.Count = hist.Buckets[hist.Len()-1].Count
result.Count += hist.Buckets[hist.Len()-1].Count
}
return result
}
Expand All @@ -167,12 +168,16 @@ type AnalyzeIndexExec struct {
priority int
analyzePB *tipb.AnalyzeReq
result distsql.SelectResult
countNullRes distsql.SelectResult
maxNumBuckets uint64
}

func (e *AnalyzeIndexExec) open() error {
// fetchAnalyzeResult builds and dispatches the `kv.Request` for the given ranges, and stores the resulting
// `SelectResult` in the corresponding field based on the `isNullRange` argument, which indicates whether the
// ranges are the special null range used to get the null count of a single-column index.
func (e *AnalyzeIndexExec) fetchAnalyzeResult(ranges []*ranger.Range, isNullRange bool) error {
var builder distsql.RequestBuilder
kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranger.FullRange()).
kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranges).
SetAnalyzeRequest(e.analyzePB).
SetKeepOrder(true).
SetConcurrency(e.concurrency).
Expand All @@ -181,29 +186,51 @@ func (e *AnalyzeIndexExec) open() error {
return errors.Trace(err)
}
ctx := context.TODO()
e.result, err = distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
result, err := distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
if err != nil {
return errors.Trace(err)
}
e.result.Fetch(ctx)
result.Fetch(ctx)
if isNullRange {
e.countNullRes = result
} else {
e.result = result
}
return nil
}

func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
if err = e.open(); err != nil {
return nil, nil, errors.Trace(err)
func (e *AnalyzeIndexExec) open() error {
ranges := ranger.FullRange()
// For single-column index, we do not load null rows from TiKV, so the built histogram would not include
// null values, and its `NullCount` would be set by result of another distsql call to get null rows.
// For multi-column index, we cannot define null for the rows, so we still use full range, and the rows
// containing null fields would exist in built histograms. Note that, the `NullCount` of histograms for
// multi-column index is always 0 then.
if len(e.idxInfo.Columns) == 1 {
ranges = ranger.FullNotNullRange()
}
defer func() {
if err1 := e.result.Close(); err1 != nil {
hist = nil
cms = nil
err = errors.Trace(err1)
err := e.fetchAnalyzeResult(ranges, false)
if err != nil {
return err
}
if len(e.idxInfo.Columns) == 1 {
ranges = ranger.NullRange()
err = e.fetchAnalyzeResult(ranges, true)
zz-jason marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return err
}
}()
hist = &statistics.Histogram{}
cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
}
return nil
}

func (e *AnalyzeIndexExec) buildStatsFromResult(result distsql.SelectResult, needCMS bool) (*statistics.Histogram, *statistics.CMSketch, error) {
hist := &statistics.Histogram{}
var cms *statistics.CMSketch
if needCMS {
cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
}
for {
data, err := e.result.NextRaw(context.TODO())
data, err := result.NextRaw(context.TODO())
if err != nil {
return nil, nil, errors.Trace(err)
}
Expand All @@ -217,15 +244,42 @@ func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statis
}
hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), int(e.maxNumBuckets))
if err != nil {
return nil, nil, errors.Trace(err)
return nil, nil, err
}
if resp.Cms != nil {
err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
if err != nil {
return nil, nil, errors.Trace(err)
if needCMS {
if resp.Cms == nil {
logutil.Logger(context.TODO()).Warn("nil CMS in response", zap.String("table", e.idxInfo.Table.O), zap.String("index", e.idxInfo.Name.O))
} else {
err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
if err != nil {
return nil, nil, errors.Trace(err)
}
}
}
}
return hist, cms, nil
}

// buildStats issues the analyze requests through open, merges the streamed
// responses into the index histogram and CM sketch, and closes the underlying
// results before returning.
//
// When open also fetched the null-range result (countNullRes is non-nil), the
// histogram built from it supplies NullCount: the Count of its last bucket.
//
// On any failure, including a failure to close the results, hist and cms are
// nil and err is non-nil.
func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
	if err = e.open(); err != nil {
		return nil, nil, err
	}
	defer func() {
		// Close both results unconditionally, but only surface the close error
		// when nothing failed earlier; assigning it blindly would replace a real
		// error with nil and make a failed build look successful.
		if closeErr := closeAll(e.result, e.countNullRes); closeErr != nil && err == nil {
			hist, cms, err = nil, nil, closeErr
		}
	}()
	hist, cms, err = e.buildStatsFromResult(e.result, true)
	if err != nil {
		return nil, nil, err
	}
	if e.countNullRes != nil {
		// No CM sketch is requested for the null range; only the row count is used.
		nullHist, _, nullErr := e.buildStatsFromResult(e.countNullRes, false)
		if nullErr != nil {
			return nil, nil, nullErr
		}
		if l := nullHist.Len(); l > 0 {
			hist.NullCount = nullHist.Buckets[l-1].Count
		}
	}
	hist.ID = e.idxInfo.ID
	return hist, cms, nil
}
Expand Down
53 changes: 52 additions & 1 deletion executor/show_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,58 @@ func (s *testSuite1) TestShowStatsHasNullValue(c *C) {
tk.MustExec("create table t (a int, index idx(a))")
tk.MustExec("insert into t values(NULL)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 NULL NULL"))
// Null values are excluded from histogram for single-column index.
tk.MustQuery("show stats_buckets").Check(testkit.Rows())
tk.MustExec("insert into t values(1)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Sort().Check(testkit.Rows(
"test t a 0 0 1 1 1 1",
"test t idx 1 0 1 1 1 1",
))
tk.MustExec("drop table t")
tk.MustExec("create table t (a int, b int, index idx(a, b))")
tk.MustExec("insert into t values(NULL, NULL)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 (NULL, NULL) (NULL, NULL)"))

tk.MustExec("drop table t")
tk.MustExec("create table t(a int, b int, c int, index idx_b(b), index idx_c_a(c, a))")
tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
res := tk.MustQuery("show stats_histograms where table_name = 't'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("analyze table t index idx_b")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_b'")
c.Assert(len(res.Rows()), Equals, 1)
c.Assert(res.Rows()[0][7], Equals, "4")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'b'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("analyze table t index idx_c_a")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_c_a'")
c.Assert(len(res.Rows()), Equals, 1)
c.Assert(res.Rows()[0][7], Equals, "0")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'c'")
c.Assert(len(res.Rows()), Equals, 0)
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'a'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("truncate table t")
tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
res = tk.MustQuery("show stats_histograms where table_name = 't'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("analyze table t index")
res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
c.Assert(len(res.Rows()), Equals, 2)
c.Assert(res.Rows()[0][7], Equals, "4")
c.Assert(res.Rows()[1][7], Equals, "0")
tk.MustExec("truncate table t")
tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
tk.MustExec("analyze table t")
res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
c.Assert(len(res.Rows()), Equals, 5)
c.Assert(res.Rows()[0][7], Equals, "1")
c.Assert(res.Rows()[1][7], Equals, "4")
c.Assert(res.Rows()[2][7], Equals, "1")
c.Assert(res.Rows()[3][7], Equals, "4")
c.Assert(res.Rows()[4][7], Equals, "0")
}

func (s *testSuite1) TestShowPartitionStats(c *C) {
Expand Down
16 changes: 5 additions & 11 deletions planner/core/planbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -687,24 +687,18 @@ func (b *PlanBuilder) buildCheckIndexSchema(tn *ast.TableName, indexName string)
// getColsInfo returns the info of index columns, normal columns and primary key.
func getColsInfo(tn *ast.TableName) (indicesInfo []*model.IndexInfo, colsInfo []*model.ColumnInfo, pkCol *model.ColumnInfo) {
tbl := tn.TableInfo
if tbl.PKIsHandle {
for _, col := range tbl.Columns {
if mysql.HasPriKeyFlag(col.Flag) {
pkCol = col
}
for _, col := range tbl.Columns {
if tbl.PKIsHandle && mysql.HasPriKeyFlag(col.Flag) {
pkCol = col
} else {
colsInfo = append(colsInfo, col)
}
}
for _, idx := range tn.TableInfo.Indices {
if idx.State == model.StatePublic {
indicesInfo = append(indicesInfo, idx)
}
}
for _, col := range tbl.Columns {
if col == pkCol {
continue
}
colsInfo = append(colsInfo, col)
}
return
}

Expand Down
12 changes: 11 additions & 1 deletion statistics/feedback.go
Original file line number Diff line number Diff line change
Expand Up @@ -1077,6 +1077,7 @@ func dumpFeedbackForIndex(h *Handle, q *QueryFeedback, t *Table) error {
}

func (q *QueryFeedback) dumpRangeFeedback(sc *stmtctx.StatementContext, h *Handle, ran *ranger.Range, rangeCount float64) error {
lowIsNull := ran.LowVal[0].IsNull()
if q.tp == indexType {
lower, err := codec.EncodeKey(sc, nil, ran.LowVal[0])
if err != nil {
Expand All @@ -1102,8 +1103,17 @@ func (q *QueryFeedback) dumpRangeFeedback(sc *stmtctx.StatementContext, h *Handl
ranges := q.hist.SplitRange(sc, []*ranger.Range{ran}, q.tp == indexType)
counts := make([]float64, 0, len(ranges))
sum := 0.0
for _, r := range ranges {
for i, r := range ranges {
// Though after `SplitRange`, we may have ranges like `[l, r]`, we still use
// `betweenRowCount` to compute the estimation since the ranges of feedback are all in `[l, r)`
// form, that is to say, we ignore the exclusiveness of ranges from `SplitRange` and just use
// its result of boundary values.
count := q.hist.betweenRowCount(r.LowVal[0], r.HighVal[0])
// We have to include `NullCount` of histogram for [l, r) cases where l is null because `betweenRowCount`
// does not include null values of lower bound.
if i == 0 && lowIsNull {
count += float64(q.hist.NullCount)
}
sum += count
counts = append(counts, count)
}
Expand Down
Loading