Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

executor: support fast analyze. #9973

Closed
wants to merge 72 commits into from
Closed
Show file tree
Hide file tree
Changes from 51 commits
Commits
Show all changes
72 commits
Select commit Hold shift + click to select a range
38eb623
ci
lzmhhh123 Mar 26, 2019
f6ee210
ci
lzmhhh123 Mar 28, 2019
f0105d0
ci
lzmhhh123 Mar 31, 2019
f05cf4a
Merge branch 'dev/plug_in_debug_pb' into dev/fast_analyze
lzmhhh123 Mar 31, 2019
586cf13
ci
lzmhhh123 Apr 1, 2019
4cd7147
improve
lzmhhh123 Apr 1, 2019
5c2e687
debug
lzmhhh123 Apr 1, 2019
289268b
improve
lzmhhh123 Apr 2, 2019
957d0a7
add some TODOs
lzmhhh123 Apr 2, 2019
86ea376
debug
lzmhhh123 Apr 2, 2019
6763c80
address comments
lzmhhh123 Apr 2, 2019
f412d13
address comments
lzmhhh123 Apr 2, 2019
e91daaa
add idx collector
lzmhhh123 Apr 2, 2019
0e9640c
ci
lzmhhh123 Apr 4, 2019
925832f
ci
lzmhhh123 Apr 4, 2019
e77c0f1
ci
lzmhhh123 Apr 4, 2019
05fbe0e
handle scan task
lzmhhh123 Apr 4, 2019
61f84d7
address comments
lzmhhh123 Apr 8, 2019
b191070
address comments
lzmhhh123 Apr 8, 2019
a149b6f
debug
lzmhhh123 Apr 8, 2019
b9fab49
improve
lzmhhh123 Apr 8, 2019
1381dba
address comments
lzmhhh123 Apr 8, 2019
c1deecc
ci
lzmhhh123 Apr 9, 2019
eedfef7
change client to snapshot
lzmhhh123 Apr 9, 2019
1552da1
fix ci
lzmhhh123 Apr 9, 2019
00b8a2a
address comments
lzmhhh123 Apr 9, 2019
d0351e3
improve
lzmhhh123 Apr 10, 2019
0ecb10f
address comments
lzmhhh123 Apr 10, 2019
1374c34
Merge branch 'master' into dev/fast_analyze
lzmhhh123 Apr 11, 2019
a3ce332
remove code
lzmhhh123 Apr 11, 2019
2ca551b
fix ci
lzmhhh123 Apr 11, 2019
1a18ee7
debug
lzmhhh123 Apr 13, 2019
5bf7254
add unit tests
lzmhhh123 Apr 14, 2019
fbb123c
squash push
lzmhhh123 Apr 14, 2019
f175b34
Merge branch 'master' into dev/fast_analyze
lzmhhh123 Apr 15, 2019
bf265b0
add test
lzmhhh123 Apr 15, 2019
da5b9eb
Merge branch 'dev/fast_analyze' of https://github.com/lzmhhh123/tidb …
lzmhhh123 Apr 15, 2019
23aef4d
limit bucket size in unit test
lzmhhh123 Apr 15, 2019
2ea1d58
improve
lzmhhh123 Apr 15, 2019
fdb3c91
address comments
lzmhhh123 Apr 16, 2019
890c229
remove global rander
lzmhhh123 Apr 16, 2019
af03002
improve
lzmhhh123 Apr 16, 2019
9254400
Split core into a single pr
erjiaqing Apr 16, 2019
2433427
remove copy and equal from pr
erjiaqing Apr 16, 2019
a7a3a19
address comments
lzmhhh123 Apr 16, 2019
77dc45b
move some code into separate functions
erjiaqing Apr 17, 2019
cfc8a93
update
erjiaqing Apr 17, 2019
6ef86f2
address comment
lzmhhh123 Apr 17, 2019
7008f89
rename some variables
erjiaqing Apr 17, 2019
2055420
upd
erjiaqing Apr 17, 2019
af8e40c
fix
erjiaqing Apr 17, 2019
bfd2128
fix
erjiaqing Apr 17, 2019
8e63f5e
Merge branch 'master' into cms_topn_core
erjiaqing Apr 17, 2019
b244ab1
merge
erjiaqing Apr 17, 2019
40ac221
upd
erjiaqing Apr 17, 2019
d947e96
fix
erjiaqing Apr 17, 2019
6148210
fix
erjiaqing Apr 17, 2019
5472c8a
fix data race
lzmhhh123 Apr 17, 2019
544f215
fix
lzmhhh123 Apr 17, 2019
33aa77e
fix
lzmhhh123 Apr 18, 2019
d4f7684
debug
lzmhhh123 Apr 18, 2019
9cb5128
debug
lzmhhh123 Apr 18, 2019
6b078b5
debug
lzmhhh123 Apr 18, 2019
ba40c91
upd
erjiaqing Apr 18, 2019
c07b72c
some rename
erjiaqing Apr 18, 2019
42d32af
fix
erjiaqing Apr 18, 2019
250d6d6
improve
lzmhhh123 Apr 18, 2019
2d59027
fix
lzmhhh123 Apr 18, 2019
0a76db7
fix test
lzmhhh123 Apr 18, 2019
466428c
Merge remote-tracking branch 'gs/cms_topn_core' into dev/fast_analyze
lzmhhh123 Apr 21, 2019
53f9745
build cmsketch
lzmhhh123 Apr 21, 2019
d29d982
debug the calculation of ndv
lzmhhh123 Apr 21, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
493 changes: 479 additions & 14 deletions executor/analyze.go

Large diffs are not rendered by default.

63 changes: 63 additions & 0 deletions executor/analyze_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@ import (
. "github.com/pingcap/check"
"github.com/pingcap/parser/model"
"github.com/pingcap/parser/mysql"
"github.com/pingcap/tidb/domain"
"github.com/pingcap/tidb/executor"
"github.com/pingcap/tidb/session"
"github.com/pingcap/tidb/sessionctx"
"github.com/pingcap/tidb/store/mockstore"
"github.com/pingcap/tidb/store/mockstore/mocktikv"
"github.com/pingcap/tidb/util/testkit"
)

Expand Down Expand Up @@ -110,6 +113,66 @@ func (s *testSuite1) TestAnalyzeParameters(c *C) {
c.Assert(tbl.Columns[1].Len(), Equals, 4)
}

func (s *testSuite1) TestFastAnalyze(c *C) {
	cluster := mocktikv.NewCluster()
	mocktikv.BootstrapWithSingleStore(cluster)
	store, err := mockstore.NewMockTikvStore(
		mockstore.WithCluster(cluster),
	)
	c.Assert(err, IsNil)
	// Release the mock store when the test finishes so it does not leak
	// into the rest of the suite.
	defer store.Close()
	var dom *domain.Domain
	dom, err = session.BootstrapSession(store)
	c.Assert(err, IsNil)
	defer dom.Close()
	tk := testkit.NewTestKit(c, store)
	// Pin the sample size and random seed so the sampled histogram below is
	// deterministic, and restore the package-level values afterwards so other
	// tests are not affected by this test's configuration.
	oldSampleSize, oldSeed := executor.MaxSampleSize, executor.RandSeed
	executor.MaxSampleSize = 1000
	executor.RandSeed = 123
	defer func() {
		executor.MaxSampleSize = oldSampleSize
		executor.RandSeed = oldSeed
	}()

	tk.MustExec("use test")
	tk.MustExec("drop table if exists t")
	tk.MustExec("create table t(a int primary key, b int, index index_b(b))")
	tk.MustExec("set @@session.tidb_enable_fast_analyze=1")
	tk.MustExec("set @@session.tidb_build_stats_concurrency=1")
	for i := 0; i < 3000; i++ {
		tk.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i, i))
	}
	tblInfo, err := dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
	c.Assert(err, IsNil)
	tid := tblInfo.Meta().ID

	// Construct 5 regions split by {600, 1200, 1800, 2400} so fast analyze
	// has to merge samples gathered from multiple regions.
	splitKeys := generateTableSplitKeyForInt(tid, []int{600, 1200, 1800, 2400})
	manipulateCluster(cluster, splitKeys)

	tk.MustExec("analyze table t with 5 buckets")

	is := executor.GetInfoSchema(tk.Se.(sessionctx.Context))
	table, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
	c.Assert(err, IsNil)
	tableInfo := table.Meta()
	tbl := dom.StatsHandle().GetTableStats(tableInfo)
	// NOTE(review): "Table:37" is a hard-coded table ID that depends on how many
	// tables bootstrap creates before this one — confirm it stays stable, or
	// derive it from tableInfo.ID instead.
	c.Assert(fmt.Sprintln(tbl), Equals,
		"Table:37 Count:3000\n"+
			"column:1 ndv:3000 totColSize:0\n"+
			"num: 603 lower_bound: 6 upper_bound: 612 repeats: 1\n"+
			"num: 603 lower_bound: 621 upper_bound: 1205 repeats: 1\n"+
			"num: 603 lower_bound: 1207 upper_bound: 1830 repeats: 1\n"+
			"num: 603 lower_bound: 1831 upper_bound: 2387 repeats: 1\n"+
			"num: 588 lower_bound: 2390 upper_bound: 2997 repeats: 1\n"+
			"column:2 ndv:3000 totColSize:0\n"+
			"num: 603 lower_bound: 6 upper_bound: 612 repeats: 1\n"+
			"num: 603 lower_bound: 621 upper_bound: 1205 repeats: 1\n"+
			"num: 603 lower_bound: 1207 upper_bound: 1830 repeats: 1\n"+
			"num: 603 lower_bound: 1831 upper_bound: 2387 repeats: 1\n"+
			"num: 588 lower_bound: 2390 upper_bound: 2997 repeats: 1\n"+
			"index:1 ndv:3000\n"+
			"num: 603 lower_bound: 6 upper_bound: 612 repeats: 1\n"+
			"num: 603 lower_bound: 621 upper_bound: 1205 repeats: 1\n"+
			"num: 603 lower_bound: 1207 upper_bound: 1830 repeats: 1\n"+
			"num: 603 lower_bound: 1831 upper_bound: 2387 repeats: 1\n"+
			"num: 588 lower_bound: 2390 upper_bound: 2997 repeats: 1\n")
}

func (s *testSuite1) TestAnalyzeTooLongColumns(c *C) {
tk := testkit.NewTestKit(c, s.store)
tk.MustExec("use test")
Expand Down
5 changes: 3 additions & 2 deletions executor/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -1413,8 +1413,8 @@ func (b *executorBuilder) buildAnalyzeFastColumn(e *AnalyzeExec, task plannercor
colsInfo: task.ColsInfo,
pkInfo: task.PKInfo,
maxNumBuckets: maxNumBuckets,
table: task.Table,
concurrency: concurrency,
wg: &sync.WaitGroup{},
},
})
}
Expand Down Expand Up @@ -1442,8 +1442,8 @@ func (b *executorBuilder) buildAnalyzeFastIndex(e *AnalyzeExec, task plannercore
PhysicalTableID: task.PhysicalTableID,
idxsInfo: []*model.IndexInfo{task.IndexInfo},
maxNumBuckets: maxNumBuckets,
table: task.Table,
concurrency: concurrency,
wg: &sync.WaitGroup{},
},
})
}
Expand All @@ -1453,6 +1453,7 @@ func (b *executorBuilder) buildAnalyze(v *plannercore.Analyze) Executor {
e := &AnalyzeExec{
baseExecutor: newBaseExecutor(b.ctx, v.Schema(), v.ExplainID()),
tasks: make([]*analyzeTask, 0, len(v.ColTasks)+len(v.IdxTasks)),
wg: &sync.WaitGroup{},
}
enableFastAnalyze := b.ctx.GetSessionVars().EnableFastAnalyze
for _, task := range v.ColTasks {
Expand Down
2 changes: 0 additions & 2 deletions planner/core/common_plans.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,15 +396,13 @@ type AnalyzeColumnsTask struct {
PhysicalTableID int64
PKInfo *model.ColumnInfo
ColsInfo []*model.ColumnInfo
Table table.Table
}

// AnalyzeIndexTask is used for analyze index.
type AnalyzeIndexTask struct {
// PhysicalTableID is the id for a partition or a table.
PhysicalTableID int64
IndexInfo *model.IndexInfo
Table table.Table
}

// Analyze represents an analyze plan
Expand Down
6 changes: 0 additions & 6 deletions planner/core/planbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -772,16 +772,11 @@ func (b *PlanBuilder) buildAnalyzeTable(as *ast.AnalyzeTableStmt) (Plan, error)
if err != nil {
return nil, err
}
table, ok := b.is.TableByID(tbl.TableInfo.ID)
if !ok {
return nil, infoschema.ErrTableNotExists.GenWithStackByArgs(tbl.DBInfo.Name.O, tbl.TableInfo.Name.O)
}
for _, idx := range idxInfo {
for _, id := range physicalIDs {
p.IdxTasks = append(p.IdxTasks, AnalyzeIndexTask{
lzmhhh123 marked this conversation as resolved.
Show resolved Hide resolved
PhysicalTableID: id,
IndexInfo: idx,
Table: table,
})
}
}
Expand All @@ -791,7 +786,6 @@ func (b *PlanBuilder) buildAnalyzeTable(as *ast.AnalyzeTableStmt) (Plan, error)
PhysicalTableID: id,
PKInfo: pkInfo,
ColsInfo: colInfo,
Table: table,
})
}
}
Expand Down
24 changes: 18 additions & 6 deletions statistics/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
package statistics

import (
"math"

"github.com/pingcap/errors"
"github.com/pingcap/tidb/sessionctx"
"github.com/pingcap/tidb/sessionctx/stmtctx"
Expand Down Expand Up @@ -93,23 +95,20 @@ func (b *SortedBuilder) Iterate(data types.Datum) error {
return nil
}

// BuildColumn builds histogram from samples for column.
func BuildColumn(ctx sessionctx.Context, numBuckets, id int64, collector *SampleCollector, tp *types.FieldType) (*Histogram, error) {
count := collector.Count
ndv := collector.FMSketch.NDV()
func buildColumnHist(ctx sessionctx.Context, numBuckets, id int64, collector *SampleCollector, tp *types.FieldType, count int64, ndv int64, nullCount int64) (*Histogram, error) {
if ndv > count {
ndv = count
}
if count == 0 || len(collector.Samples) == 0 {
return NewHistogram(id, ndv, collector.NullCount, 0, tp, 0, collector.TotalSize), nil
return NewHistogram(id, ndv, nullCount, 0, tp, 0, collector.TotalSize), nil
}
sc := ctx.GetSessionVars().StmtCtx
samples := collector.Samples
err := SortSampleItems(sc, samples)
if err != nil {
return nil, err
}
hg := NewHistogram(id, ndv, collector.NullCount, 0, tp, int(numBuckets), collector.TotalSize)
hg := NewHistogram(id, ndv, nullCount, 0, tp, int(numBuckets), collector.TotalSize)

sampleNum := int64(len(samples))
// As we use samples to build the histogram, the bucket number and repeat should multiply a factor.
Expand Down Expand Up @@ -175,6 +174,19 @@ func BuildColumn(ctx sessionctx.Context, numBuckets, id int64, collector *Sample
return hg, nil
}

// BuildColumn builds a histogram from samples for a column, using the
// collector's own count and FM-sketch NDV as the totals.
func BuildColumn(ctx sessionctx.Context, numBuckets, id int64, collector *SampleCollector, tp *types.FieldType) (*Histogram, error) {
	totalCount := collector.Count
	distinct := collector.FMSketch.NDV()
	return buildColumnHist(ctx, numBuckets, id, collector, tp, totalCount, distinct, collector.NullCount)
}

// BuildColumnWithSamples builds a histogram from samples for a column.
// It is used when collector.Count is not the entire row count but only the
// number of sampled rows; the FM-sketch NDV and the null count are scaled up
// to estimates for the full count of `count` rows.
func BuildColumnWithSamples(ctx sessionctx.Context, numBuckets, id int64, collector *SampleCollector, tp *types.FieldType, count int64) (*Histogram, error) {
	// No samples collected: avoid dividing by zero and let buildColumnHist
	// produce an empty histogram.
	if collector.Count == 0 {
		return buildColumnHist(ctx, numBuckets, id, collector, tp, count, 0, 0)
	}
	// Scale in floating point and round the final value once. Rounding the
	// ratio to an integer first (as the previous code did) loses precision:
	// e.g. a ratio of 1.5 rounds to 2 and overestimates ndv by a third.
	scale := float64(count) / float64(collector.Count)
	ndv := int64(math.Round(float64(collector.FMSketch.NDV()) * scale))
	nullCount := int64(math.Round(float64(collector.NullCount) * scale))
	return buildColumnHist(ctx, numBuckets, id, collector, tp, count, ndv, nullCount)
}

// AnalyzeResult is used to represent analyze result.
type AnalyzeResult struct {
// PhysicalTableID is the id of a partition or a table.
Expand Down
8 changes: 8 additions & 0 deletions statistics/sample.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,14 @@ func (c *SampleCollector) collect(sc *stmtctx.StatementContext, d types.Datum) e
return nil
}

// UpdateTotalSize recomputes TotalSize as the sum of the byte lengths of all
// collected sample values.
func (c *SampleCollector) UpdateTotalSize() {
	var total int64
	for i := range c.Samples {
		total += int64(len(c.Samples[i].Value.GetBytes()))
	}
	c.TotalSize = total
}

// SampleBuilder is used to build samples for columns.
// Also, if primary key is handle, it will directly build histogram for it.
type SampleBuilder struct {
Expand Down
3 changes: 2 additions & 1 deletion store/mockstore/mocktikv/rpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"context"
"fmt"
"io"
"strconv"
"time"

"github.com/golang/protobuf/proto"
Expand Down Expand Up @@ -826,7 +827,7 @@ func (c *RPCClient) SendRequest(ctx context.Context, addr string, req *tikvrpc.R
resp.DebugGetRegionProperties = &debugpb.GetRegionPropertiesResponse{
Props: []*debugpb.Property{{
Name: "num_rows",
Value: string(len(scanResp.Pairs)),
Value: strconv.Itoa(len(scanResp.Pairs)),
}}}
default:
return nil, errors.Errorf("unsupport this request type %v", req.Type)
Expand Down