From 2163271ae4c3d026b4b62aaf7defa108017da77c Mon Sep 17 00:00:00 2001 From: Yuanjia Zhang Date: Mon, 28 Aug 2023 11:50:06 +0800 Subject: [PATCH] planner: Move the Selectivity function from the stats package into cardinality package (#46438) ref pingcap/tidb#46358 --- planner/cardinality/BUILD.bazel | 3 +- planner/cardinality/row_count_column.go | 59 ++++++++++++++++- planner/cardinality/row_count_test.go | 56 +++++++++++++++++ statistics/BUILD.bazel | 2 +- statistics/handle/BUILD.bazel | 1 + statistics/handle/ddl_test.go | 9 +-- statistics/handle/handletest/handle_test.go | 2 +- .../handle/handletest/statstest/BUILD.bazel | 1 + .../handle/handletest/statstest/stats_test.go | 5 +- statistics/handle/updatetest/BUILD.bazel | 1 + statistics/handle/updatetest/update_test.go | 3 +- statistics/statistics_test.go | 30 --------- statistics/table.go | 63 ------------------- 13 files changed, 130 insertions(+), 105 deletions(-) create mode 100644 planner/cardinality/row_count_test.go diff --git a/planner/cardinality/BUILD.bazel b/planner/cardinality/BUILD.bazel index 9d2df0d5693d2..4e062b1728308 100644 --- a/planner/cardinality/BUILD.bazel +++ b/planner/cardinality/BUILD.bazel @@ -45,13 +45,14 @@ go_test( timeout = "short", srcs = [ "main_test.go", + "row_count_test.go", "selectivity_test.go", "trace_test.go", ], data = glob(["testdata/**"]), embed = [":cardinality"], flaky = True, - shard_count = 30, + shard_count = 31, deps = [ "//config", "//domain", diff --git a/planner/cardinality/row_count_column.go b/planner/cardinality/row_count_column.go index bb395a32d2b76..ac27df26427a0 100644 --- a/planner/cardinality/row_count_column.go +++ b/planner/cardinality/row_count_column.go @@ -30,8 +30,6 @@ func init() { statistics.GetRowCountByColumnRanges = GetRowCountByColumnRanges statistics.GetRowCountByIntColumnRanges = GetRowCountByIntColumnRanges statistics.GetRowCountByIndexRanges = GetRowCountByIndexRanges - statistics.EqualRowCountOnColumn = equalRowCountOnColumn - statistics.BetweenRowCountOnColumn = betweenRowCountOnColumn } // GetRowCountByColumnRanges estimates the row count by a slice of Range. @@ -306,3 +304,60 @@ func betweenRowCountOnColumn(sctx sessionctx.Context, c *statistics.Column, l, r } return float64(c.TopN.BetweenCount(sctx, lowEncoded, highEncoded)) + histBetweenCnt } + +// functions below are mainly for testing. + +// ColumnGreaterRowCount estimates the row count where the column greater than value. +func ColumnGreaterRowCount(sctx sessionctx.Context, t *statistics.Table, value types.Datum, colID int64) float64 { + c, ok := t.Columns[colID] + if !ok || c.IsInvalid(sctx, t.Pseudo) { + return float64(t.RealtimeCount) / pseudoLessRate + } + return c.GreaterRowCount(value) * c.GetIncreaseFactor(t.RealtimeCount) +} + +// ColumnLessRowCount estimates the row count where the column less than value. Note that null values are not counted. +func ColumnLessRowCount(sctx sessionctx.Context, t *statistics.Table, value types.Datum, colID int64) float64 { + c, ok := t.Columns[colID] + if !ok || c.IsInvalid(sctx, t.Pseudo) { + return float64(t.RealtimeCount) / pseudoLessRate + } + return c.LessRowCount(sctx, value) * c.GetIncreaseFactor(t.RealtimeCount) +} + +// ColumnBetweenRowCount estimates the row count where column greater or equal to a and less than b. +func ColumnBetweenRowCount(sctx sessionctx.Context, t *statistics.Table, a, b types.Datum, colID int64) (float64, error) { + sc := sctx.GetSessionVars().StmtCtx + c, ok := t.Columns[colID] + if !ok || c.IsInvalid(sctx, t.Pseudo) { + return float64(t.RealtimeCount) / pseudoBetweenRate, nil + } + aEncoded, err := codec.EncodeKey(sc, nil, a) + if err != nil { + return 0, err + } + bEncoded, err := codec.EncodeKey(sc, nil, b) + if err != nil { + return 0, err + } + count := betweenRowCountOnColumn(sctx, c, a, b, aEncoded, bEncoded) + if a.IsNull() { + count += float64(c.NullCount) + } + return count * c.GetIncreaseFactor(t.RealtimeCount), nil +} + +// ColumnEqualRowCount estimates the row count where the column equals to value. +func ColumnEqualRowCount(sctx sessionctx.Context, t *statistics.Table, value types.Datum, colID int64) (float64, error) { + c, ok := t.Columns[colID] + if !ok || c.IsInvalid(sctx, t.Pseudo) { + return float64(t.RealtimeCount) / pseudoEqualRate, nil + } + encodedVal, err := codec.EncodeKey(sctx.GetSessionVars().StmtCtx, nil, value) + if err != nil { + return 0, err + } + result, err := equalRowCountOnColumn(sctx, c, value, encodedVal, t.ModifyCount) + result *= c.GetIncreaseFactor(t.RealtimeCount) + return result, errors.Trace(err) +} diff --git a/planner/cardinality/row_count_test.go b/planner/cardinality/row_count_test.go new file mode 100644 index 0000000000000..61d2f1c3c42ed --- /dev/null +++ b/planner/cardinality/row_count_test.go @@ -0,0 +1,56 @@ +// Copyright 2023 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cardinality + +import ( + "testing" + + "github.com/pingcap/tidb/parser/model" + "github.com/pingcap/tidb/parser/mysql" + "github.com/pingcap/tidb/statistics" + "github.com/pingcap/tidb/types" + "github.com/pingcap/tidb/util/mock" + "github.com/stretchr/testify/require" +) + +func TestPseudoTable(t *testing.T) { + ti := &model.TableInfo{} + colInfo := &model.ColumnInfo{ + ID: 1, + FieldType: *types.NewFieldType(mysql.TypeLonglong), + State: model.StatePublic, + } + ti.Columns = append(ti.Columns, colInfo) + tbl := statistics.PseudoTable(ti) + require.Len(t, tbl.Columns, 1) + require.Greater(t, tbl.RealtimeCount, int64(0)) + sctx := mock.NewContext() + count := ColumnLessRowCount(sctx, tbl, types.NewIntDatum(100), colInfo.ID) + require.Equal(t, 3333, int(count)) + count, err := ColumnEqualRowCount(sctx, tbl, types.NewIntDatum(1000), colInfo.ID) + require.NoError(t, err) + require.Equal(t, 10, int(count)) + count, _ = ColumnBetweenRowCount(sctx, tbl, types.NewIntDatum(1000), types.NewIntDatum(5000), colInfo.ID) + require.Equal(t, 250, int(count)) + ti.Columns = append(ti.Columns, &model.ColumnInfo{ + ID: 2, + FieldType: *types.NewFieldType(mysql.TypeLonglong), + Hidden: true, + State: model.StatePublic, + }) + tbl = statistics.PseudoTable(ti) + // We added a hidden column. The pseudo table still only have one column. + require.Equal(t, len(tbl.Columns), 1) +} diff --git a/statistics/BUILD.bazel b/statistics/BUILD.bazel index 6f850fadfdad8..28c6256096449 100644 --- a/statistics/BUILD.bazel +++ b/statistics/BUILD.bazel @@ -76,7 +76,7 @@ go_test( data = glob(["testdata/**"]), embed = [":statistics"], flaky = True, - shard_count = 41, + shard_count = 40, deps = [ "//config", "//parser/ast", diff --git a/statistics/handle/BUILD.bazel b/statistics/handle/BUILD.bazel index a6c84ae37647d..d466b12824434 100644 --- a/statistics/handle/BUILD.bazel +++ b/statistics/handle/BUILD.bazel @@ -70,6 +70,7 @@ go_test( "//config", "//domain", "//parser/model", + "//planner/cardinality", "//sessionctx/stmtctx", "//sessionctx/variable", "//statistics", diff --git a/statistics/handle/ddl_test.go b/statistics/handle/ddl_test.go index 4c146fbca485f..ad812a7e95f19 100644 --- a/statistics/handle/ddl_test.go +++ b/statistics/handle/ddl_test.go @@ -18,6 +18,7 @@ import ( "testing" "github.com/pingcap/tidb/parser/model" + "github.com/pingcap/tidb/planner/cardinality" "github.com/pingcap/tidb/testkit" "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/mock" @@ -51,9 +52,9 @@ func TestDDLAfterLoad(t *testing.T) { tableInfo = tbl.Meta() sctx := mock.NewContext() - count := statsTbl.ColumnGreaterRowCount(sctx, types.NewDatum(recordCount+1), tableInfo.Columns[0].ID) + count := cardinality.ColumnGreaterRowCount(sctx, statsTbl, types.NewDatum(recordCount+1), tableInfo.Columns[0].ID) require.Equal(t, 0.0, count) - count = statsTbl.ColumnGreaterRowCount(sctx, types.NewDatum(recordCount+1), tableInfo.Columns[2].ID) + count = cardinality.ColumnGreaterRowCount(sctx, statsTbl, types.NewDatum(recordCount+1), tableInfo.Columns[2].ID) require.Equal(t, 333, int(count)) } @@ -133,10 +134,10 @@ func TestDDLHistogram(t *testing.T) { require.False(t, statsTbl.Pseudo) require.True(t, statsTbl.Columns[tableInfo.Columns[3].ID].IsStatsInitialized()) sctx := mock.NewContext() - count, err := statsTbl.ColumnEqualRowCount(sctx, types.NewIntDatum(0), tableInfo.Columns[3].ID) + count, err := cardinality.ColumnEqualRowCount(sctx, statsTbl, types.NewIntDatum(0), tableInfo.Columns[3].ID) require.NoError(t, err) require.Equal(t, float64(2), count) - count, err = statsTbl.ColumnEqualRowCount(sctx, types.NewIntDatum(1), tableInfo.Columns[3].ID) + count, err = cardinality.ColumnEqualRowCount(sctx, statsTbl, types.NewIntDatum(1), tableInfo.Columns[3].ID) require.NoError(t, err) require.Equal(t, float64(0), count) diff --git a/statistics/handle/handletest/handle_test.go b/statistics/handle/handletest/handle_test.go index ceca9115c7ffa..c1c437ff4e2be 100644 --- a/statistics/handle/handletest/handle_test.go +++ b/statistics/handle/handletest/handle_test.go @@ -55,7 +55,7 @@ func TestEmptyTable(t *testing.T) { require.NoError(t, err) tableInfo := tbl.Meta() statsTbl := do.StatsHandle().GetTableStats(tableInfo) - count := statsTbl.ColumnGreaterRowCount(mock.NewContext(), types.NewDatum(1), tableInfo.Columns[0].ID) + count := cardinality.ColumnGreaterRowCount(mock.NewContext(), statsTbl, types.NewDatum(1), tableInfo.Columns[0].ID) require.Equal(t, 0.0, count) } diff --git a/statistics/handle/handletest/statstest/BUILD.bazel b/statistics/handle/handletest/statstest/BUILD.bazel index 512230d962e0f..14169da4f8be1 100644 --- a/statistics/handle/handletest/statstest/BUILD.bazel +++ b/statistics/handle/handletest/statstest/BUILD.bazel @@ -13,6 +13,7 @@ go_test( deps = [ "//config", "//parser/model", + "//planner/cardinality", "//statistics/handle/internal", "//testkit", "//testkit/testsetup", diff --git a/statistics/handle/handletest/statstest/stats_test.go b/statistics/handle/handletest/statstest/stats_test.go index b2d3bf9792985..9acb8d770ad61 100644 --- a/statistics/handle/handletest/statstest/stats_test.go +++ b/statistics/handle/handletest/statstest/stats_test.go @@ -22,6 +22,7 @@ import ( "github.com/pingcap/failpoint" "github.com/pingcap/tidb/config" "github.com/pingcap/tidb/parser/model" + "github.com/pingcap/tidb/planner/cardinality" "github.com/pingcap/tidb/statistics/handle/internal" "github.com/pingcap/tidb/testkit" "github.com/pingcap/tidb/types" @@ -335,9 +336,9 @@ func TestLoadStats(t *testing.T) { require.Nil(t, cms) // Column stats are loaded after they are needed. - _, err = stat.ColumnEqualRowCount(testKit.Session(), types.NewIntDatum(1), colAID) + _, err = cardinality.ColumnEqualRowCount(testKit.Session(), stat, types.NewIntDatum(1), colAID) require.NoError(t, err) - _, err = stat.ColumnEqualRowCount(testKit.Session(), types.NewIntDatum(1), colCID) + _, err = cardinality.ColumnEqualRowCount(testKit.Session(), stat, types.NewIntDatum(1), colCID) require.NoError(t, err) require.NoError(t, h.LoadNeededHistograms()) stat = h.GetTableStats(tableInfo) diff --git a/statistics/handle/updatetest/BUILD.bazel b/statistics/handle/updatetest/BUILD.bazel index 6419a5e719343..ec974a7203738 100644 --- a/statistics/handle/updatetest/BUILD.bazel +++ b/statistics/handle/updatetest/BUILD.bazel @@ -12,6 +12,7 @@ go_test( deps = [ "//parser/model", "//parser/mysql", + "//planner/cardinality", "//sessionctx/variable", "//statistics", "//statistics/handle", diff --git a/statistics/handle/updatetest/update_test.go b/statistics/handle/updatetest/update_test.go index ff2e85259256b..f487f89aaef59 100644 --- a/statistics/handle/updatetest/update_test.go +++ b/statistics/handle/updatetest/update_test.go @@ -24,6 +24,7 @@ import ( "github.com/pingcap/tidb/parser/model" "github.com/pingcap/tidb/parser/mysql" + "github.com/pingcap/tidb/planner/cardinality" "github.com/pingcap/tidb/sessionctx/variable" "github.com/pingcap/tidb/statistics" "github.com/pingcap/tidb/statistics/handle" @@ -87,7 +88,7 @@ func TestSingleSessionInsert(t *testing.T) { require.Equal(t, int64(rowCount1*2), stats1.RealtimeCount) // Test IncreaseFactor. - count, err := stats1.ColumnEqualRowCount(testKit.Session(), types.NewIntDatum(1), tableInfo1.Columns[0].ID) + count, err := cardinality.ColumnEqualRowCount(testKit.Session(), stats1, types.NewIntDatum(1), tableInfo1.Columns[0].ID) require.NoError(t, err) require.Equal(t, float64(rowCount1*2), count) diff --git a/statistics/statistics_test.go b/statistics/statistics_test.go index 4072f0522ee7f..5df0bc08dee7e 100644 --- a/statistics/statistics_test.go +++ b/statistics/statistics_test.go @@ -190,36 +190,6 @@ func TestMergeHistogram(t *testing.T) { } } -func TestPseudoTable(t *testing.T) { - ti := &model.TableInfo{} - colInfo := &model.ColumnInfo{ - ID: 1, - FieldType: *types.NewFieldType(mysql.TypeLonglong), - State: model.StatePublic, - } - ti.Columns = append(ti.Columns, colInfo) - tbl := PseudoTable(ti) - require.Len(t, tbl.Columns, 1) - require.Greater(t, tbl.RealtimeCount, int64(0)) - sctx := mock.NewContext() - count := tbl.ColumnLessRowCount(sctx, types.NewIntDatum(100), colInfo.ID) - require.Equal(t, 3333, int(count)) - count, err := tbl.ColumnEqualRowCount(sctx, types.NewIntDatum(1000), colInfo.ID) - require.NoError(t, err) - require.Equal(t, 10, int(count)) - count, _ = tbl.ColumnBetweenRowCount(sctx, types.NewIntDatum(1000), types.NewIntDatum(5000), colInfo.ID) - require.Equal(t, 250, int(count)) - ti.Columns = append(ti.Columns, &model.ColumnInfo{ - ID: 2, - FieldType: *types.NewFieldType(mysql.TypeLonglong), - Hidden: true, - State: model.StatePublic, - }) - tbl = PseudoTable(ti) - // We added a hidden column. The pseudo table still only have one column. - require.Equal(t, len(tbl.Columns), 1) -} - func buildCMSketch(values []types.Datum) *CMSketch { cms := NewCMSketch(8, 2048) for _, val := range values { diff --git a/statistics/table.go b/statistics/table.go index 5457b95abc094..a6ca3b7ec9b26 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -21,7 +21,6 @@ import ( "strings" "sync" - "github.com/pingcap/errors" "github.com/pingcap/tidb/expression" "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/parser/model" @@ -30,7 +29,6 @@ import ( "github.com/pingcap/tidb/tablecodec" "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/chunk" - "github.com/pingcap/tidb/util/codec" "github.com/pingcap/tidb/util/ranger" "go.uber.org/atomic" ) @@ -65,12 +63,6 @@ var ( // GetRowCountByColumnRanges is a function type to get row count by column ranges. GetRowCountByColumnRanges func(sctx sessionctx.Context, coll *HistColl, colID int64, colRanges []*ranger.Range) (result float64, err error) - - // EqualRowCountOnColumn is a function type to get the row count by equal condition on column. - EqualRowCountOnColumn func(sctx sessionctx.Context, c *Column, val types.Datum, encodedVal []byte, realtimeRowCount int64) (result float64, err error) - - // BetweenRowCountOnColumn is a function type to get the row count by between condition on column. - BetweenRowCountOnColumn func(sctx sessionctx.Context, c *Column, l, r types.Datum, lowEncoded, highEncoded []byte) float64 ) // Table represents statistics for a table. @@ -488,61 +480,6 @@ func (t *Table) IsOutdated() bool { return false } -// ColumnGreaterRowCount estimates the row count where the column greater than value. -func (t *Table) ColumnGreaterRowCount(sctx sessionctx.Context, value types.Datum, colID int64) float64 { - c, ok := t.Columns[colID] - if !ok || c.IsInvalid(sctx, t.Pseudo) { - return float64(t.RealtimeCount) / pseudoLessRate - } - return c.GreaterRowCount(value) * c.GetIncreaseFactor(t.RealtimeCount) -} - -// ColumnLessRowCount estimates the row count where the column less than value. Note that null values are not counted. -func (t *Table) ColumnLessRowCount(sctx sessionctx.Context, value types.Datum, colID int64) float64 { - c, ok := t.Columns[colID] - if !ok || c.IsInvalid(sctx, t.Pseudo) { - return float64(t.RealtimeCount) / pseudoLessRate - } - return c.LessRowCount(sctx, value) * c.GetIncreaseFactor(t.RealtimeCount) -} - -// ColumnBetweenRowCount estimates the row count where column greater or equal to a and less than b. -func (t *Table) ColumnBetweenRowCount(sctx sessionctx.Context, a, b types.Datum, colID int64) (float64, error) { - sc := sctx.GetSessionVars().StmtCtx - c, ok := t.Columns[colID] - if !ok || c.IsInvalid(sctx, t.Pseudo) { - return float64(t.RealtimeCount) / pseudoBetweenRate, nil - } - aEncoded, err := codec.EncodeKey(sc, nil, a) - if err != nil { - return 0, err - } - bEncoded, err := codec.EncodeKey(sc, nil, b) - if err != nil { - return 0, err - } - count := BetweenRowCountOnColumn(sctx, c, a, b, aEncoded, bEncoded) - if a.IsNull() { - count += float64(c.NullCount) - } - return count * c.GetIncreaseFactor(t.RealtimeCount), nil -} - -// ColumnEqualRowCount estimates the row count where the column equals to value. -func (t *Table) ColumnEqualRowCount(sctx sessionctx.Context, value types.Datum, colID int64) (float64, error) { - c, ok := t.Columns[colID] - if !ok || c.IsInvalid(sctx, t.Pseudo) { - return float64(t.RealtimeCount) / pseudoEqualRate, nil - } - encodedVal, err := codec.EncodeKey(sctx.GetSessionVars().StmtCtx, nil, value) - if err != nil { - return 0, err - } - result, err := EqualRowCountOnColumn(sctx, c, value, encodedVal, t.ModifyCount) - result *= c.GetIncreaseFactor(t.RealtimeCount) - return result, errors.Trace(err) -} - // PseudoAvgCountPerValue gets a pseudo average count if histogram not exists. func (t *Table) PseudoAvgCountPerValue() float64 { return float64(t.RealtimeCount) / pseudoEqualRate