From ff507954edbc7919b4c81e1a31278ae66749a19e Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 7 Mar 2023 22:33:10 +0800 Subject: [PATCH 01/10] add --- planner/core/exhaust_physical_plans.go | 59 ++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/planner/core/exhaust_physical_plans.go b/planner/core/exhaust_physical_plans.go index 8ec5d95a16aac..2490d5b189673 100644 --- a/planner/core/exhaust_physical_plans.go +++ b/planner/core/exhaust_physical_plans.go @@ -17,6 +17,7 @@ package core import ( "bytes" "fmt" + "github.com/pingcap/tidb/util/mathutil" "math" "strings" "unsafe" @@ -947,7 +948,7 @@ func (p *LogicalJoin) buildIndexJoinInner2IndexScan( maxOneRow = ok && (sf.FuncName.L == ast.EQ) } } - innerTask := p.constructInnerIndexScanTask(wrapper, helper.chosenPath, helper.chosenRanges.Range(), helper.chosenRemained, outerJoinKeys, rangeInfo, false, false, avgInnerRowCnt, maxOneRow) + innerTask := p.constructInnerIndexScanTask(wrapper, helper.chosenPath, helper.chosenRanges.Range(), helper.chosenRemained, innerJoinKeys, rangeInfo, false, false, avgInnerRowCnt, maxOneRow) failpoint.Inject("MockOnlyEnableIndexHashJoin", func(val failpoint.Value) { if val.(bool) && !p.ctx.GetSessionVars().InRestrictedSQL { failpoint.Return(p.constructIndexHashJoin(prop, outerIdx, innerTask, helper.chosenRanges, keyOff2IdxOff, helper.chosenPath, helper.lastColManager)) @@ -962,7 +963,7 @@ func (p *LogicalJoin) buildIndexJoinInner2IndexScan( // Because we can't keep order for union scan, if there is a union scan in inner task, // we can't construct index merge join. 
if us == nil { - innerTask2 := p.constructInnerIndexScanTask(wrapper, helper.chosenPath, helper.chosenRanges.Range(), helper.chosenRemained, outerJoinKeys, rangeInfo, true, !prop.IsSortItemEmpty() && prop.SortItems[0].Desc, avgInnerRowCnt, maxOneRow) + innerTask2 := p.constructInnerIndexScanTask(wrapper, helper.chosenPath, helper.chosenRanges.Range(), helper.chosenRemained, innerJoinKeys, rangeInfo, true, !prop.IsSortItemEmpty() && prop.SortItems[0].Desc, avgInnerRowCnt, maxOneRow) if innerTask2 != nil { joins = append(joins, p.constructIndexMergeJoin(prop, outerIdx, innerTask2, helper.chosenRanges, keyOff2IdxOff, helper.chosenPath, helper.lastColManager)...) } @@ -1146,13 +1147,49 @@ func (p *LogicalJoin) constructInnerUnionScan(us *LogicalUnionScan, reader Physi return physicalUnionScan } +func getColsNDVFromHistColl(cols []*expression.Column, histColl *statistics.HistColl) int64 { + if len(cols) == 0 || histColl == nil { + return -1 + } + colUIDs := make([]int64, len(cols)) + for i, col := range cols { + colUIDs[i] = col.UniqueID + } + + if len(colUIDs) == 1 && histColl.Columns != nil { + uid := colUIDs[0] + if colStats, ok := histColl.Columns[uid]; ok && colStats != nil { + return colStats.NDV + } + } + slices.Sort(colUIDs) + if histColl.Indices == nil || histColl.Idx2ColumnIDs == nil { + return -1 + } + for idxID, idxCols := range histColl.Idx2ColumnIDs { + if len(idxCols) != len(colUIDs) { + continue + } + orderedIdxCols := make([]int64, len(idxCols)) + copy(orderedIdxCols, idxCols) + slices.Sort(orderedIdxCols) + if !slices.Equal(idxCols, colUIDs) { + continue + } + if idxStats, ok := histColl.Indices[idxID]; ok && idxStats != nil { + return idxStats.NDV + } + } + return -1 +} + // constructInnerIndexScanTask is specially used to construct the inner plan for PhysicalIndexJoin. 
func (p *LogicalJoin) constructInnerIndexScanTask( wrapper *indexJoinInnerChildWrapper, path *util.AccessPath, ranges ranger.Ranges, filterConds []expression.Expression, - _ []*expression.Column, + innerJoinKeys []*expression.Column, rangeInfo string, keepOrder bool, desc bool, @@ -1239,6 +1276,16 @@ func (p *LogicalJoin) constructInnerIndexScanTask( } is.initSchema(append(path.FullIdxCols, ds.commonHandleCols...), cop.tablePlan != nil) indexConds, tblConds := ds.splitIndexFilterConditions(filterConds, path.FullIdxCols, path.FullIdxColLens) + rowCountUpperBound := -1.0 + if ds.tableStats != nil { + joinKeyNDV := getColsNDVFromHistColl(innerJoinKeys, ds.tableStats.HistColl) + if joinKeyNDV > 0 { + rowCountUpperBound = ds.tableStats.RowCount / float64(joinKeyNDV) + } + } + if rowCountUpperBound > 0 { + rowCount = mathutil.Min(rowCount, rowCountUpperBound) + } if maxOneRow { // Theoretically, this line is unnecessary because row count estimation of join should guarantee rowCount is not larger // than 1.0; however, there may be rowCount larger than 1.0 in reality, e.g, pseudo statistics cases, which does not reflect @@ -1261,6 +1308,9 @@ func (p *LogicalJoin) constructInnerIndexScanTask( // rowCount is computed from result row count of join, which has already accounted the filters on DataSource, // i.e, rowCount equals to `countAfterIndex * selectivity`. 
cnt := rowCount / selectivity + if rowCountUpperBound > 0 { + cnt = mathutil.Min(cnt, rowCountUpperBound) + } if maxOneRow { cnt = math.Min(cnt, 1.0) } @@ -1274,6 +1324,9 @@ func (p *LogicalJoin) constructInnerIndexScanTask( selectivity = SelectionFactor } cnt := tmpPath.CountAfterIndex / selectivity + if rowCountUpperBound > 0 { + cnt = mathutil.Min(cnt, rowCountUpperBound) + } if maxOneRow { cnt = math.Min(cnt, 1.0) } From 42cfcfb364dd39898c51083cfbfa171bd32ec380 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 7 Mar 2023 22:36:20 +0800 Subject: [PATCH 02/10] fmt --- planner/core/exhaust_physical_plans.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/planner/core/exhaust_physical_plans.go b/planner/core/exhaust_physical_plans.go index 2490d5b189673..e642f78dd226c 100644 --- a/planner/core/exhaust_physical_plans.go +++ b/planner/core/exhaust_physical_plans.go @@ -17,7 +17,6 @@ package core import ( "bytes" "fmt" - "github.com/pingcap/tidb/util/mathutil" "math" "strings" "unsafe" @@ -38,6 +37,7 @@ import ( "github.com/pingcap/tidb/util/chunk" "github.com/pingcap/tidb/util/collate" "github.com/pingcap/tidb/util/logutil" + "github.com/pingcap/tidb/util/mathutil" "github.com/pingcap/tidb/util/plancodec" "github.com/pingcap/tidb/util/ranger" "github.com/pingcap/tidb/util/set" From 88f933dd66ea87f89901affe518e0cc597c90bf8 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Wed, 8 Mar 2023 16:13:17 +0800 Subject: [PATCH 03/10] update test result --- cmd/explaintest/r/tpch.result | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cmd/explaintest/r/tpch.result b/cmd/explaintest/r/tpch.result index a2fc266a653d7..4f2f894289b84 100644 --- a/cmd/explaintest/r/tpch.result +++ b/cmd/explaintest/r/tpch.result @@ -257,9 +257,9 @@ Projection 10.00 root tpch.lineitem.l_orderkey, Column#35, tpch.orders.o_orderd │ └─Selection 
36870000.00 cop[tikv] lt(tpch.orders.o_orderdate, 1995-03-13 00:00:00.000000) │ └─TableFullScan 75000000.00 cop[tikv] table:orders keep order:false └─IndexLookUp(Probe) 91515927.49 root - ├─IndexRangeScan(Build) 168388203.74 cop[tikv] table:lineitem, index:PRIMARY(L_ORDERKEY, L_LINENUMBER) range: decided by [eq(tpch.lineitem.l_orderkey, tpch.orders.o_orderkey)], keep order:false + ├─IndexRangeScan(Build) 91515927.49 cop[tikv] table:lineitem, index:PRIMARY(L_ORDERKEY, L_LINENUMBER) range: decided by [eq(tpch.lineitem.l_orderkey, tpch.orders.o_orderkey)], keep order:false └─Selection(Probe) 91515927.49 cop[tikv] gt(tpch.lineitem.l_shipdate, 1995-03-13 00:00:00.000000) - └─TableRowIDScan 168388203.74 cop[tikv] table:lineitem keep order:false + └─TableRowIDScan 91515927.49 cop[tikv] table:lineitem keep order:false /* Q4 Order Priority Checking Query This query determines how well the order priority system is working and gives an assessment of customer satisfaction. @@ -298,9 +298,9 @@ Sort 1.00 root tpch.orders.o_orderpriority │ └─Selection 2925937.50 cop[tikv] ge(tpch.orders.o_orderdate, 1995-01-01 00:00:00.000000), lt(tpch.orders.o_orderdate, 1995-04-01 00:00:00.000000) │ └─TableFullScan 75000000.00 cop[tikv] table:orders keep order:false └─IndexLookUp(Probe) 11851908.75 root - ├─IndexRangeScan(Build) 14814885.94 cop[tikv] table:lineitem, index:PRIMARY(L_ORDERKEY, L_LINENUMBER) range: decided by [eq(tpch.lineitem.l_orderkey, tpch.orders.o_orderkey)], keep order:false + ├─IndexRangeScan(Build) 11851908.75 cop[tikv] table:lineitem, index:PRIMARY(L_ORDERKEY, L_LINENUMBER) range: decided by [eq(tpch.lineitem.l_orderkey, tpch.orders.o_orderkey)], keep order:false └─Selection(Probe) 11851908.75 cop[tikv] lt(tpch.lineitem.l_commitdate, tpch.lineitem.l_receiptdate) - └─TableRowIDScan 14814885.94 cop[tikv] table:lineitem keep order:false + └─TableRowIDScan 11851908.75 cop[tikv] table:lineitem keep order:false /* Q5 Local Supplier Volume Query This query lists the revenue 
volume done through local suppliers. @@ -672,9 +672,9 @@ Projection 20.00 root tpch.customer.c_custkey, tpch.customer.c_name, Column#39, │ └─TableReader(Probe) 7500000.00 root data:TableFullScan │ └─TableFullScan 7500000.00 cop[tikv] table:customer keep order:false └─IndexLookUp(Probe) 12222016.17 root - ├─IndexRangeScan(Build) 49605980.10 cop[tikv] table:lineitem, index:PRIMARY(L_ORDERKEY, L_LINENUMBER) range: decided by [eq(tpch.lineitem.l_orderkey, tpch.orders.o_orderkey)], keep order:false + ├─IndexRangeScan(Build) 12222016.17 cop[tikv] table:lineitem, index:PRIMARY(L_ORDERKEY, L_LINENUMBER) range: decided by [eq(tpch.lineitem.l_orderkey, tpch.orders.o_orderkey)], keep order:false └─Selection(Probe) 12222016.17 cop[tikv] eq(tpch.lineitem.l_returnflag, "R") - └─TableRowIDScan 49605980.10 cop[tikv] table:lineitem keep order:false + └─TableRowIDScan 12222016.17 cop[tikv] table:lineitem keep order:false /* Q11 Important Stock Identification Query This query finds the most important subset of suppliers' stock in a given nation. 
@@ -1239,9 +1239,9 @@ Projection 100.00 root tpch.supplier.s_name, Column#72 │ ├─IndexRangeScan(Build) 49550432.16 cop[tikv] table:l2, index:PRIMARY(L_ORDERKEY, L_LINENUMBER) range: decided by [eq(tpch.lineitem.l_orderkey, tpch.lineitem.l_orderkey)], keep order:false │ └─TableRowIDScan(Probe) 49550432.16 cop[tikv] table:l2 keep order:false └─IndexLookUp(Probe) 39640345.73 root - ├─IndexRangeScan(Build) 49550432.16 cop[tikv] table:l3, index:PRIMARY(L_ORDERKEY, L_LINENUMBER) range: decided by [eq(tpch.lineitem.l_orderkey, tpch.lineitem.l_orderkey)], keep order:false + ├─IndexRangeScan(Build) 39640345.73 cop[tikv] table:l3, index:PRIMARY(L_ORDERKEY, L_LINENUMBER) range: decided by [eq(tpch.lineitem.l_orderkey, tpch.lineitem.l_orderkey)], keep order:false └─Selection(Probe) 39640345.73 cop[tikv] gt(tpch.lineitem.l_receiptdate, tpch.lineitem.l_commitdate) - └─TableRowIDScan 49550432.16 cop[tikv] table:l3 keep order:false + └─TableRowIDScan 39640345.73 cop[tikv] table:l3 keep order:false /* Q22 Global Sales Opportunity Query The Global Sales Opportunity Query identifies geographies where there are customers who may be likely to make a From 76a14c9fb155f1a06d03a6643250e2e0c635d2ba Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Wed, 8 Mar 2023 19:44:48 +0800 Subject: [PATCH 04/10] add test --- statistics/integration_test.go | 48 ++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/statistics/integration_test.go b/statistics/integration_test.go index 1c8672b790ffd..64da08e7c97fb 100644 --- a/statistics/integration_test.go +++ b/statistics/integration_test.go @@ -24,10 +24,12 @@ import ( "github.com/pingcap/failpoint" "github.com/pingcap/tidb/parser/model" + "github.com/pingcap/tidb/parser/mysql" "github.com/pingcap/tidb/statistics" "github.com/pingcap/tidb/statistics/handle" "github.com/pingcap/tidb/testkit" "github.com/pingcap/tidb/testkit/testdata" + "github.com/pingcap/tidb/types" 
"github.com/stretchr/testify/require" ) @@ -693,3 +695,49 @@ func TestSingleColumnIndexNDV(t *testing.T) { require.Equal(t, expectedResults[i][2], row[7]) // null_count } } + +func TestIndexJoinInnerRowCountUpperBound(t *testing.T) { + store, dom := testkit.CreateMockStoreAndDomain(t) + testKit := testkit.NewTestKit(t, store) + h := dom.StatsHandle() + + testKit.MustExec("use test") + testKit.MustExec("drop table if exists t") + testKit.MustExec("create table t(a int, b int, index idx(b))") + require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh())) + is := dom.InfoSchema() + tb, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("t")) + require.NoError(t, err) + tblInfo := tb.Meta() + + // mock the statistics.Table + mockStatsTbl := mockStatsTable(tblInfo, 5000000) + colValues, err := generateIntDatum(1, 500) + require.NoError(t, err) + for i := 1; i <= 2; i++ { + mockStatsTbl.Columns[int64(i)] = &statistics.Column{ + Histogram: *mockStatsHistogram(int64(i), colValues, 10000, types.NewFieldType(mysql.TypeLonglong)), + Info: tblInfo.Columns[i-1], + StatsLoadedStatus: statistics.NewStatsFullLoadStatus(), + StatsVer: 2, + } + } + generateMapsForMockStatsTbl(mockStatsTbl) + stat := h.GetTableStats(tblInfo) + stat.HistColl = mockStatsTbl.HistColl + + testKit.MustQuery("explain format = 'brief' " + + "select /*+ inl_join(t2) */ * from (select * from t where t.a < 2) as t1 join t t2 where t2.a = 100 and t1.a = t2.b"). 
+ Check(testkit.Rows( + "Projection 12500.00 root test.t.a, test.t.b, test.t.a, test.t.b", + "└─IndexJoin 12500.00 root inner join, inner:IndexLookUp, outer key:test.t.a, inner key:test.t.b, equal cond:eq(test.t.a, test.t.b)", + " ├─TableReader(Build) 20000.00 root data:Selection", + " │ └─Selection 20000.00 cop[tikv] lt(test.t.a, 2), not(isnull(test.t.a))", + " │ └─TableFullScan 5000000.00 cop[tikv] table:t keep order:false, stats:pseudo", + " └─IndexLookUp(Probe) 12500.00 root ", + " ├─Selection(Build) 6250000.00 cop[tikv] not(isnull(test.t.b))", + " │ └─IndexRangeScan 6250000.00 cop[tikv] table:t2, index:idx(b) range: decided by [eq(test.t.b, test.t.a)], keep order:false, stats:pseudo", + " └─Selection(Probe) 12500.00 cop[tikv] eq(test.t.a, 100)", + " └─TableRowIDScan 6250000.00 cop[tikv] table:t2 keep order:false, stats:pseudo", + )) +} From 54799f2081f3e9df76c3ff05aabfea228bf43865 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Wed, 8 Mar 2023 20:34:18 +0800 Subject: [PATCH 05/10] update test --- statistics/integration_test.go | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/statistics/integration_test.go b/statistics/integration_test.go index 64da08e7c97fb..b238a83650b3c 100644 --- a/statistics/integration_test.go +++ b/statistics/integration_test.go @@ -711,12 +711,13 @@ func TestIndexJoinInnerRowCountUpperBound(t *testing.T) { tblInfo := tb.Meta() // mock the statistics.Table - mockStatsTbl := mockStatsTable(tblInfo, 5000000) + mockStatsTbl := mockStatsTable(tblInfo, 500000) colValues, err := generateIntDatum(1, 500) require.NoError(t, err) for i := 1; i <= 2; i++ { mockStatsTbl.Columns[int64(i)] = &statistics.Column{ - Histogram: *mockStatsHistogram(int64(i), colValues, 10000, types.NewFieldType(mysql.TypeLonglong)), + Count: 500000, + Histogram: *mockStatsHistogram(int64(i), colValues, 1000, types.NewFieldType(mysql.TypeLonglong)), Info: 
tblInfo.Columns[i-1], StatsLoadedStatus: statistics.NewStatsFullLoadStatus(), StatsVer: 2, @@ -727,17 +728,16 @@ func TestIndexJoinInnerRowCountUpperBound(t *testing.T) { stat.HistColl = mockStatsTbl.HistColl testKit.MustQuery("explain format = 'brief' " + - "select /*+ inl_join(t2) */ * from (select * from t where t.a < 2) as t1 join t t2 where t2.a = 100 and t1.a = t2.b"). + "select /*+ inl_join(t2) */ * from (select * from t where t.a < 1) as t1 join t t2 where t2.a = 0 and t1.a = t2.b"). Check(testkit.Rows( - "Projection 12500.00 root test.t.a, test.t.b, test.t.a, test.t.b", - "└─IndexJoin 12500.00 root inner join, inner:IndexLookUp, outer key:test.t.a, inner key:test.t.b, equal cond:eq(test.t.a, test.t.b)", - " ├─TableReader(Build) 20000.00 root data:Selection", - " │ └─Selection 20000.00 cop[tikv] lt(test.t.a, 2), not(isnull(test.t.a))", - " │ └─TableFullScan 5000000.00 cop[tikv] table:t keep order:false, stats:pseudo", - " └─IndexLookUp(Probe) 12500.00 root ", - " ├─Selection(Build) 6250000.00 cop[tikv] not(isnull(test.t.b))", - " │ └─IndexRangeScan 6250000.00 cop[tikv] table:t2, index:idx(b) range: decided by [eq(test.t.b, test.t.a)], keep order:false, stats:pseudo", - " └─Selection(Probe) 12500.00 cop[tikv] eq(test.t.a, 100)", - " └─TableRowIDScan 6250000.00 cop[tikv] table:t2 keep order:false, stats:pseudo", + "IndexJoin 1000000.00 root inner join, inner:IndexLookUp, outer key:test.t.a, inner key:test.t.b, equal cond:eq(test.t.a, test.t.b)", + "├─TableReader(Build) 1000.00 root data:Selection", + "│ └─Selection 1000.00 cop[tikv] lt(test.t.a, 1), not(isnull(test.t.a))", + "│ └─TableFullScan 500000.00 cop[tikv] table:t keep order:false, stats:pseudo", + "└─IndexLookUp(Probe) 1000000.00 root ", + " ├─Selection(Build) 1000000.00 cop[tikv] not(isnull(test.t.b))", + " │ └─IndexRangeScan 1000000.00 cop[tikv] table:t2, index:idx(b) range: decided by [eq(test.t.b, test.t.a)], keep order:false, stats:pseudo", + " └─Selection(Probe) 1000000.00 cop[tikv] eq(test.t.a, 
0)", + " └─TableRowIDScan 1000000.00 cop[tikv] table:t2 keep order:false, stats:pseudo", )) } From 4b0cd062f9503b777fe1a0fed63bd8e61f5e7e5f Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Wed, 8 Mar 2023 20:42:16 +0800 Subject: [PATCH 06/10] reduce changes --- planner/core/exhaust_physical_plans.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/planner/core/exhaust_physical_plans.go b/planner/core/exhaust_physical_plans.go index e642f78dd226c..f21deb747bce0 100644 --- a/planner/core/exhaust_physical_plans.go +++ b/planner/core/exhaust_physical_plans.go @@ -37,7 +37,6 @@ import ( "github.com/pingcap/tidb/util/chunk" "github.com/pingcap/tidb/util/collate" "github.com/pingcap/tidb/util/logutil" - "github.com/pingcap/tidb/util/mathutil" "github.com/pingcap/tidb/util/plancodec" "github.com/pingcap/tidb/util/ranger" "github.com/pingcap/tidb/util/set" @@ -1284,7 +1283,7 @@ func (p *LogicalJoin) constructInnerIndexScanTask( } } if rowCountUpperBound > 0 { - rowCount = mathutil.Min(rowCount, rowCountUpperBound) + rowCount = math.Min(rowCount, rowCountUpperBound) } if maxOneRow { // Theoretically, this line is unnecessary because row count estimation of join should guarantee rowCount is not larger @@ -1309,7 +1308,7 @@ func (p *LogicalJoin) constructInnerIndexScanTask( // i.e, rowCount equals to `countAfterIndex * selectivity`. 
cnt := rowCount / selectivity if rowCountUpperBound > 0 { - cnt = mathutil.Min(cnt, rowCountUpperBound) + cnt = math.Min(cnt, rowCountUpperBound) } if maxOneRow { cnt = math.Min(cnt, 1.0) @@ -1325,7 +1324,7 @@ func (p *LogicalJoin) constructInnerIndexScanTask( } cnt := tmpPath.CountAfterIndex / selectivity if rowCountUpperBound > 0 { - cnt = mathutil.Min(cnt, rowCountUpperBound) + cnt = math.Min(cnt, rowCountUpperBound) } if maxOneRow { cnt = math.Min(cnt, 1.0) From 49954035d6ca4e9940ae467794b9c26480cc7044 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Wed, 8 Mar 2023 21:32:56 +0800 Subject: [PATCH 07/10] add comments --- planner/core/exhaust_physical_plans.go | 9 +++++++++ statistics/integration_test.go | 4 +++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/planner/core/exhaust_physical_plans.go b/planner/core/exhaust_physical_plans.go index f21deb747bce0..e674111f2a5a7 100644 --- a/planner/core/exhaust_physical_plans.go +++ b/planner/core/exhaust_physical_plans.go @@ -1155,16 +1155,20 @@ func getColsNDVFromHistColl(cols []*expression.Column, histColl *statistics.Hist colUIDs[i] = col.UniqueID } + // Try to get NDV from column stats if it's a single column. if len(colUIDs) == 1 && histColl.Columns != nil { uid := colUIDs[0] if colStats, ok := histColl.Columns[uid]; ok && colStats != nil { return colStats.NDV } } + slices.Sort(colUIDs) if histColl.Indices == nil || histColl.Idx2ColumnIDs == nil { return -1 } + + // Try to get NDV from index stats. 
for idxID, idxCols := range histColl.Idx2ColumnIDs { if len(idxCols) != len(colUIDs) { continue @@ -1275,6 +1279,10 @@ func (p *LogicalJoin) constructInnerIndexScanTask( } is.initSchema(append(path.FullIdxCols, ds.commonHandleCols...), cop.tablePlan != nil) indexConds, tblConds := ds.splitIndexFilterConditions(filterConds, path.FullIdxCols, path.FullIdxColLens) + + // Because we are estimating an average row count of the inner side corresponding to each row from the outer side, + // the estimated row count of the IndexScan should be no larger than (total row count / NDV of join key columns). + // We use it as an upper bound here. rowCountUpperBound := -1.0 if ds.tableStats != nil { joinKeyNDV := getColsNDVFromHistColl(innerJoinKeys, ds.tableStats.HistColl) @@ -1282,6 +1290,7 @@ func (p *LogicalJoin) constructInnerIndexScanTask( rowCountUpperBound = ds.tableStats.RowCount / float64(joinKeyNDV) } } + if rowCountUpperBound > 0 { rowCount = math.Min(rowCount, rowCountUpperBound) } diff --git a/statistics/integration_test.go b/statistics/integration_test.go index b238a83650b3c..80f839c45b06f 100644 --- a/statistics/integration_test.go +++ b/statistics/integration_test.go @@ -710,7 +710,9 @@ func TestIndexJoinInnerRowCountUpperBound(t *testing.T) { require.NoError(t, err) tblInfo := tb.Meta() - // mock the statistics.Table + // Mock the stats: + // The two columns are the same. + // From 0 to 499, each value has 1000 rows. Therefore, NDV is 500 and total row count is 500000. 
mockStatsTbl := mockStatsTable(tblInfo, 500000) colValues, err := generateIntDatum(1, 500) require.NoError(t, err) From 6a2ef90a621f6d2aaa6e3d01f76e67bde961ec94 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Thu, 9 Mar 2023 15:17:07 +0800 Subject: [PATCH 08/10] add --- planner/core/exhaust_physical_plans.go | 27 +++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/planner/core/exhaust_physical_plans.go b/planner/core/exhaust_physical_plans.go index e674111f2a5a7..e148d605a1679 100644 --- a/planner/core/exhaust_physical_plans.go +++ b/planner/core/exhaust_physical_plans.go @@ -1146,7 +1146,7 @@ func (p *LogicalJoin) constructInnerUnionScan(us *LogicalUnionScan, reader Physi return physicalUnionScan } -func getColsNDVFromHistColl(cols []*expression.Column, histColl *statistics.HistColl) int64 { +func getColsNDVLowerBoundFromHistColl(cols []*expression.Column, histColl *statistics.HistColl) int64 { if len(cols) == 0 || histColl == nil { return -1 } @@ -1155,7 +1155,7 @@ func getColsNDVFromHistColl(cols []*expression.Column, histColl *statistics.Hist colUIDs[i] = col.UniqueID } - // Try to get NDV from column stats if it's a single column. + // 1. Try to get NDV from column stats if it's a single column. if len(colUIDs) == 1 && histColl.Columns != nil { uid := colUIDs[0] if colStats, ok := histColl.Columns[uid]; ok && colStats != nil { @@ -1168,7 +1168,7 @@ func getColsNDVFromHistColl(cols []*expression.Column, histColl *statistics.Hist return -1 } - // Try to get NDV from index stats. + // 2. Try to get NDV from index stats. for idxID, idxCols := range histColl.Idx2ColumnIDs { if len(idxCols) != len(colUIDs) { continue @@ -1183,7 +1183,24 @@ func getColsNDVFromHistColl(cols []*expression.Column, histColl *statistics.Hist return idxStats.NDV } } - return -1 + + // 3. If we still haven't got an NDV, we use the minimal NDV in the column stats as a lower bound. 
+ // This would happen when len(cols) > 0 and no proper index stats are available. + minNDV := int64(-1) + for _, colStats := range histColl.Columns { + if colStats == nil || colStats.Info == nil { + continue + } + col := colStats.Info + if col.IsGenerated() && !col.GeneratedStored { + continue + } + if (colStats.NDV > 0 && minNDV <= 0) || + colStats.NDV < minNDV { + minNDV = colStats.NDV + } + } + return minNDV } // constructInnerIndexScanTask is specially used to construct the inner plan for PhysicalIndexJoin. @@ -1285,7 +1302,7 @@ func (p *LogicalJoin) constructInnerIndexScanTask( // We use it as an upper bound here. rowCountUpperBound := -1.0 if ds.tableStats != nil { - joinKeyNDV := getColsNDVFromHistColl(innerJoinKeys, ds.tableStats.HistColl) + joinKeyNDV := getColsNDVLowerBoundFromHistColl(innerJoinKeys, ds.tableStats.HistColl) if joinKeyNDV > 0 { rowCountUpperBound = ds.tableStats.RowCount / float64(joinKeyNDV) } From bf1213edb1dba9e8c5cd48deaba623e1d9f39520 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Mon, 13 Mar 2023 20:11:05 +0800 Subject: [PATCH 09/10] add --- planner/core/exhaust_physical_plans.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/planner/core/exhaust_physical_plans.go b/planner/core/exhaust_physical_plans.go index e21e0bf057871..965aa9d83608e 100644 --- a/planner/core/exhaust_physical_plans.go +++ b/planner/core/exhaust_physical_plans.go @@ -1184,8 +1184,11 @@ func getColsNDVLowerBoundFromHistColl(cols []*expression.Column, histColl *stati } } + // TODO: if there's an index that contains the expected columns, we can also make use of its NDV. + // For example, NDV(a,b,c) / NDV(c) is a safe lower bound of NDV(a,b). + // 3. If we still haven't got an NDV, we use the minimal NDV in the column stats as a lower bound. - // This would happen when len(cols) > 0 and no proper index stats are available. 
+ // This would happen when len(cols) > 1 and no proper index stats are available. minNDV := int64(-1) for _, colStats := range histColl.Columns { if colStats == nil || colStats.Info == nil { From 23f34dfc2fb746cbd884498de9b1f11a0dff7d37 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Mon, 13 Mar 2023 20:16:45 +0800 Subject: [PATCH 10/10] add --- planner/core/exhaust_physical_plans.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/planner/core/exhaust_physical_plans.go b/planner/core/exhaust_physical_plans.go index 965aa9d83608e..ddcc94513ab7d 100644 --- a/planner/core/exhaust_physical_plans.go +++ b/planner/core/exhaust_physical_plans.go @@ -1155,6 +1155,9 @@ func getColsNDVLowerBoundFromHistColl(cols []*expression.Column, histColl *stati colUIDs[i] = col.UniqueID } + // Note that we don't need to specially handle prefix indexes in this function, because the NDV of a prefix index is + // equal to or less than that of the corresponding normal index, and that's safe here since we want a lower bound. + // 1. Try to get NDV from column stats if it's a single column. if len(colUIDs) == 1 && histColl.Columns != nil { uid := colUIDs[0]