From 829c5c5bd9521bf4efdbf40e4b4708c178cc3b3b Mon Sep 17 00:00:00 2001
From: Kenan Yao <cauchy1992@gmail.com>
Date: Mon, 25 Feb 2019 23:22:19 +0800
Subject: [PATCH] *: improve row count estimation for columns with NULL

---
 executor/analyze.go            | 98 ++++++++++++++++++++++++++--------
 executor/show_stats_test.go    | 53 +++++++++++++++++-
 planner/core/planbuilder.go    | 16 ++----
 statistics/feedback.go         | 12 ++++-
 statistics/histogram.go        | 80 ++++++++++++++++-----------
 statistics/selectivity_test.go | 59 ++++++++++++++++++++
 statistics/statistics_test.go  |  6 ---
 statistics/table.go            | 26 +++++++--
 util/ranger/points.go          | 12 ++++-
 9 files changed, 285 insertions(+), 77 deletions(-)

diff --git a/executor/analyze.go b/executor/analyze.go
index 1c934b2e368d7..7330bb490f416 100644
--- a/executor/analyze.go
+++ b/executor/analyze.go
@@ -152,8 +152,9 @@ func analyzeIndexPushdown(idxExec *AnalyzeIndexExec) statistics.AnalyzeResult {
 		Cms:             []*statistics.CMSketch{cms},
 		IsIndex:         1,
 	}
+	result.Count = hist.NullCount
 	if hist.Len() > 0 {
-		result.Count = hist.Buckets[hist.Len()-1].Count
+		result.Count += hist.Buckets[hist.Len()-1].Count
 	}
 	return result
 }
@@ -167,12 +168,16 @@ type AnalyzeIndexExec struct {
 	priority        int
 	analyzePB       *tipb.AnalyzeReq
 	result          distsql.SelectResult
+	countNullRes    distsql.SelectResult
 	maxNumBuckets   uint64
 }
 
-func (e *AnalyzeIndexExec) open() error {
+// fetchAnalyzeResult builds and dispatches the `kv.Request` from given ranges, and stores the `SelectResult`
+// in corresponding fields based on the input `isNullRange` argument, which indicates if the range is the
+// special null range for single-column index to get the null count.
+func (e *AnalyzeIndexExec) fetchAnalyzeResult(ranges []*ranger.Range, isNullRange bool) error {
 	var builder distsql.RequestBuilder
-	kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranger.FullRange()).
+	kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranges).
 		SetAnalyzeRequest(e.analyzePB).
 		SetKeepOrder(true).
 		SetConcurrency(e.concurrency).
@@ -181,29 +186,51 @@ func (e *AnalyzeIndexExec) open() error {
 		return errors.Trace(err)
 	}
 	ctx := context.TODO()
-	e.result, err = distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
+	result, err := distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
 	if err != nil {
 		return errors.Trace(err)
 	}
-	e.result.Fetch(ctx)
+	result.Fetch(ctx)
+	if isNullRange {
+		e.countNullRes = result
+	} else {
+		e.result = result
+	}
 	return nil
 }
 
-func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
-	if err = e.open(); err != nil {
-		return nil, nil, errors.Trace(err)
+func (e *AnalyzeIndexExec) open() error {
+	ranges := ranger.FullRange()
+	// For single-column index, we do not load null rows from TiKV, so the built histogram would not include
+	// null values, and its `NullCount` would be set by result of another distsql call to get null rows.
+	// For multi-column index, we cannot define null for the rows, so we still use full range, and the rows
+	// containing null fields would exist in built histograms. Note that, the `NullCount` of histograms for
+	// multi-column index is always 0 then.
+	if len(e.idxInfo.Columns) == 1 {
+		ranges = ranger.FullNotNullRange()
 	}
-	defer func() {
-		if err1 := e.result.Close(); err1 != nil {
-			hist = nil
-			cms = nil
-			err = errors.Trace(err1)
+	err := e.fetchAnalyzeResult(ranges, false)
+	if err != nil {
+		return err
+	}
+	if len(e.idxInfo.Columns) == 1 {
+		ranges = ranger.NullRange()
+		err = e.fetchAnalyzeResult(ranges, true)
+		if err != nil {
+			return err
 		}
-	}()
-	hist = &statistics.Histogram{}
-	cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
+	}
+	return nil
+}
+
+func (e *AnalyzeIndexExec) buildStatsFromResult(result distsql.SelectResult, needCMS bool) (*statistics.Histogram, *statistics.CMSketch, error) {
+	hist := &statistics.Histogram{}
+	var cms *statistics.CMSketch
+	if needCMS {
+		cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
+	}
 	for {
-		data, err := e.result.NextRaw(context.TODO())
+		data, err := result.NextRaw(context.TODO())
 		if err != nil {
 			return nil, nil, errors.Trace(err)
 		}
@@ -217,15 +244,52 @@ func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statis
 		}
 		hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), int(e.maxNumBuckets))
 		if err != nil {
-			return nil, nil, errors.Trace(err)
+			return nil, nil, err
 		}
-		if resp.Cms != nil {
-			err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
-			if err != nil {
-				return nil, nil, errors.Trace(err)
+		if needCMS {
+			if resp.Cms == nil {
+				logutil.Logger(context.TODO()).Warn("nil CMS in response", zap.String("table", e.idxInfo.Table.O), zap.String("index", e.idxInfo.Name.O))
+			} else {
+				err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
+				if err != nil {
+					return nil, nil, errors.Trace(err)
+				}
 			}
 		}
 	}
+	return hist, cms, nil
+}
+
+// buildStats opens the analyze results for the index and merges them into a histogram and a CMSketch.
+// For a single-column index, `NullCount` of the histogram is filled from the separate null-range result.
+// On any error, including one from closing the results, it returns nil statistics.
+func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
+	// Register the cleanup before `open`, so a result fetched by a partially failed `open` is closed as well;
+	// `closeAll` is already given a nil `countNullRes` for multi-column indexes.
+	defer func() {
+		// Keep the first error: a successful close must not reset a failure reported by the steps below.
+		if closeErr := closeAll(e.result, e.countNullRes); closeErr != nil && err == nil {
+			hist, cms, err = nil, nil, closeErr
+		}
+	}()
+	if err = e.open(); err != nil {
+		return nil, nil, err
+	}
+	hist, cms, err = e.buildStatsFromResult(e.result, true)
+	if err != nil {
+		return nil, nil, err
+	}
+	if e.countNullRes != nil {
+		var nullHist *statistics.Histogram
+		nullHist, _, err = e.buildStatsFromResult(e.countNullRes, false)
+		if err != nil {
+			return nil, nil, err
+		}
+		// The count of the last bucket is taken as the number of rows in the null range.
+		if l := nullHist.Len(); l > 0 {
+			hist.NullCount = nullHist.Buckets[l-1].Count
+		}
+	}
 	hist.ID = e.idxInfo.ID
 	return hist, cms, nil
 }
diff --git a/executor/show_stats_test.go b/executor/show_stats_test.go
index 2786ac65498ef..af01376d9f798 100644
--- a/executor/show_stats_test.go
+++ b/executor/show_stats_test.go
@@ -82,7 +82,58 @@ func (s *testSuite1) TestShowStatsHasNullValue(c *C) {
 	tk.MustExec("create table t (a int, index idx(a))")
 	tk.MustExec("insert into t values(NULL)")
 	tk.MustExec("analyze table t")
-	tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t  idx 1 0 1 1 NULL NULL"))
+	// Null values are excluded from histogram for single-column index.
+	tk.MustQuery("show stats_buckets").Check(testkit.Rows())
+	tk.MustExec("insert into t values(1)")
+	tk.MustExec("analyze table t")
+	tk.MustQuery("show stats_buckets").Sort().Check(testkit.Rows(
+		"test t  a 0 0 1 1 1 1",
+		"test t  idx 1 0 1 1 1 1",
+	))
+	tk.MustExec("drop table t")
+	tk.MustExec("create table t (a int, b int, index idx(a, b))")
+	tk.MustExec("insert into t values(NULL, NULL)")
+	tk.MustExec("analyze table t")
+	tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t  idx 1 0 1 1 (NULL, NULL) (NULL, NULL)"))
+
+	tk.MustExec("drop table t")
+	tk.MustExec("create table t(a int, b int, c int, index idx_b(b), index idx_c_a(c, a))")
+	tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
+	res := tk.MustQuery("show stats_histograms where table_name = 't'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	tk.MustExec("analyze table t index idx_b")
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_b'")
+	c.Assert(len(res.Rows()), Equals, 1)
+	c.Assert(res.Rows()[0][7], Equals, "4")
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'b'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	tk.MustExec("analyze table t index idx_c_a")
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_c_a'")
+	c.Assert(len(res.Rows()), Equals, 1)
+	c.Assert(res.Rows()[0][7], Equals, "0")
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'c'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'a'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	tk.MustExec("truncate table t")
+	tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
+	res = tk.MustQuery("show stats_histograms where table_name = 't'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	tk.MustExec("analyze table t index")
+	res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
+	c.Assert(len(res.Rows()), Equals, 2)
+	c.Assert(res.Rows()[0][7], Equals, "4")
+	c.Assert(res.Rows()[1][7], Equals, "0")
+	tk.MustExec("truncate table t")
+	tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
+	tk.MustExec("analyze table t")
+	res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
+	c.Assert(len(res.Rows()), Equals, 5)
+	c.Assert(res.Rows()[0][7], Equals, "1")
+	c.Assert(res.Rows()[1][7], Equals, "4")
+	c.Assert(res.Rows()[2][7], Equals, "1")
+	c.Assert(res.Rows()[3][7], Equals, "4")
+	c.Assert(res.Rows()[4][7], Equals, "0")
 }
 
 func (s *testSuite1) TestShowPartitionStats(c *C) {
diff --git a/planner/core/planbuilder.go b/planner/core/planbuilder.go
index 1a327bd3bd880..a8c2468687331 100644
--- a/planner/core/planbuilder.go
+++ b/planner/core/planbuilder.go
@@ -688,11 +688,11 @@ func (b *PlanBuilder) buildCheckIndexSchema(tn *ast.TableName, indexName string)
 // getColsInfo returns the info of index columns, normal columns and primary key.
 func getColsInfo(tn *ast.TableName) (indicesInfo []*model.IndexInfo, colsInfo []*model.ColumnInfo, pkCol *model.ColumnInfo) {
 	tbl := tn.TableInfo
-	if tbl.PKIsHandle {
-		for _, col := range tbl.Columns {
-			if mysql.HasPriKeyFlag(col.Flag) {
-				pkCol = col
-			}
+	for _, col := range tbl.Columns {
+		if tbl.PKIsHandle && mysql.HasPriKeyFlag(col.Flag) {
+			pkCol = col
+		} else {
+			colsInfo = append(colsInfo, col)
 		}
 	}
 	for _, idx := range tn.TableInfo.Indices {
@@ -700,12 +700,6 @@ func getColsInfo(tn *ast.TableName) (indicesInfo []*model.IndexInfo, colsInfo []
 			indicesInfo = append(indicesInfo, idx)
 		}
 	}
-	for _, col := range tbl.Columns {
-		if col == pkCol {
-			continue
-		}
-		colsInfo = append(colsInfo, col)
-	}
 	return
 }
 
diff --git a/statistics/feedback.go b/statistics/feedback.go
index be8b5bac1af69..d5b26a0d7b12b 100644
--- a/statistics/feedback.go
+++ b/statistics/feedback.go
@@ -1077,6 +1077,7 @@ func dumpFeedbackForIndex(h *Handle, q *QueryFeedback, t *Table) error {
 }
 
 func (q *QueryFeedback) dumpRangeFeedback(sc *stmtctx.StatementContext, h *Handle, ran *ranger.Range, rangeCount float64) error {
+	lowIsNull := ran.LowVal[0].IsNull()
 	if q.tp == indexType {
 		lower, err := codec.EncodeKey(sc, nil, ran.LowVal[0])
 		if err != nil {
@@ -1102,8 +1103,17 @@ func (q *QueryFeedback) dumpRangeFeedback(sc *stmtctx.StatementContext, h *Handl
 	ranges := q.hist.SplitRange(sc, []*ranger.Range{ran}, q.tp == indexType)
 	counts := make([]float64, 0, len(ranges))
 	sum := 0.0
-	for _, r := range ranges {
+	for i, r := range ranges {
+		// Though after `SplitRange`, we may have ranges like `[l, r]`, we still use
+		// `betweenRowCount` to compute the estimation since the ranges of feedback are all in `[l, r)`
+		// form, that is to say, we ignore the exclusiveness of ranges from `SplitRange` and just use
+		// its result of boundary values.
 		count := q.hist.betweenRowCount(r.LowVal[0], r.HighVal[0])
+		// We have to include `NullCount` of histogram for [l, r) cases where l is null because `betweenRowCount`
+		// does not include null values of lower bound.
+		if i == 0 && lowIsNull {
+			count += float64(q.hist.NullCount)
+		}
 		sum += count
 		counts = append(counts, count)
 	}
diff --git a/statistics/histogram.go b/statistics/histogram.go
index 09ab5e1f99861..ce47f988ea471 100644
--- a/statistics/histogram.go
+++ b/statistics/histogram.go
@@ -412,27 +412,22 @@ func (hg *Histogram) equalRowCount(value types.Datum) float64 {
 
 // greaterRowCount estimates the row count where the column greater than value.
 func (hg *Histogram) greaterRowCount(value types.Datum) float64 {
-	gtCount := hg.totalRowCount() - hg.lessRowCount(value) - hg.equalRowCount(value)
+	gtCount := hg.notNullCount() - hg.lessRowCount(value) - hg.equalRowCount(value)
 	if gtCount < 0 {
 		gtCount = 0
 	}
 	return gtCount
 }
 
-// greaterAndEqRowCount estimates the row count where the column greater than or equal to value.
-func (hg *Histogram) greaterAndEqRowCount(value types.Datum) float64 {
-	return hg.totalRowCount() - hg.lessRowCount(value)
-}
-
 // lessRowCount estimates the row count where the column less than value.
 func (hg *Histogram) lessRowCountWithBktIdx(value types.Datum) (float64, int) {
-	// all the values is null
+	// All the values are null.
 	if hg.Bounds.NumRows() == 0 {
 		return 0, 0
 	}
 	index, match := hg.Bounds.LowerBound(0, &value)
 	if index == hg.Bounds.NumRows() {
-		return hg.totalRowCount(), hg.Len() - 1
+		return hg.notNullCount(), hg.Len() - 1
 	}
 	// Since we store the lower and upper bound together, so dividing the index by 2 will get the bucket index.
 	bucketIdx := index / 2
@@ -455,21 +450,16 @@ func (hg *Histogram) lessRowCount(value types.Datum) float64 {
 	return result
 }
 
-// lessAndEqRowCount estimates the row count where the column less than or equal to value.
-func (hg *Histogram) lessAndEqRowCount(value types.Datum) float64 {
-	return hg.lessRowCount(value) + hg.equalRowCount(value)
-}
-
 // betweenRowCount estimates the row count where column greater or equal to a and less than b.
 func (hg *Histogram) betweenRowCount(a, b types.Datum) float64 {
 	lessCountA := hg.lessRowCount(a)
 	lessCountB := hg.lessRowCount(b)
 	// If lessCountA is not less than lessCountB, it may be that they fall to the same bucket and we cannot estimate
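-	// the fraction, so we use `totalCount / NDV` to estimate the row count, but the result should not greater than
-	// lessCountB or totalRowCount-lessCountA.
+	// the fraction, so we use `notNullCount / NDV` to estimate the row count, but the result should not be greater than
+	// lessCountB or notNullCount-lessCountA.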
 	if lessCountA >= lessCountB && hg.NDV > 0 {
-		result := math.Min(lessCountB, hg.totalRowCount()-lessCountA)
-		return math.Min(result, hg.totalRowCount()/float64(hg.NDV))
+		result := math.Min(lessCountB, hg.notNullCount()-lessCountA)
+		return math.Min(result, hg.notNullCount()/float64(hg.NDV))
 	}
 	return lessCountB - lessCountA
 }
@@ -478,6 +468,9 @@ func (hg *Histogram) totalRowCount() float64 {
 	return hg.notNullCount() + float64(hg.NullCount)
 }
 
+// notNullCount returns the count of non-null values for column histograms and single-column index histograms.
+// For multi-column index histograms, we cannot define null for a row, so all rows are treated as non-null;
+// that is, notNullCount returns the same value as totalRowCount for multi-column index histograms.
 func (hg *Histogram) notNullCount() float64 {
 	if hg.Len() == 0 {
 		return 0
@@ -780,7 +773,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo
 	if val.IsNull() {
 		return float64(c.NullCount), nil
 	}
-	// all the values is null
+	// All the values are null.
 	if c.Histogram.Bounds.NumRows() == 0 {
 		return 0.0, nil
 	}
@@ -814,18 +807,24 @@ func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
 			}
 			continue
 		}
-		// the interval case.
+		// The interval case.
 		cnt := c.betweenRowCount(rg.LowVal[0], rg.HighVal[0])
-		if c.outOfRange(rg.LowVal[0]) || c.outOfRange(rg.HighVal[0]) {
+		if (c.outOfRange(rg.LowVal[0]) && !rg.LowVal[0].IsNull()) || c.outOfRange(rg.HighVal[0]) {
 			cnt += float64(modifyCount) / outOfRangeBetweenRate
 		}
-		if rg.LowExclude {
+		// `betweenRowCount` returns the count for the [l, h) range, so we adjust `cnt` for the boundaries here.
+		// Note that `cnt` does not include null values, so we need to specially handle the cases
+		// where null is the lower bound.
+		if rg.LowExclude && !rg.LowVal[0].IsNull() {
 			lowCnt, err := c.equalRowCount(sc, rg.LowVal[0], modifyCount)
 			if err != nil {
 				return 0, errors.Trace(err)
 			}
 			cnt -= lowCnt
 		}
+		if !rg.LowExclude && rg.LowVal[0].IsNull() {
+			cnt += float64(c.NullCount)
+		}
 		if !rg.HighExclude {
 			highCnt, err := c.equalRowCount(sc, rg.HighVal[0], modifyCount)
 			if err != nil {
@@ -861,34 +860,49 @@ func (idx *Index) IsInvalid(collPseudo bool) bool {
 	return (collPseudo && idx.NotAccurate()) || idx.totalRowCount() == 0
 }
 
-func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCount int64) float64 {
+var nullKeyBytes, _ = codec.EncodeKey(nil, nil, types.NewDatum(nil))
+
+func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCount int64) (float64, error) {
+	if len(idx.Info.Columns) == 1 {
+		if bytes.Equal(b, nullKeyBytes) {
+			return float64(idx.NullCount), nil
+		}
+	}
 	val := types.NewBytesDatum(b)
 	if idx.NDV > 0 && idx.outOfRange(val) {
-		return float64(modifyCount) / (float64(idx.NDV))
+		return float64(modifyCount) / (float64(idx.NDV)), nil
 	}
 	if idx.CMSketch != nil {
-		return float64(idx.CMSketch.QueryBytes(b))
+		return float64(idx.CMSketch.QueryBytes(b)), nil
 	}
-	return idx.Histogram.equalRowCount(val)
+	return idx.Histogram.equalRowCount(val), nil
 }
 
 func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.Range, modifyCount int64) (float64, error) {
 	totalCount := float64(0)
+	isSingleCol := len(idx.Info.Columns) == 1
 	for _, indexRange := range indexRanges {
 		lb, err := codec.EncodeKey(sc, nil, indexRange.LowVal...)
 		if err != nil {
-			return 0, errors.Trace(err)
+			return 0, err
 		}
 		rb, err := codec.EncodeKey(sc, nil, indexRange.HighVal...)
 		if err != nil {
-			return 0, errors.Trace(err)
+			return 0, err
 		}
 		fullLen := len(indexRange.LowVal) == len(indexRange.HighVal) && len(indexRange.LowVal) == len(idx.Info.Columns)
-		if fullLen && bytes.Equal(lb, rb) {
-			if !indexRange.LowExclude && !indexRange.HighExclude {
-				totalCount += idx.equalRowCount(sc, lb, modifyCount)
+		if bytes.Equal(lb, rb) {
+			if indexRange.LowExclude || indexRange.HighExclude {
+				continue
+			}
+			if fullLen {
+				count, err := idx.equalRowCount(sc, lb, modifyCount)
+				if err != nil {
+					return 0, err
+				}
+				totalCount += count
+				continue
 			}
-			continue
 		}
 		if indexRange.LowExclude {
 			lb = kv.Key(lb).PrefixNext()
@@ -899,9 +913,13 @@ func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*range
 		l := types.NewBytesDatum(lb)
 		r := types.NewBytesDatum(rb)
 		totalCount += idx.betweenRowCount(l, r)
-		if idx.outOfRange(l) || idx.outOfRange(r) {
+		lowIsNull := bytes.Equal(lb, nullKeyBytes)
+		if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) {
 			totalCount += float64(modifyCount) / outOfRangeBetweenRate
 		}
+		if isSingleCol && lowIsNull {
+			totalCount += float64(idx.NullCount)
+		}
 	}
 	if totalCount > idx.totalRowCount() {
 		totalCount = idx.totalRowCount()
diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go
index 429165b6283a4..338b4e3039c6d 100644
--- a/statistics/selectivity_test.go
+++ b/statistics/selectivity_test.go
@@ -360,3 +360,62 @@ func BenchmarkSelectivity(b *testing.B) {
 	})
 	pprof.StopCPUProfile()
 }
+
+func (s *testStatsSuite) TestColumnIndexNullEstimation(c *C) {
+	defer cleanEnv(c, s.store, s.do)
+	testKit := testkit.NewTestKit(c, s.store)
+	testKit.MustExec("use test")
+	testKit.MustExec("drop table if exists t")
+	testKit.MustExec("create table t(a int, b int, c int, index idx_b(b), index idx_c_a(c, a))")
+	testKit.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null);")
+	h := s.do.StatsHandle()
+	c.Assert(h.DumpStatsDeltaToKV(statistics.DumpAll), IsNil)
+	testKit.MustExec("analyze table t")
+	testKit.MustQuery(`explain select b from t where b is null`).Check(testkit.Rows(
+		"IndexReader_6 4.00 root index:IndexScan_5",
+		"└─IndexScan_5 4.00 cop table:t, index:b, range:[NULL,NULL], keep order:false",
+	))
+	testKit.MustQuery(`explain select b from t where b is not null`).Check(testkit.Rows(
+		"IndexReader_6 1.00 root index:IndexScan_5",
+		"└─IndexScan_5 1.00 cop table:t, index:b, range:[-inf,+inf], keep order:false",
+	))
+	testKit.MustQuery(`explain select b from t where b is null or b > 3`).Check(testkit.Rows(
+		"IndexReader_6 4.00 root index:IndexScan_5",
+		"└─IndexScan_5 4.00 cop table:t, index:b, range:[NULL,NULL], (3,+inf], keep order:false",
+	))
+	testKit.MustQuery(`explain select b from t use index(idx_b)`).Check(testkit.Rows(
+		"IndexReader_5 5.00 root index:IndexScan_4",
+		"└─IndexScan_4 5.00 cop table:t, index:b, range:[NULL,+inf], keep order:false",
+	))
+	testKit.MustQuery(`explain select b from t where b < 4`).Check(testkit.Rows(
+		"IndexReader_6 1.00 root index:IndexScan_5",
+		"└─IndexScan_5 1.00 cop table:t, index:b, range:[-inf,4), keep order:false",
+	))
+	// Make sure column stats have been loaded.
+	testKit.MustExec(`explain select * from t where a is null`)
+	c.Assert(h.LoadNeededHistograms(), IsNil)
+	testKit.MustQuery(`explain select * from t where a is null`).Check(testkit.Rows(
+		"TableReader_7 1.00 root data:Selection_6",
+		"└─Selection_6 1.00 cop isnull(test.t.a)",
+		"  └─TableScan_5 5.00 cop table:t, range:[-inf,+inf], keep order:false",
+	))
+	testKit.MustQuery(`explain select * from t where a is not null`).Check(testkit.Rows(
+		"TableReader_7 4.00 root data:Selection_6",
+		"└─Selection_6 4.00 cop not(isnull(test.t.a))",
+		"  └─TableScan_5 5.00 cop table:t, range:[-inf,+inf], keep order:false",
+	))
+	testKit.MustQuery(`explain select * from t where a is null or a > 3`).Check(testkit.Rows(
+		"TableReader_7 2.00 root data:Selection_6",
+		"└─Selection_6 2.00 cop or(isnull(test.t.a), gt(test.t.a, 3))",
+		"  └─TableScan_5 5.00 cop table:t, range:[-inf,+inf], keep order:false",
+	))
+	testKit.MustQuery(`explain select * from t`).Check(testkit.Rows(
+		"TableReader_5 5.00 root data:TableScan_4",
+		"└─TableScan_4 5.00 cop table:t, range:[-inf,+inf], keep order:false",
+	))
+	testKit.MustQuery(`explain select * from t where a < 4`).Check(testkit.Rows(
+		"TableReader_7 3.00 root data:Selection_6",
+		"└─Selection_6 3.00 cop lt(test.t.a, 4)",
+		"  └─TableScan_5 5.00 cop table:t, range:[-inf,+inf], keep order:false",
+	))
+}
diff --git a/statistics/statistics_test.go b/statistics/statistics_test.go
index cfd2f4eb51a12..316dbd90f24fd 100644
--- a/statistics/statistics_test.go
+++ b/statistics/statistics_test.go
@@ -314,12 +314,6 @@ func (s *testStatisticsSuite) TestBuild(c *C) {
 	c.Check(int(count), Equals, 20000)
 	count = col.betweenRowCount(types.NewIntDatum(30000), types.NewIntDatum(35000))
 	c.Check(int(count), Equals, 5000)
-	count = col.greaterAndEqRowCount(types.NewIntDatum(1001))
-	c.Check(int(count), Equals, 98999)
-	count = col.lessAndEqRowCount(types.NewIntDatum(99999))
-	c.Check(int(count), Equals, 100000)
-	count = col.lessAndEqRowCount(types.Datum{})
-	c.Check(int(count), Equals, 0)
 	count = col.greaterRowCount(types.NewIntDatum(1001))
 	c.Check(int(count), Equals, 98998)
 	count = col.lessRowCount(types.NewIntDatum(99999))
diff --git a/statistics/table.go b/statistics/table.go
index eea3c5c43fb8b..9f9b3bb3977e1 100644
--- a/statistics/table.go
+++ b/statistics/table.go
@@ -355,7 +355,7 @@ func (t *Table) ColumnGreaterRowCount(sc *stmtctx.StatementContext, value types.
 	return c.greaterRowCount(value) * c.getIncreaseFactor(t.Count)
 }
 
-// ColumnLessRowCount estimates the row count where the column less than value.
+// ColumnLessRowCount estimates the row count where the column less than value. Note that null values are not counted.
 func (t *Table) ColumnLessRowCount(sc *stmtctx.StatementContext, value types.Datum, colID int64) float64 {
 	c, ok := t.Columns[colID]
 	if !ok || c.IsInvalid(sc, t.Pseudo) {
@@ -370,7 +370,11 @@ func (t *Table) ColumnBetweenRowCount(sc *stmtctx.StatementContext, a, b types.D
 	if !ok || c.IsInvalid(sc, t.Pseudo) {
 		return float64(t.Count) / pseudoBetweenRate
 	}
-	return c.betweenRowCount(a, b) * c.getIncreaseFactor(t.Count)
+	count := c.betweenRowCount(a, b)
+	if a.IsNull() {
+		count += float64(c.NullCount)
+	}
+	return count * c.getIncreaseFactor(t.Count)
 }
 
 // ColumnEqualRowCount estimates the row count where the column equals to value.
@@ -509,13 +513,27 @@ func (coll *HistColl) GenerateHistCollFromColumnInfo(infos []*model.ColumnInfo,
 	return newColl
 }
 
+// isSingleColIdxNullRange reports whether `ran` is the [NULL, NULL] range on a single-column index.
+// It returns false as soon as the index has more than one column; otherwise it only inspects the
+// first low and high values of the range, and both of them must be NULL.
+// The exclusiveness flags of the range are not consulted.
+func isSingleColIdxNullRange(idx *Index, ran *ranger.Range) bool {
+	if len(idx.Info.Columns) > 1 {
+		return false
+	}
+	low, high := ran.LowVal[0], ran.HighVal[0]
+	return low.IsNull() && high.IsNull()
+}
+
 func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idxID int64, indexRanges []*ranger.Range) (float64, error) {
 	idx := coll.Indices[idxID]
 	totalCount := float64(0)
 	for _, ran := range indexRanges {
 		rangePosition := getOrdinalOfRangeCond(sc, ran)
-		// first one is range, just use the previous way to estimate
-		if rangePosition == 0 {
+		// If the first one is a range, just use the previous way to estimate; if it is a [NULL, NULL] range
+		// on a single-column index, use the previous way as well, because CMSketch does not contain null
+		// values in this case.
+		if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) {
 			count, err := idx.getRowCount(sc, []*ranger.Range{ran}, coll.ModifyCount)
 			if err != nil {
 				return 0, errors.Trace(err)
diff --git a/util/ranger/points.go b/util/ranger/points.go
index db07588a861f6..9e73643068394 100644
--- a/util/ranger/points.go
+++ b/util/ranger/points.go
@@ -134,11 +134,21 @@ func FullIntRange(isUnsigned bool) []*Range {
 	return []*Range{{LowVal: []types.Datum{types.NewIntDatum(math.MinInt64)}, HighVal: []types.Datum{types.NewIntDatum(math.MaxInt64)}}}
 }
 
-// FullRange is (-∞, +∞) for Range.
+// FullRange is [null, +∞) for Range.
 func FullRange() []*Range {
 	return []*Range{{LowVal: []types.Datum{{}}, HighVal: []types.Datum{types.MaxValueDatum()}}}
 }
 
+// FullNotNullRange is (-∞, +∞) for Range; its lower bound is MinNotNullDatum rather than the null Datum of FullRange.
+func FullNotNullRange() []*Range {
+	return []*Range{{LowVal: []types.Datum{types.MinNotNullDatum()}, HighVal: []types.Datum{types.MaxValueDatum()}}}
+}
+
+// NullRange is [null, null] for Range; both bounds are the zero-value Datum and neither is marked exclusive.
+func NullRange() []*Range {
+	return []*Range{{LowVal: []types.Datum{{}}, HighVal: []types.Datum{{}}}}
+}
+
 // builder is the range builder struct.
 type builder struct {
 	err error