diff --git a/src/mongo/db/pipeline/accumulator_internal_construct_stats.cpp b/src/mongo/db/pipeline/accumulator_internal_construct_stats.cpp index 2500c3d0a7cb6..21132716bd3c0 100644 --- a/src/mongo/db/pipeline/accumulator_internal_construct_stats.cpp +++ b/src/mongo/db/pipeline/accumulator_internal_construct_stats.cpp @@ -44,7 +44,7 @@ #include "mongo/db/pipeline/expression_context.h" #include "mongo/db/pipeline/variables.h" #include "mongo/db/query/allowed_contexts.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/max_diff.h" #include "mongo/db/query/stats/stats_gen.h" #include "mongo/db/query/stats/value_utils.h" @@ -120,8 +120,8 @@ Value AccumulatorInternalConstructStats::getValue(bool toBeMerged) { uassert(8423374, "Can not merge analyze pipelines", !toBeMerged); // Generate and serialize maxdiff histogram for scalar and array values. - auto arrayHistogram = stats::createArrayEstimator(_values, _params.getNumberBuckets()); - auto stats = stats::makeStatistics(_count, _params.getSampleRate(), arrayHistogram); + auto ceHistogram = stats::createCEHistogram(_values, _params.getNumberBuckets()); + auto stats = stats::makeStatistics(_count, _params.getSampleRate(), ceHistogram); return Value(stats); } diff --git a/src/mongo/db/query/ce/SConscript b/src/mongo/db/query/ce/SConscript index 8ab4b9b6ff96b..ec2dc7841af7d 100644 --- a/src/mongo/db/query/ce/SConscript +++ b/src/mongo/db/query/ce/SConscript @@ -65,7 +65,7 @@ env.CppUnitTest( env.Benchmark( target="histogram_bm", - source=["array_histogram_bm.cpp"], + source=["ce_histogram_bm.cpp"], LIBDEPS=[ "$BUILD_DIR/mongo/db/query/query_test_service_context", "ce_test_utils", diff --git a/src/mongo/db/query/ce/array_histogram_bm.cpp b/src/mongo/db/query/ce/ce_histogram_bm.cpp similarity index 98% rename from src/mongo/db/query/ce/array_histogram_bm.cpp rename to src/mongo/db/query/ce/ce_histogram_bm.cpp index ce77cba128b7b..35dbdf16d98e2 
100644 --- a/src/mongo/db/query/ce/array_histogram_bm.cpp +++ b/src/mongo/db/query/ce/ce_histogram_bm.cpp @@ -121,7 +121,7 @@ void BM_CreateHistogram(benchmark::State& state) { for (auto curState : state) { // Built histogram. - auto arrHist = stats::createArrayEstimator(data, configuration.numberOfBuckets); + auto ceHist = stats::createCEHistogram(data, configuration.numberOfBuckets); } } @@ -170,7 +170,7 @@ void BM_RunHistogramEstimations(benchmark::State& state) { } // Build histogram. - auto arrHist = stats::createArrayEstimator(data, configuration.numberOfBuckets); + auto ceHist = stats::createCEHistogram(data, configuration.numberOfBuckets); TypeProbability typeCombinationQuery{configuration.sbeDataType, 100}; @@ -181,7 +181,7 @@ void BM_RunHistogramEstimations(benchmark::State& state) { configuration.dataInterval, typeCombinationQuery, data, - arrHist, + ceHist, true /*includeScalar*/, false /*useE2EAPI*/, seed); diff --git a/src/mongo/db/query/ce/generated_histograms_test.cpp b/src/mongo/db/query/ce/generated_histograms_test.cpp index 7117e62aa33e5..74a9b6e80b81e 100644 --- a/src/mongo/db/query/ce/generated_histograms_test.cpp +++ b/src/mongo/db/query/ce/generated_histograms_test.cpp @@ -33,7 +33,7 @@ namespace mongo::ce { namespace { namespace value = sbe::value; -using stats::ArrayHistogram; +using stats::CEHistogram; using stats::ScalarHistogram; using stats::TypeCounts; @@ -72,7 +72,7 @@ TEST(EstimatorTest, UniformIntStrEstimate) { constexpr double collCard = 1000.0; const ScalarHistogram hist = createHistogram(data); - const auto arrHist = ArrayHistogram::make( + const auto ceHist = CEHistogram::make( hist, TypeCounts{{value::TypeTags::NumberInt64, 515}, {value::TypeTags::StringSmall, 485}}, collCard); @@ -96,7 +96,7 @@ TEST(EstimatorTest, UniformIntStrEstimate) { // Range query crossing the type brackets. // Actual cardinality {$gt: 100} = 475. 
- expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, false /* lowInclusive */, value::TypeTags::NumberInt64, value::bitcastFrom(100), @@ -107,7 +107,7 @@ TEST(EstimatorTest, UniformIntStrEstimate) { ASSERT_CE_APPROX_EQUAL(460.1, expectedCard.card, kErrorBound); // Actual cardinality {$lt: 'abc'} = 291. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, true /* lowInclusive */, tagLowStr, valLowStr, @@ -118,7 +118,7 @@ TEST(EstimatorTest, UniformIntStrEstimate) { ASSERT_CE_APPROX_EQUAL(319.9, expectedCard.card, kErrorBound); // Actual cardinality {$gte: 'abc'} = 194. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, true /* lowInclusive */, tagAbc, valAbc, @@ -130,11 +130,11 @@ TEST(EstimatorTest, UniformIntStrEstimate) { // Queries over the low string bound. // Actual cardinality {$eq: ''} = 0. - expectedCard = estimateCardinalityEq(*arrHist, tagLowStr, valLowStr, true); + expectedCard = estimateCardinalityEq(*ceHist, tagLowStr, valLowStr, true); ASSERT_CE_APPROX_EQUAL(2.727, expectedCard.card, 0.001); // Actual cardinality {$gt: ''} = 485. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, false /* lowInclusive */, tagLowStr, valLowStr, @@ -222,7 +222,7 @@ TEST(EstimatorTest, IntStrArrayEstimate) { {value::TypeTags::Array, 293}}; TypeCounts arrayTypeCounts{{value::TypeTags::NumberInt64, 282}, {value::TypeTags::StringSmall, 222}}; - const auto arrHist = ArrayHistogram::make( + const auto ceHist = CEHistogram::make( scalarHist, typeCounts, uniqueHist, minHist, maxHist, arrayTypeCounts, collCard); const auto [tagLowDbl, valLowDbl] = @@ -232,7 +232,7 @@ TEST(EstimatorTest, IntStrArrayEstimate) { value::ValueGuard vgLowStr(tagLowStr, valLowStr); // Actual cardinality {$lt: 100} = 115. 
- EstimationResult expectedCard = estimateCardinalityRange(*arrHist, + EstimationResult expectedCard = estimateCardinalityRange(*ceHist, false /* lowInclusive */, tagLowDbl, valLowDbl, @@ -243,7 +243,7 @@ TEST(EstimatorTest, IntStrArrayEstimate) { ASSERT_CE_APPROX_EQUAL(109.9, expectedCard.card, kErrorBound); // Actual cardinality {$gt: 502} = 434. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, false /* lowInclusive */, value::TypeTags::NumberInt64, value::bitcastFrom(500), @@ -254,7 +254,7 @@ TEST(EstimatorTest, IntStrArrayEstimate) { ASSERT_CE_APPROX_EQUAL(443.8, expectedCard.card, kErrorBound); // Actual cardinality {$gte: 502} = 437. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, true /* lowInclusive */, value::TypeTags::NumberInt64, value::bitcastFrom(500), @@ -265,17 +265,17 @@ TEST(EstimatorTest, IntStrArrayEstimate) { ASSERT_CE_APPROX_EQUAL(448.3, expectedCard.card, kErrorBound); // Actual cardinality {$eq: ''} = 0. - expectedCard = estimateCardinalityEq(*arrHist, tagLowStr, valLowStr, true /* includeScalar */); + expectedCard = estimateCardinalityEq(*ceHist, tagLowStr, valLowStr, true /* includeScalar */); ASSERT_CE_APPROX_EQUAL(6.69, expectedCard.card, 0.001); // Actual cardinality {$eq: 'DD2'} = 2. auto [tagStr, valStr] = value::makeNewString("DD2"_sd); value::ValueGuard vg(tagStr, valStr); - expectedCard = estimateCardinalityEq(*arrHist, tagStr, valStr, true /* includeScalar */); + expectedCard = estimateCardinalityEq(*ceHist, tagStr, valStr, true /* includeScalar */); ASSERT_CE_APPROX_EQUAL(5.27, expectedCard.card, kErrorBound); // Actual cardinality {$lte: 'DD2'} = 120. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, true /* lowInclusive */, tagLowStr, valLowStr, @@ -288,7 +288,7 @@ TEST(EstimatorTest, IntStrArrayEstimate) { // Actual cardinality {$gt: 'DD2'} = 450. 
auto [tagObj, valObj] = value::makeNewObject(); value::ValueGuard vgObj(tagObj, valObj); - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, false /* lowInclusive */, tagStr, valStr, @@ -303,11 +303,11 @@ TEST(EstimatorTest, IntStrArrayEstimate) { std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom(603)); // Actual cardinality {$match: {a: {$elemMatch: {$eq: 603}}}} = 12. - expectedCard = estimateCardinalityEq(*arrHist, tagInt, valInt, false /* includeScalar */); + expectedCard = estimateCardinalityEq(*ceHist, tagInt, valInt, false /* includeScalar */); ASSERT_CE_APPROX_EQUAL(12.0, expectedCard.card, kErrorBound); // Actual cardinality {$match: {a: {$elemMatch: {$lte: 603}}}} = 252. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, false /* lowInclusive */, tagLowDbl, valLowDbl, @@ -318,7 +318,7 @@ TEST(EstimatorTest, IntStrArrayEstimate) { ASSERT_CE_APPROX_EQUAL(293.0, expectedCard.card, kErrorBound); // Actual cardinality {$match: {a: {$elemMatch: {$gte: 603}}}} = 200. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, true /* lowInclusive */, tagInt, valInt, @@ -330,11 +330,11 @@ TEST(EstimatorTest, IntStrArrayEstimate) { // Actual cardinality {$match: {a: {$elemMatch: {$eq: 'cu'}}}} = 7. std::tie(tagStr, valStr) = value::makeNewString("cu"_sd); - expectedCard = estimateCardinalityEq(*arrHist, tagStr, valStr, false /* includeScalar */); + expectedCard = estimateCardinalityEq(*ceHist, tagStr, valStr, false /* includeScalar */); ASSERT_CE_APPROX_EQUAL(3.8, expectedCard.card, kErrorBound); // Actual cardinality {$match: {a: {$elemMatch: {$gte: 'cu'}}}} = 125. 
- expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, true /* lowInclusive */, tagStr, valStr, @@ -345,7 +345,7 @@ TEST(EstimatorTest, IntStrArrayEstimate) { ASSERT_CE_APPROX_EQUAL(109.7, expectedCard.card, kErrorBound); // Actual cardinality {$match: {a: {$elemMatch: {$lte: 'cu'}}}} = 141. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, true /* lowInclusive */, tagLowStr, valLowStr, diff --git a/src/mongo/db/query/ce/histogram_accuracy_test_utils.cpp b/src/mongo/db/query/ce/histogram_accuracy_test_utils.cpp index 2b04bb72b88ea..4278e18848211 100644 --- a/src/mongo/db/query/ce/histogram_accuracy_test_utils.cpp +++ b/src/mongo/db/query/ce/histogram_accuracy_test_utils.cpp @@ -301,7 +301,7 @@ ErrorCalculationSummary runQueries(size_t size, const std::pair interval, const std::pair queryTypeInfo, const std::vector& data, - const std::shared_ptr arrHist, + const std::shared_ptr ceHist, bool includeScalar, bool useE2EAPI, const size_t seed) { @@ -355,7 +355,7 @@ ErrorCalculationSummary runQueries(size_t size, // Estimate result. estimatedCard = estimateCardinalityEq( - *arrHist, queryTypeInfo.first, sbeValLow[i].getValue(), includeScalar); + *ceHist, queryTypeInfo.first, sbeValLow[i].getValue(), includeScalar); break; } case kRange: { @@ -371,7 +371,7 @@ ErrorCalculationSummary runQueries(size_t size, data, queryTypeInfo.first, sbeValLow[i], sbeValHigh[i]); // Estimate result. - estimatedCard = estimateCardinalityRange(*arrHist, + estimatedCard = estimateCardinalityRange(*ceHist, true /*lowInclusive*/, queryTypeInfo.first, sbeValLow[i].getValue(), @@ -465,7 +465,7 @@ void runAccuracyTestConfiguration(const DataDistributionEnum dataDistribution, } // Build histogram. - auto arrHist = stats::createArrayEstimator(data, numberOfBuckets); + auto ceHist = stats::createCEHistogram(data, numberOfBuckets); // Run queries. 
for (const auto& typeCombinationQuery : typeCombinationsQueries) { @@ -480,7 +480,7 @@ void runAccuracyTestConfiguration(const DataDistributionEnum dataDistribution, queryInterval, typeCombinationQuery, data, - arrHist, + ceHist, includeScalar, useE2EAPI, seed); diff --git a/src/mongo/db/query/ce/histogram_accuracy_test_utils.h b/src/mongo/db/query/ce/histogram_accuracy_test_utils.h index f417f062e51c5..141412aaddb65 100644 --- a/src/mongo/db/query/ce/histogram_accuracy_test_utils.h +++ b/src/mongo/db/query/ce/histogram_accuracy_test_utils.h @@ -118,7 +118,7 @@ ErrorCalculationSummary runQueries(size_t size, std::pair interval, std::pair queryTypeInfo, const std::vector& data, - std::shared_ptr arrHist, + std::shared_ptr ceHist, bool includeScalar, bool useE2EAPI, size_t seed); diff --git a/src/mongo/db/query/ce/histogram_common.h b/src/mongo/db/query/ce/histogram_common.h index 0c525e41167fc..1b7e8af51cb72 100644 --- a/src/mongo/db/query/ce/histogram_common.h +++ b/src/mongo/db/query/ce/histogram_common.h @@ -29,8 +29,7 @@ #pragma once -#include "mongo/db/query/stats/array_histogram.h" -#include "mongo/db/query/stats/scalar_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" namespace mongo::ce { diff --git a/src/mongo/db/query/ce/histogram_estimation_impl.cpp b/src/mongo/db/query/ce/histogram_estimation_impl.cpp index 3f98aa4da5535..46a08d19f738f 100644 --- a/src/mongo/db/query/ce/histogram_estimation_impl.cpp +++ b/src/mongo/db/query/ce/histogram_estimation_impl.cpp @@ -243,7 +243,7 @@ EstimationResult estimateRangeQueryOnArray(const ScalarHistogram& histogramAmin, return highEstimate - lowEstimate; } -// --------------------- ARRAY HISTOGRAM ESTIMATION METHODS --------------------- +// --------------------- CE HISTOGRAM ESTIMATION METHODS --------------------- int compareTypeTags(sbe::value::TypeTags a, sbe::value::TypeTags b) { auto orderOfA = canonicalizeBSONTypeUnsafeLookup(tagToType(a)); @@ -256,24 +256,25 @@ int 
compareTypeTags(sbe::value::TypeTags a, sbe::value::TypeTags b) { return 0; } -EstimationResult estimateCardinalityEq(const stats::ArrayHistogram& ah, +EstimationResult estimateCardinalityEq(const stats::CEHistogram& ceHist, sbe::value::TypeTags tag, sbe::value::Value val, bool includeScalar) { EstimationResult estimation = {0.0 /*card*/, 0.0 /*ndv*/}; // Estimate cardinality for fields containing scalar values if includeScalar is true. if (includeScalar) { - estimation = estimateCardinality(ah.getScalar(), tag, val, EstimationType::kEqual); + estimation = estimateCardinality(ceHist.getScalar(), tag, val, EstimationType::kEqual); } // If histogram includes array data points, calculate cardinality for fields containing array // values. - if (ah.isArray()) { - estimation += estimateCardinality(ah.getArrayUnique(), tag, val, EstimationType::kEqual); + if (ceHist.isArray()) { + estimation += + estimateCardinality(ceHist.getArrayUnique(), tag, val, EstimationType::kEqual); } return estimation; } -EstimationResult estimateCardinalityRange(const stats::ArrayHistogram& ah, +EstimationResult estimateCardinalityRange(const stats::CEHistogram& ceHist, bool lowInclusive, sbe::value::TypeTags tagLow, sbe::value::Value valLow, @@ -293,12 +294,12 @@ EstimationResult estimateCardinalityRange(const stats::ArrayHistogram& ah, }; EstimationResult result = {0.0 /*card*/, 0.0 /*ndv*/}; - if (ah.isArray()) { + if (ceHist.isArray()) { if (includeScalar) { // Range query on array data. - result += estimateRangeQueryOnArray(ah.getArrayMin(), - ah.getArrayMax(), + result += estimateRangeQueryOnArray(ceHist.getArrayMin(), + ceHist.getArrayMax(), lowInclusive, tagLow, valLow, @@ -307,11 +308,11 @@ EstimationResult estimateCardinalityRange(const stats::ArrayHistogram& ah, valHigh); } else { // $elemMatch query on array data. 
- const auto arrayMinEst = estRange(ah.getArrayMin()); - const auto arrayMaxEst = estRange(ah.getArrayMax()); - const auto arrayUniqueEst = estRange(ah.getArrayUnique()); + const auto arrayMinEst = estRange(ceHist.getArrayMin()); + const auto arrayMaxEst = estRange(ceHist.getArrayMax()); + const auto arrayUniqueEst = estRange(ceHist.getArrayUnique()); - const double totalArrayCount = ah.getArrayCount() - ah.getEmptyArrayCount(); + const double totalArrayCount = ceHist.getArrayCount() - ceHist.getEmptyArrayCount(); uassert( 9160701, "Array histograms should contain at least one array", totalArrayCount > 0); @@ -327,7 +328,7 @@ EstimationResult estimateCardinalityRange(const stats::ArrayHistogram& ah, } case EstimationAlgo::HistogramV2: { const double avgArraySize = - getTotals(ah.getArrayUnique()).card / totalArrayCount; + getTotals(ceHist.getArrayUnique()).card / totalArrayCount; const double adjustedUniqueCard = (avgArraySize == 0.0) ? 0.0 : std::min(arrayUniqueEst.card / pow(avgArraySize, 0.2), totalArrayCount); @@ -351,13 +352,13 @@ EstimationResult estimateCardinalityRange(const stats::ArrayHistogram& ah, } if (includeScalar) { - result += estRange(ah.getScalar()); + result += estRange(ceHist.getScalar()); } return {result}; } -bool canEstimateBound(const stats::ArrayHistogram& ah, +bool canEstimateBound(const stats::CEHistogram& ceHist, const sbe::value::TypeTags tag, bool includeScalar) { // if histogrammable, then it's estimable @@ -371,7 +372,7 @@ bool canEstimateBound(const stats::ArrayHistogram& ah, return false; } } - if (ah.isArray()) { + if (ceHist.isArray()) { if (tag != sbe::value::TypeTags::Null) { return false; } @@ -380,11 +381,11 @@ bool canEstimateBound(const stats::ArrayHistogram& ah, } // TODO: SERVER-94855 Supports mixed type intervals with type counts. 
-Cardinality estimateIntervalCardinality(const stats::ArrayHistogram& ah, +Cardinality estimateIntervalCardinality(const stats::CEHistogram& ceHist, const mongo::Interval& interval, bool includeScalar) { if (interval.isFullyOpen()) { - return ah.getSampleSize(); + return ceHist.getSampleSize(); } bool startInclusive = interval.startInclusive; @@ -407,9 +408,9 @@ Cardinality estimateIntervalCardinality(const stats::ArrayHistogram& ah, if (compareTypeTags(startTag, endTag) == 0) { if (stats::canEstimateTypeViaHistogram(startTag)) { if (stats::compareValues(startTag, startVal, endTag, endVal) == 0) { - return estimateCardinalityEq(ah, startTag, startVal, includeScalar).card; + return estimateCardinalityEq(ceHist, startTag, startVal, includeScalar).card; } - return estimateCardinalityRange(ah, + return estimateCardinalityRange(ceHist, startInclusive, startTag, startVal, diff --git a/src/mongo/db/query/ce/histogram_estimation_impl.h b/src/mongo/db/query/ce/histogram_estimation_impl.h index e3accfa8034d3..424fad7183b69 100644 --- a/src/mongo/db/query/ce/histogram_estimation_impl.h +++ b/src/mongo/db/query/ce/histogram_estimation_impl.h @@ -89,23 +89,23 @@ EstimationResult estimateRangeQueryOnArray(const stats::ScalarHistogram& histogr sbe::value::TypeTags tagHigh, sbe::value::Value valHigh); -// --------------------- ARRAY HISTOGRAM ESTIMATION METHODS --------------------- +// --------------------- CE HISTOGRAM ESTIMATION METHODS --------------------- /** - * Estimates the cardinality of an equality predicate given an ArrayHistogram and an SBE value and + * Estimates the cardinality of an equality predicate given a CEHistogram and an SBE value and * type tag pair. 
*/ -EstimationResult estimateCardinalityEq(const stats::ArrayHistogram& ah, +EstimationResult estimateCardinalityEq(const stats::CEHistogram& ceHist, sbe::value::TypeTags tag, sbe::value::Value val, bool includeScalar); /** - * Estimates the cardinality of a range predicate given an ArrayHistogram and a range predicate. + * Estimates the cardinality of a range predicate given a CEHistogram and a range predicate. * Set 'includeScalar' to true to indicate whether or not the provided range should include no-array * values. The other fields define the range of the estimation. */ -EstimationResult estimateCardinalityRange(const stats::ArrayHistogram& ah, +EstimationResult estimateCardinalityRange(const stats::CEHistogram& ceHist, bool lowInclusive, sbe::value::TypeTags tagLow, sbe::value::Value valLow, @@ -119,14 +119,14 @@ EstimationResult estimateCardinalityRange(const stats::ArrayHistogram& ah, * Estimates the selectivity of a given interval if histogram estimation is possible. Otherwise, * throw an exception. */ -Cardinality estimateIntervalCardinality(const stats::ArrayHistogram& ah, +Cardinality estimateIntervalCardinality(const stats::CEHistogram& ceHist, const mongo::Interval& interval, bool includeScalar = true); /** * Checks if a given bound can be estimated via either histograms or type counts. 
*/ -bool canEstimateBound(const stats::ArrayHistogram& ah, +bool canEstimateBound(const stats::CEHistogram& ceHist, sbe::value::TypeTags tag, bool includeScalar); diff --git a/src/mongo/db/query/ce/histogram_estimation_impl_test.cpp b/src/mongo/db/query/ce/histogram_estimation_impl_test.cpp index bbbdd5b449b99..cfb7fbd24f7ed 100644 --- a/src/mongo/db/query/ce/histogram_estimation_impl_test.cpp +++ b/src/mongo/db/query/ce/histogram_estimation_impl_test.cpp @@ -36,7 +36,7 @@ namespace mongo::ce { namespace { namespace value = sbe::value; -using stats::ArrayHistogram; +using stats::CEHistogram; using stats::ScalarHistogram; using stats::TypeCounts; @@ -59,7 +59,7 @@ constexpr double kErrorBound = 0.01; // assertions comparing the estimated cardinality with the correct value, or approximate assertions // accepting 'kErrorBound' error. -TEST(ScalarEstimatorInterpolationTest, ManualHistogram) { +TEST(ScalarHistogramEstimatorInterpolationTest, ManualHistogram) { std::vector data{{0, 1.0, 1.0, 1.0}, {10, 1.0, 10.0, 5.0}, {20, 3.0, 15.0, 3.0}, @@ -85,7 +85,7 @@ TEST(ScalarEstimatorInterpolationTest, ManualHistogram) { ASSERT_EQ(21.5, estimateCardinalityScalarHistogramInteger(hist, 25, kGreaterOrEqual)); } -TEST(ScalarEstimatorInterpolationTest, UniformIntEstimate) { +TEST(ScalarHistogramEstimatorInterpolationTest, UniformIntEstimate) { // This hard-codes a maxdiff histogram with 10 buckets built off a uniform int distribution with // a minimum of 0, a maximum of 1000, and 70 distinct values. std::vector data{{2, 1, 0, 0}, @@ -139,7 +139,7 @@ TEST(ScalarEstimatorInterpolationTest, UniformIntEstimate) { ASSERT_APPROX_EQUAL(43.0, expectedCard, 0.1); // Actual: 40. } -TEST(ScalarEstimatorInterpolationTest, NormalIntEstimate) { +TEST(ScalarHistogramEstimatorInterpolationTest, NormalIntEstimate) { // This hard-codes a maxdiff histogram with 10 buckets built off a normal int distribution with // a minimum of 0, a maximum of 1000, and 70 distinct values. 
std::vector data{{2, 1, 0, 0}, @@ -180,7 +180,7 @@ TEST(ScalarEstimatorInterpolationTest, NormalIntEstimate) { ASSERT_APPROX_EQUAL(10.4, expectedCard, 0.1); // Actual: 10. } -TEST(ScalarEstimatorInterpolationTest, UniformStrEstimate) { +TEST(ScalarHistogramEstimatorInterpolationTest, UniformStrEstimate) { // This hard-codes a maxdiff histogram with 10 buckets built off a uniform string distribution // with a minimum length of 3, a maximum length of 5, and 80 distinct values. std::vector data{{{"0ejz", 2, 0, 0}, @@ -212,7 +212,7 @@ TEST(ScalarEstimatorInterpolationTest, UniformStrEstimate) { ASSERT_APPROX_EQUAL(41.3, expectedCard, 0.1); // Actual: 41. } -TEST(ScalarEstimatorInterpolationTest, NormalStrEstimate) { +TEST(ScalarHistogramEstimatorInterpolationTest, NormalStrEstimate) { // This hard-codes a maxdiff histogram with 10 buckets built off a normal string distribution // with a minimum length of 3, a maximum length of 5, and 80 distinct values. std::vector data{{ @@ -261,7 +261,7 @@ TEST(ScalarEstimatorInterpolationTest, NormalStrEstimate) { ASSERT_APPROX_EQUAL(40.0, expectedCard, 0.1); // Actual: 37. } -TEST(ScalarEstimatorEdgeCasesTest, OneBucketIntHistogram) { +TEST(ScalarHistogramEstimatorEdgeCasesTest, OneBucketIntHistogram) { std::vector data{{100, 3.0, 27.0, 9.0}}; const ScalarHistogram hist = createHistogram(data); @@ -290,7 +290,7 @@ TEST(ScalarEstimatorEdgeCasesTest, OneBucketIntHistogram) { ASSERT_EQ(0.0, estimateCardinality(hist, NumberInt64, 1000, kGreaterOrEqual).card); } -TEST(ScalarEstimatorEdgeCasesTest, OneExclusiveBucketIntHistogram) { +TEST(ScalarHistogramEstimatorEdgeCasesTest, OneExclusiveBucketIntHistogram) { // Data set of a single value. // By exclusive bucket we mean a bucket with only boundary, that is the range frequency and // NDV are zero. 
@@ -314,7 +314,7 @@ TEST(ScalarEstimatorEdgeCasesTest, OneExclusiveBucketIntHistogram) { ASSERT_EQ(0.0, estimateCardinality(hist, NumberInt64, 1000, kGreater).card); } -TEST(ScalarEstimatorEdgeCasesTest, OneBucketTwoIntValuesHistogram) { +TEST(ScalarHistogramEstimatorEdgeCasesTest, OneBucketTwoIntValuesHistogram) { // Data set of two values, example {5, 100, 100}. std::vector data{{100, 2.0, 1.0, 1.0}}; const ScalarHistogram hist = createHistogram(data); @@ -335,7 +335,7 @@ TEST(ScalarEstimatorEdgeCasesTest, OneBucketTwoIntValuesHistogram) { ASSERT_EQ(0.0, estimateCardinality(hist, NumberInt64, 1000, kGreater).card); } -TEST(ScalarEstimatorEdgeCasesTest, OneBucketTwoIntValuesHistogram2) { +TEST(ScalarHistogramEstimatorEdgeCasesTest, OneBucketTwoIntValuesHistogram2) { std::vector data{{100, 2.0, 3.0, 1.0}}; const ScalarHistogram hist = createHistogram(data); @@ -355,7 +355,7 @@ TEST(ScalarEstimatorEdgeCasesTest, OneBucketTwoIntValuesHistogram2) { ASSERT_EQ(0.0, estimateCardinality(hist, NumberInt64, 1000, kGreater).card); } -TEST(ScalarEstimatorEdgeCasesTest, TwoBucketsIntHistogram) { +TEST(ScalarHistogramEstimatorEdgeCasesTest, TwoBucketsIntHistogram) { std::vector data{{1, 1.0, 0.0, 0.0}, {100, 3.0, 26.0, 8.0}}; const ScalarHistogram hist = createHistogram(data); @@ -401,7 +401,7 @@ TEST(ScalarEstimatorEdgeCasesTest, TwoBucketsIntHistogram) { 19.38, estimateCardinality(hist, NumberInt64, 50, kGreaterOrEqual).card, kErrorBound); } -TEST(ScalarEstimatorEdgeCasesTest, ThreeExclusiveBucketsIntHistogram) { +TEST(ScalarHistogramEstimatorEdgeCasesTest, ThreeExclusiveBucketsIntHistogram) { std::vector data{{1, 1.0, 0.0, 0.0}, {10, 8.0, 0.0, 0.0}, {100, 1.0, 0.0, 0.0}}; const ScalarHistogram hist = createHistogram(data); @@ -414,7 +414,7 @@ TEST(ScalarEstimatorEdgeCasesTest, ThreeExclusiveBucketsIntHistogram) { ASSERT_EQ(9.0, estimateCardinality(hist, NumberInt64, 5, kGreaterOrEqual).card); } -TEST(ScalarEstimatorEdgeCasesTest, OneBucketStrHistogram) { 
+TEST(ScalarHistogramEstimatorEdgeCasesTest, OneBucketStrHistogram) { std::vector data{{"xyz", 3.0, 27.0, 9.0}}; const ScalarHistogram hist = createHistogram(data); @@ -455,7 +455,7 @@ TEST(ScalarEstimatorEdgeCasesTest, OneBucketStrHistogram) { ASSERT_EQ(0.0, estimateCardinality(hist, tag, value, kGreater).card); } -TEST(ScalarEstimatorEdgeCasesTest, TwoBucketsStrHistogram) { +TEST(ScalarHistogramEstimatorEdgeCasesTest, TwoBucketsStrHistogram) { // Data set of 100 strings in the range ["abc", "xyz"], with average frequency of 2. std::vector data{{"abc", 2.0, 0.0, 0.0}, {"xyz", 3.0, 95.0, 48.0}}; const ScalarHistogram hist = createHistogram(data); @@ -508,7 +508,7 @@ TEST(ScalarEstimatorEdgeCasesTest, TwoBucketsStrHistogram) { 4.98, estimateCardinality(hist, tag, value, kGreaterOrEqual).card, kErrorBound); } -TEST(ScalarEstimatorEdgeCasesTest, TwoBucketsDateHistogram) { +TEST(ScalarHistogramEstimatorEdgeCasesTest, TwoBucketsDateHistogram) { // June 6, 2017 -- June 7, 2017. const int64_t startInstant = 1496777923000LL; const int64_t endInstant = 1496864323000LL; @@ -548,7 +548,7 @@ TEST(ScalarEstimatorEdgeCasesTest, TwoBucketsDateHistogram) { ASSERT_EQ(0.0, estimateCardinality(hist, Date, valueAfter, kGreater).card); } -TEST(ScalarEstimatorEdgeCasesTest, TwoBucketsTimestampHistogram) { +TEST(ScalarHistogramEstimatorEdgeCasesTest, TwoBucketsTimestampHistogram) { // June 6, 2017 -- June 7, 2017 in seconds. 
const int64_t startInstant = 1496777923LL; const int64_t endInstant = 1496864323LL; @@ -589,7 +589,7 @@ TEST(ScalarEstimatorEdgeCasesTest, TwoBucketsTimestampHistogram) { ASSERT_EQ(0.0, estimateCardinality(hist, TimeStamp, valueAfter, kGreater).card); } -TEST(ScalarEstimatorEdgeCasesTest, TwoBucketsObjectIdHistogram) { +TEST(ScalarHistogramEstimatorEdgeCasesTest, TwoBucketsObjectIdHistogram) { const auto startOid = OID("63340d8d27afef2de7357e8d"); const auto endOid = OID("63340dbed6cd8af737d4139a"); ASSERT_TRUE(startOid < endOid); @@ -641,7 +641,7 @@ TEST(ScalarEstimatorEdgeCasesTest, TwoBucketsObjectIdHistogram) { * the cardinality estimates are precise. To test the approximate estimation, we force the histogram * generation to use one bucket per type (except the first numeric type). */ -TEST(ScalarEstimatorEdgeCasesTest, MinValueMixedHistogramFromData) { +TEST(ScalarHistogramEstimatorEdgeCasesTest, MinValueMixedHistogramFromData) { const int64_t startInstant = 1506777923000LL; const int64_t endInstant = 1516864323000LL; const Timestamp startTs{Seconds(1516864323LL), 0}; @@ -824,7 +824,7 @@ TEST(ScalarEstimatorEdgeCasesTest, MinValueMixedHistogramFromData) { ASSERT_EQ(3.0, expectedCard.card); } -TEST(ScalarEstimatorEdgeCasesTest, MinValueMixedHistogramFromBuckets) { +TEST(ScalarHistogramEstimatorEdgeCasesTest, MinValueMixedHistogramFromBuckets) { const auto endOid = OID("63340dbed6cd8af737d4139a"); const auto endDate = Date_t::fromMillisSinceEpoch(1526864323000LL); const Timestamp endTs{Seconds(1526864323LL), 0}; @@ -881,10 +881,10 @@ TEST(ScalarEstimatorEdgeCasesTest, MinValueMixedHistogramFromBuckets) { } -// --------------------- ARRAY HISTOGRAM ESTIMATION TESTS --------------------- +// --------------------- CE HISTOGRAM ESTIMATION TESTS --------------------- // The tests included in this section of the file evaluate the functionality and correctness of the -// ArrayHistogram. 
-// The tests generates array histograms and estimates the frequency of various keys values. The +// CEHistogram. +// The tests generate CE histograms and estimate the frequency of various key values. The // tests either perform exact assertions comparing the estimated cardinality with the correct value, // or approximate assertions accepting 'kErrorBound' error. @@ -957,7 +957,7 @@ static std::string computeRMSE(std::vector& querySet, bool isElemMatc return os.str(); } -TEST(ArrayEstimatorTest, ManualHistogram) { +TEST(CEHistogramEstimatorTest, ManualHistogram) { std::vector data{{0, 1.0, 1.0, 1.0}, {10, 1.0, 10.0, 5.0}, {20, 3.0, 15.0, 3.0}, @@ -966,19 +966,18 @@ TEST(ArrayEstimatorTest, ManualHistogram) { {50, 1.0, 10.0, 5.0}}; const double intCnt = 55; const ScalarHistogram& hist = createHistogram(data); - const auto arrHist = - ArrayHistogram::make(hist, stats::TypeCounts{{NumberInt64, intCnt}}, intCnt); + const auto ceHist = CEHistogram::make(hist, stats::TypeCounts{{NumberInt64, intCnt}}, intCnt); - ASSERT_EQ(3.0, estimateCardinalityEq(*arrHist, NumberInt64, 20, true).card); - ASSERT_EQ(1.0, estimateCardinalityEq(*arrHist, NumberInt64, 50, true).card); + ASSERT_EQ(3.0, estimateCardinalityEq(*ceHist, NumberInt64, 20, true).card); + ASSERT_EQ(1.0, estimateCardinalityEq(*ceHist, NumberInt64, 50, true).card); ASSERT_EQ(0, - estimateCardinalityEq(*arrHist, NumberInt64, 40, false) + estimateCardinalityEq(*ceHist, NumberInt64, 40, false) .card); // should be 2.0 for includeScalar: true // value not in data - ASSERT_EQ(0, estimateCardinalityEq(*arrHist, NumberInt64, 60, true).card); + ASSERT_EQ(0, estimateCardinalityEq(*ceHist, NumberInt64, 60, true).card); } -TEST(ArrayEstimatorTest, UniformIntHistogram) { +TEST(CEHistogramEstimatorTest, UniformIntHistogram) { std::vector data{{2, 1, 0, 0}, {57, 3, 2, 1}, {179, 5, 10, 6}, @@ -991,19 +990,18 @@ TEST(ArrayEstimatorTest, UniformIntHistogram) { {986, 1, 0, 0}}; const ScalarHistogram& hist = createHistogram(data); 
const double intCnt = 100; - const auto arrHist = - ArrayHistogram::make(hist, stats::TypeCounts{{NumberInt64, intCnt}}, intCnt); + const auto ceHist = CEHistogram::make(hist, stats::TypeCounts{{NumberInt64, intCnt}}, intCnt); - ASSERT_EQ(4.0, estimateCardinalityEq(*arrHist, NumberInt64, 558, true).card); + ASSERT_EQ(4.0, estimateCardinalityEq(*ceHist, NumberInt64, 558, true).card); ASSERT_APPROX_EQUAL(1.6, - estimateCardinalityEq(*arrHist, NumberInt64, 530, true).card, + estimateCardinalityEq(*ceHist, NumberInt64, 530, true).card, 0.1); // Actual: 1. ASSERT_APPROX_EQUAL(1.6, - estimateCardinalityEq(*arrHist, NumberInt64, 400, true).card, + estimateCardinalityEq(*ceHist, NumberInt64, 400, true).card, 0.1); // Actual: 1. } -TEST(ArrayEstimatorTest, NormalIntArrayHistogram) { +TEST(CEHistogramEstimatorTest, NormalIntArrayHistogram) { std::vector data{{2, 1, 0, 0}, {317, 8, 20, 15}, {344, 2, 0, 0}, @@ -1016,16 +1014,15 @@ TEST(ArrayEstimatorTest, NormalIntArrayHistogram) { {993, 1, 21, 9}}; const ScalarHistogram hist = createHistogram(data); const double intCnt = 100; - const auto arrHist = - ArrayHistogram::make(hist, stats::TypeCounts{{NumberInt64, intCnt}}, intCnt); + const auto ceHist = CEHistogram::make(hist, stats::TypeCounts{{NumberInt64, intCnt}}, intCnt); - ASSERT_EQ(3.0, estimateCardinalityEq(*arrHist, NumberInt64, 696, true).card); + ASSERT_EQ(3.0, estimateCardinalityEq(*ceHist, NumberInt64, 696, true).card); ASSERT_APPROX_EQUAL(1.3, - estimateCardinalityEq(*arrHist, NumberInt64, 150, true).card, + estimateCardinalityEq(*ceHist, NumberInt64, 150, true).card, 0.1); // Actual: 1. 
} -TEST(ArrayEstimatorTest, SkewedIntHistogram) { +TEST(CEHistogramEstimatorTest, SkewedIntHistogram) { std::vector data{{0, 1.0, 1.0, 1.0}, {10, 150.0, 10.0, 5.0}, {20, 100.0, 14.0, 3.0}, @@ -1034,16 +1031,15 @@ TEST(ArrayEstimatorTest, SkewedIntHistogram) { {50, 1.0, 10.0, 5.0}}; const double intCnt = 300; const ScalarHistogram& hist = createHistogram(data); - const auto arrHist = - ArrayHistogram::make(hist, stats::TypeCounts{{NumberInt64, intCnt}}, intCnt); + const auto ceHist = CEHistogram::make(hist, stats::TypeCounts{{NumberInt64, intCnt}}, intCnt); - ASSERT_EQ(150.0, estimateCardinalityEq(*arrHist, NumberInt64, 10, true).card); - ASSERT_EQ(100.0, estimateCardinalityEq(*arrHist, NumberInt64, 20, true).card); - ASSERT_EQ(1.0, estimateCardinalityEq(*arrHist, NumberInt64, 30, true).card); - ASSERT_EQ(0, estimateCardinalityEq(*arrHist, NumberInt64, 40, false).card); + ASSERT_EQ(150.0, estimateCardinalityEq(*ceHist, NumberInt64, 10, true).card); + ASSERT_EQ(100.0, estimateCardinalityEq(*ceHist, NumberInt64, 20, true).card); + ASSERT_EQ(1.0, estimateCardinalityEq(*ceHist, NumberInt64, 30, true).card); + ASSERT_EQ(0, estimateCardinalityEq(*ceHist, NumberInt64, 40, false).card); } -TEST(ArrayEstimatorTest, StringHistogram) { +TEST(CEHistogramEstimatorTest, StringHistogram) { std::vector data{ {"testA", 5.0, 2.0, 1.0}, {"testB", 3.0, 2.0, 2.0}, {"testC", 2.0, 1.0, 1.0}}; const double strCnt = 15; @@ -1051,21 +1047,21 @@ TEST(ArrayEstimatorTest, StringHistogram) { ASSERT_EQ(strCnt, getTotals(hist).card); - const auto arrHist = ArrayHistogram::make( - hist, stats::TypeCounts{{value::TypeTags::StringSmall, strCnt}}, strCnt); + const auto ceHist = + CEHistogram::make(hist, stats::TypeCounts{{value::TypeTags::StringSmall, strCnt}}, strCnt); auto [tag, value] = value::makeNewString("testA"_sd); value::ValueGuard vg(tag, value); - ASSERT_EQ(5.0, estimateCardinalityEq(*arrHist, tag, value, true).card); + ASSERT_EQ(5.0, estimateCardinalityEq(*ceHist, tag, value, true).card); 
std::tie(tag, value) = value::makeNewString("testB"_sd); - ASSERT_EQ(3.0, estimateCardinalityEq(*arrHist, tag, value, true).card); + ASSERT_EQ(3.0, estimateCardinalityEq(*ceHist, tag, value, true).card); std::tie(tag, value) = value::makeNewString("testC"_sd); - ASSERT_EQ(0, estimateCardinalityEq(*arrHist, tag, value, false).card); + ASSERT_EQ(0, estimateCardinalityEq(*ceHist, tag, value, false).card); } -TEST(ArrayEstimatorTest, UniformStrHistogram) { +TEST(CEHistogramEstimatorTest, UniformStrHistogram) { std::vector data{{{"0ejz", 2, 0, 0}, {"8DCaq", 3, 4, 4}, {"Cy5Kw", 3, 3, 3}, @@ -1079,17 +1075,17 @@ TEST(ArrayEstimatorTest, UniformStrHistogram) { const double strCnt = 100; const ScalarHistogram& hist = createHistogram(data); - const auto arrHist = ArrayHistogram::make( - hist, stats::TypeCounts{{value::TypeTags::StringSmall, strCnt}}, strCnt); + const auto ceHist = + CEHistogram::make(hist, stats::TypeCounts{{value::TypeTags::StringSmall, strCnt}}, strCnt); const auto [tag, value] = value::makeNewString("TTV"_sd); value::ValueGuard vg(tag, value); ASSERT_APPROX_EQUAL( - 1.55, estimateCardinalityEq(*arrHist, tag, value, true).card, 0.1); // Actual: 2. + 1.55, estimateCardinalityEq(*ceHist, tag, value, true).card, 0.1); // Actual: 2. } -TEST(ArrayEstimatorTest, NormalStrHistogram) { +TEST(CEHistogramEstimatorTest, NormalStrHistogram) { std::vector data{{ {"0ejz", 1, 0, 0}, {"4FGjc", 3, 5, 3}, @@ -1105,21 +1101,21 @@ TEST(ArrayEstimatorTest, NormalStrHistogram) { const double strCnt = 100; const ScalarHistogram& hist = createHistogram(data); - const auto arrHist = ArrayHistogram::make( - hist, stats::TypeCounts{{value::TypeTags::StringSmall, strCnt}}, strCnt); + const auto ceHist = + CEHistogram::make(hist, stats::TypeCounts{{value::TypeTags::StringSmall, strCnt}}, strCnt); auto [tag, value] = value::makeNewString("TTV"_sd); value::ValueGuard vg(tag, value); ASSERT_APPROX_EQUAL( - 5.0, estimateCardinalityEq(*arrHist, tag, value, true).card, 0.1); // Actual: 5. 
+ 5.0, estimateCardinalityEq(*ceHist, tag, value, true).card, 0.1); // Actual: 5. std::tie(tag, value) = value::makeNewString("Pfa"_sd); ASSERT_APPROX_EQUAL( - 1.75, estimateCardinalityEq(*arrHist, tag, value, true).card, 0.1); // Actual: 2. + 1.75, estimateCardinalityEq(*ceHist, tag, value, true).card, 0.1); // Actual: 2. } -TEST(ArrayEstimatorTest, IntStrHistogram) { +TEST(CEHistogramEstimatorTest, IntStrHistogram) { std::vector data{{1, 1.0, 0.0, 0.0}, {"test", 20.0, 0.0, 0.0}}; const double intCnt = 1; const double strCnt = 20; @@ -1128,20 +1124,20 @@ TEST(ArrayEstimatorTest, IntStrHistogram) { ASSERT_EQ(totalCnt, getTotals(hist).card); - const auto arrHist = ArrayHistogram::make( + const auto ceHist = CEHistogram::make( hist, stats::TypeCounts{{NumberInt64, intCnt}, {value::TypeTags::StringSmall, strCnt}}, totalCnt); auto [tag, value] = value::makeNewString("test"_sd); value::ValueGuard vg(tag, value); - ASSERT_EQ(20.0, estimateCardinalityEq(*arrHist, tag, value, true).card); - ASSERT_EQ(1.0, estimateCardinalityEq(*arrHist, NumberInt64, 1, true).card); - ASSERT_EQ(0, estimateCardinalityEq(*arrHist, tag, value, false).card); - ASSERT_EQ(0, estimateCardinalityEq(*arrHist, NumberInt64, 1, false).card); + ASSERT_EQ(20.0, estimateCardinalityEq(*ceHist, tag, value, true).card); + ASSERT_EQ(1.0, estimateCardinalityEq(*ceHist, NumberInt64, 1, true).card); + ASSERT_EQ(0, estimateCardinalityEq(*ceHist, tag, value, false).card); + ASSERT_EQ(0, estimateCardinalityEq(*ceHist, NumberInt64, 1, false).card); } -TEST(ArrayEstimatorTest, UniformIntStrHistogram) { +TEST(CEHistogramEstimatorTest, UniformIntStrHistogram) { std::vector data{{ {2, 3, 0, 0}, {19, 4, 1, 1}, {226, 2, 49, 20}, {301, 5, 12, 4}, {317, 3, 0, 0}, {344, 2, 3, 1}, {423, 5, 18, 6}, {445, 3, 0, 0}, @@ -1156,30 +1152,30 @@ TEST(ArrayEstimatorTest, UniformIntStrHistogram) { ASSERT_EQ(totalCnt, getTotals(hist).card); - const auto arrHist = ArrayHistogram::make( + const auto ceHist = CEHistogram::make( hist, 
stats::TypeCounts{{NumberInt64, intCnt}, {value::TypeTags::StringSmall, strCnt}}, totalCnt); ASSERT_APPROX_EQUAL(7.0, - estimateCardinalityEq(*arrHist, NumberInt64, 993, true).card, + estimateCardinalityEq(*ceHist, NumberInt64, 993, true).card, 0.1); // Actual: 9 auto [tag, value] = value::makeNewString("04e"_sd); value::ValueGuard vg(tag, value); ASSERT_APPROX_EQUAL( - 2.2, estimateCardinalityEq(*arrHist, tag, value, true).card, 0.1); // Actual: 3. + 2.2, estimateCardinalityEq(*ceHist, tag, value, true).card, 0.1); // Actual: 3. value::TypeTags lowTag = value::TypeTags::NumberInt64; value::Value lowVal = 100000000; ASSERT_APPROX_EQUAL(0.0, - estimateCardinalityEq(*arrHist, lowTag, lowVal, true).card, + estimateCardinalityEq(*ceHist, lowTag, lowVal, true).card, 0.1); // Actual: 0 // Query: [{$match: {a: {$lt: '04e'}}}]. - auto expectedCard = estimateCardinalityRange(*arrHist, + auto expectedCard = estimateCardinalityRange(*ceHist, false /* lowInclusive */, lowTag, lowVal, @@ -1190,8 +1186,7 @@ TEST(ArrayEstimatorTest, UniformIntStrHistogram) { ASSERT_CE_APPROX_EQUAL(13.3, expectedCard.card, 0.1); // Actual: 0. // Query: [{$match: {a: {$lte: '04e'}}}]. - expectedCard = - estimateCardinalityRange(*arrHist, false, lowTag, lowVal, true, tag, value, true); + expectedCard = estimateCardinalityRange(*ceHist, false, lowTag, lowVal, true, tag, value, true); ASSERT_CE_APPROX_EQUAL(15.5, expectedCard.card, 0.1); // Actual: 3. // Value towards the end of the bucket gets the same half bucket estimate. @@ -1199,16 +1194,15 @@ TEST(ArrayEstimatorTest, UniformIntStrHistogram) { // Query: [{$match: {a: {$lt: '8B5'}}}]. expectedCard = - estimateCardinalityRange(*arrHist, false, lowTag, lowVal, false, tag, value, true); + estimateCardinalityRange(*ceHist, false, lowTag, lowVal, false, tag, value, true); ASSERT_CE_APPROX_EQUAL(13.3, expectedCard.card, 0.1); // Actual: 24. // Query: [{$match: {a: {$lte: '8B5'}}}]. 
- expectedCard = - estimateCardinalityRange(*arrHist, false, lowTag, lowVal, true, tag, value, true); + expectedCard = estimateCardinalityRange(*ceHist, false, lowTag, lowVal, true, tag, value, true); ASSERT_CE_APPROX_EQUAL(15.5, expectedCard.card, 0.1); // Actual: 29. } -TEST(ArrayEstimatorInterpolationTest, UniformIntStrEstimate) { +TEST(CEHistogramEstimatorInterpolationTest, UniformIntStrEstimate) { // This hard-codes a maxdiff histogram with 20 buckets built off of a uniform distribution with // two types occurring with equal probability: // - 100 distinct ints between 0 and 1000, and @@ -1224,7 +1218,7 @@ TEST(ArrayEstimatorInterpolationTest, UniformIntStrEstimate) { {"MIb", 5, 45, 17}, {"Zgi", 3, 55, 22}, {"pZ", 6, 62, 25}, {"yUwxz", 5, 29, 12}, }}; const ScalarHistogram hist = createHistogram(data); - const auto arrHist = ArrayHistogram::make( + const auto ceHist = CEHistogram::make( hist, TypeCounts{{value::TypeTags::NumberInt64, numInt}, {value::TypeTags::StringSmall, numStr}}, collCard); @@ -1258,14 +1252,14 @@ TEST(ArrayEstimatorInterpolationTest, UniformIntStrEstimate) { // Type bracketing: low value of different type than the bucket bound. // Query: [{$match: {a: {$eq: 100000000}}}]. - expectedCard = estimateCardinalityEq(*arrHist, lowTag, lowVal, true /* includeScalar */); + expectedCard = estimateCardinalityEq(*ceHist, lowTag, lowVal, true /* includeScalar */); ASSERT_CE_APPROX_EQUAL(0.0, expectedCard.card, 0.1); // Actual: 0. // No interpolation for inequality to values inside the first string bucket, fallback to half of // the bucket frequency. // Query: [{$match: {a: {$lt: '04e'}}}]. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, false /* lowInclusive */, lowTag, lowVal, @@ -1276,7 +1270,7 @@ TEST(ArrayEstimatorInterpolationTest, UniformIntStrEstimate) { ASSERT_CE_APPROX_EQUAL(13.3, expectedCard.card, 0.1); // Actual: 0. // Query: [{$match: {a: {$lte: '04e'}}}]. 
- expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, false /* lowInclusive */, lowTag, lowVal, @@ -1290,7 +1284,7 @@ TEST(ArrayEstimatorInterpolationTest, UniformIntStrEstimate) { std::tie(tag, value) = value::makeNewString("8B5"_sd); // Query: [{$match: {a: {$lt: '8B5'}}}]. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, false /* lowInclusive */, lowTag, lowVal, @@ -1301,7 +1295,7 @@ TEST(ArrayEstimatorInterpolationTest, UniformIntStrEstimate) { ASSERT_CE_APPROX_EQUAL(13.3, expectedCard.card, 0.1); // Actual: 24. // Query: [{$match: {a: {$lte: '8B5'}}}]. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, false /* lowInclusive */, lowTag, lowVal, @@ -1312,7 +1306,7 @@ TEST(ArrayEstimatorInterpolationTest, UniformIntStrEstimate) { ASSERT_CE_APPROX_EQUAL(15.5, expectedCard.card, 0.1); // Actual: 29. } -TEST(ArrayEstimatorInterpolationTest, UniformIntArrayOnlyEstimate) { +TEST(CEHistogramEstimatorInterpolationTest, UniformIntArrayOnlyEstimate) { // This hard-codes a maxdiff histogram with 10 buckets built off of an array distribution with // arrays between 3 and 5 elements long, each containing 100 distinct ints uniformly distributed // between 0 and 1000. There are no scalar elements. @@ -1343,14 +1337,14 @@ TEST(ArrayEstimatorInterpolationTest, UniformIntArrayOnlyEstimate) { }}; const ScalarHistogram uniqueHist = createHistogram(uniqueData); - const auto arrHist = ArrayHistogram::make(scalarHist, - TypeCounts{{value::TypeTags::Array, 100}}, - uniqueHist, - minHist, - maxHist, - // There are 100 non-empty int-only arrays. - TypeCounts{{value::TypeTags::NumberInt64, 100}}, - 100.0 /* sampleSize */); + const auto ceHist = CEHistogram::make(scalarHist, + TypeCounts{{value::TypeTags::Array, 100}}, + uniqueHist, + minHist, + maxHist, + // There are 100 non-empty int-only arrays. 
+ TypeCounts{{value::TypeTags::NumberInt64, 100}}, + 100.0 /* sampleSize */); // Query in the middle of the domain: estimate from ArrayUnique histogram. value::TypeTags lowTag = value::TypeTags::NumberInt64; @@ -1360,14 +1354,14 @@ TEST(ArrayEstimatorInterpolationTest, UniformIntArrayOnlyEstimate) { // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}}]. auto expectedCard = - estimateCardinalityRange(*arrHist, false, lowTag, lowVal, false, highTag, highVal, false); + estimateCardinalityRange(*ceHist, false, lowTag, lowVal, false, highTag, highVal, false); ASSERT_CE_APPROX_EQUAL(27.0, expectedCard.card, 0.1); // actual 21. // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 600}}}]. // Note: although there are no scalars, the estimate is different than the // above since we use different formulas. expectedCard = - estimateCardinalityRange(*arrHist, false, lowTag, lowVal, false, highTag, highVal, true); + estimateCardinalityRange(*ceHist, false, lowTag, lowVal, false, highTag, highVal, true); ASSERT_CE_APPROX_EQUAL(92.0, expectedCard.card, 0.1); // actual 92. // Query at the end of the domain: more precise estimates from ArrayMin, ArrayMax histograms. @@ -1376,16 +1370,16 @@ TEST(ArrayEstimatorInterpolationTest, UniformIntArrayOnlyEstimate) { // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 10, $lt: 110}}}}]. expectedCard = - estimateCardinalityRange(*arrHist, false, lowTag, lowVal, false, highTag, highVal, false); + estimateCardinalityRange(*ceHist, false, lowTag, lowVal, false, highTag, highVal, false); ASSERT_CE_APPROX_EQUAL(24.1, expectedCard.card, 0.1); // actual 29. // Test interpolation for query: [{$match: {a: {$gt: 10, $lt: 110}}}]. expectedCard = - estimateCardinalityRange(*arrHist, false, lowTag, lowVal, false, highTag, highVal, true); + estimateCardinalityRange(*ceHist, false, lowTag, lowVal, false, highTag, highVal, true); ASSERT_CE_APPROX_EQUAL(27.8, expectedCard.card, 0.1); // actual 31. 
} -TEST(ArrayEstimatorInterpolationTest, UniformIntMixedArrayEstimate) { +TEST(CEHistogramEstimatorInterpolationTest, UniformIntMixedArrayEstimate) { // This hard-codes a maxdiff histogram with 20 buckets built off of a mixed distribution split // with equal probability between: // - an array distribution between 3 and 5 elements long, each containing 80 distinct ints @@ -1424,14 +1418,14 @@ TEST(ArrayEstimatorInterpolationTest, UniformIntMixedArrayEstimate) { const ScalarHistogram uniqueHist = createHistogram(uniqueData); TypeCounts typeCounts{{value::TypeTags::NumberInt64, 106}, {value::TypeTags::Array, 94}}; - const auto arrHist = ArrayHistogram::make(scalarHist, - typeCounts, - uniqueHist, - minHist, - maxHist, - // There are 94 non-empty int-only arrays. - TypeCounts{{value::TypeTags::NumberInt64, 94}}, - 200.0 /* sampleSize */); + const auto ceHist = CEHistogram::make(scalarHist, + typeCounts, + uniqueHist, + minHist, + maxHist, + // There are 94 non-empty int-only arrays. + TypeCounts{{value::TypeTags::NumberInt64, 94}}, + 200.0 /* sampleSize */); value::TypeTags lowTag = value::TypeTags::NumberInt64; value::Value lowVal = 500; @@ -1440,33 +1434,33 @@ TEST(ArrayEstimatorInterpolationTest, UniformIntMixedArrayEstimate) { // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 550}}}]. auto expectedCard = - estimateCardinalityRange(*arrHist, false, lowTag, lowVal, false, highTag, highVal, true); + estimateCardinalityRange(*ceHist, false, lowTag, lowVal, false, highTag, highVal, true); ASSERT_CE_APPROX_EQUAL(92.9, expectedCard.card, 0.1); // Actual: 94. // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 550}}}}]. expectedCard = - estimateCardinalityRange(*arrHist, false, lowTag, lowVal, false, highTag, highVal, false); + estimateCardinalityRange(*ceHist, false, lowTag, lowVal, false, highTag, highVal, false); ASSERT_CE_APPROX_EQUAL(11.0, expectedCard.card, 0.1); // Actual: 8. 
} -TEST(ArrayEstimatorEdgeCasesTest, TwoExclusiveBucketsMixedHistogram) { +TEST(CEHistogramEstimatorEdgeCasesTest, TwoExclusiveBucketsMixedHistogram) { // Data set of mixed data types: 3 integers and 5 strings. constexpr double numInts = 3.0; constexpr double numStrs = 5.0; constexpr double collCard = numInts + numStrs; std::vector data{{1, numInts, 0.0, 0.0}, {"abc", numStrs, 0.0, 0.0}}; const ScalarHistogram hist = createHistogram(data); - const auto arrHist = ArrayHistogram::make(hist, - TypeCounts{{value::TypeTags::NumberInt64, numInts}, - {value::TypeTags::StringSmall, numStrs}}, - collCard); + const auto ceHist = CEHistogram::make(hist, + TypeCounts{{value::TypeTags::NumberInt64, numInts}, + {value::TypeTags::StringSmall, numStrs}}, + collCard); const auto [tagLowDbl, valLowDbl] = std::make_pair(value::TypeTags::NumberDouble, value::bitcastFrom(std::numeric_limits::quiet_NaN())); // (NaN, 1). - auto expectedCard = estimateCardinalityRange(*arrHist, + auto expectedCard = estimateCardinalityRange(*ceHist, false, tagLowDbl, valLowDbl, @@ -1477,7 +1471,7 @@ TEST(ArrayEstimatorEdgeCasesTest, TwoExclusiveBucketsMixedHistogram) { ASSERT_CE_APPROX_EQUAL(0.0, expectedCard.card, kErrorBound); // (NaN, 5). - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, false, tagLowDbl, valLowDbl, @@ -1493,7 +1487,7 @@ TEST(ArrayEstimatorEdgeCasesTest, TwoExclusiveBucketsMixedHistogram) { value::ValueGuard vg(tag, value); // [0, ""). - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, true, value::TypeTags::NumberInt32, value::bitcastFrom(0), @@ -1505,24 +1499,24 @@ TEST(ArrayEstimatorEdgeCasesTest, TwoExclusiveBucketsMixedHistogram) { // ["", "a"]. 
expectedCard = - estimateCardinalityRange(*arrHist, true, tagLowStr, valLowStr, true, tag, value, true); + estimateCardinalityRange(*ceHist, true, tagLowStr, valLowStr, true, tag, value, true); ASSERT_CE_APPROX_EQUAL(0.0, expectedCard.card, kErrorBound); std::tie(tag, value) = value::makeNewString("xyz"_sd); // ["", "xyz"]. expectedCard = - estimateCardinalityRange(*arrHist, true, tagLowStr, valLowStr, true, tag, value, true); + estimateCardinalityRange(*ceHist, true, tagLowStr, valLowStr, true, tag, value, true); ASSERT_CE_APPROX_EQUAL(numStrs, expectedCard.card, kErrorBound); } -TEST(ArrayEstimatorEdgeCasesTest, TwoBucketsMixedHistogram) { +TEST(CEHistogramEstimatorEdgeCasesTest, TwoBucketsMixedHistogram) { // Data set of mixed data types: 20 integers and 80 strings. // Histogram with one bucket per data type. std::vector data{{100, 3.0, 17.0, 9.0}, {"pqr", 5.0, 75.0, 25.0}}; const ScalarHistogram hist = createHistogram(data); - const auto arrHist = ArrayHistogram::make( + const auto ceHist = CEHistogram::make( hist, TypeCounts{{value::TypeTags::NumberInt64, 20}, {value::TypeTags::StringSmall, 80}}, 100.0 /* sampleSize */); @@ -1586,7 +1580,7 @@ TEST(ArrayEstimatorEdgeCasesTest, TwoBucketsMixedHistogram) { std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom(1000000)); // [NaN, 25]. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, true, tagLowDbl, valLowDbl, @@ -1597,7 +1591,7 @@ TEST(ArrayEstimatorEdgeCasesTest, TwoBucketsMixedHistogram) { ASSERT_CE_APPROX_EQUAL(8.49, expectedCard.card, kErrorBound); // [25, 1000000]. - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, true, value::TypeTags::NumberInt32, value::bitcastFrom(25), @@ -1609,7 +1603,7 @@ TEST(ArrayEstimatorEdgeCasesTest, TwoBucketsMixedHistogram) { // [NaN, 1000000]. 
expectedCard = estimateCardinalityRange( - *arrHist, true, tagLowDbl, valLowDbl, true, tagHighInt, valHighInt, true); + *ceHist, true, tagLowDbl, valLowDbl, true, tagHighInt, valHighInt, true); ASSERT_CE_APPROX_EQUAL(20.0, expectedCard.card, kErrorBound); const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd); @@ -1617,11 +1611,11 @@ TEST(ArrayEstimatorEdgeCasesTest, TwoBucketsMixedHistogram) { // [NaN, ""). expectedCard = estimateCardinalityRange( - *arrHist, true, tagLowDbl, valLowDbl, false, tagLowStr, valLowStr, true); + *ceHist, true, tagLowDbl, valLowDbl, false, tagLowStr, valLowStr, true); ASSERT_CE_APPROX_EQUAL(20.0, expectedCard.card, kErrorBound); // [25, ""). - expectedCard = estimateCardinalityRange(*arrHist, + expectedCard = estimateCardinalityRange(*ceHist, true, value::TypeTags::NumberInt32, value::bitcastFrom(25), @@ -1633,7 +1627,7 @@ TEST(ArrayEstimatorEdgeCasesTest, TwoBucketsMixedHistogram) { // ["", "a"]. expectedCard = - estimateCardinalityRange(*arrHist, true, tagLowStr, valLowStr, true, tag, value, true); + estimateCardinalityRange(*ceHist, true, tagLowStr, valLowStr, true, tag, value, true); ASSERT_CE_APPROX_EQUAL(37.49, expectedCard.card, kErrorBound); @@ -1641,17 +1635,16 @@ TEST(ArrayEstimatorEdgeCasesTest, TwoBucketsMixedHistogram) { auto [tagObj, valObj] = value::makeNewObject(); value::ValueGuard vgObj(tagObj, valObj); expectedCard = - estimateCardinalityRange(*arrHist, true, tagLowStr, valLowStr, false, tagObj, valObj, true); + estimateCardinalityRange(*ceHist, true, tagLowStr, valLowStr, false, tagObj, valObj, true); ASSERT_CE_APPROX_EQUAL(80.0, expectedCard.card, kErrorBound); // ["a", {}). 
- expectedCard = - estimateCardinalityRange(*arrHist, true, tag, value, false, tagObj, valObj, true); + expectedCard = estimateCardinalityRange(*ceHist, true, tag, value, false, tagObj, valObj, true); ASSERT_CE_APPROX_EQUAL(45.5, expectedCard.card, kErrorBound); } -TEST(ArrayEstimatorDataTest, Histogram1000ArraysSmall10Buckets) { +TEST(CEHistogramEstimatorDataTest, Histogram1000ArraysSmall10Buckets) { std::vector scalarData{{}}; const ScalarHistogram scalarHist = createHistogram(scalarData); @@ -1701,7 +1694,7 @@ TEST(ArrayEstimatorDataTest, Histogram1000ArraysSmall10Buckets) { arrayTypeCounts.insert({value::TypeTags::NumberInt32, 1000}); constexpr double collCard = 1000.0; - const auto arrHist = ArrayHistogram::make( + const auto ceHist = CEHistogram::make( scalarHist, typeCounts, aUniqueHist, aMinHist, aMaxHist, arrayTypeCounts, collCard); std::vector querySet{{10, 20, 35.7, 93.0, 37.8, 39.0}, @@ -1713,7 +1706,7 @@ TEST(ArrayEstimatorDataTest, Histogram1000ArraysSmall10Buckets) { for (const auto q : querySet) { // $match query, includeScalar = true. - auto estCard = estimateCardinalityRange(*arrHist, + auto estCard = estimateCardinalityRange(*ceHist, false, value::TypeTags::NumberInt32, value::bitcastFrom(q.low), @@ -1724,7 +1717,7 @@ TEST(ArrayEstimatorDataTest, Histogram1000ArraysSmall10Buckets) { ASSERT_CE_APPROX_EQUAL(estCard.card, q.estMatch, 0.1); // $elemMatch query, includeScalar = false. 
- estCard = estimateCardinalityRange(*arrHist, + estCard = estimateCardinalityRange(*ceHist, false, value::TypeTags::NumberInt32, value::bitcastFrom(q.low), @@ -1744,7 +1737,7 @@ TEST(ArrayEstimatorDataTest, Histogram1000ArraysSmall10Buckets) { } } -TEST(ArrayEstimatorDataTest, Histogram1000ArraysLarge10Buckets) { +TEST(CEHistogramEstimatorDataTest, Histogram1000ArraysLarge10Buckets) { std::vector scalarData{{}}; const ScalarHistogram scalarHist = createHistogram(scalarData); @@ -1794,7 +1787,7 @@ TEST(ArrayEstimatorDataTest, Histogram1000ArraysLarge10Buckets) { arrayTypeCounts.insert({value::TypeTags::NumberInt32, 1000}); constexpr double collCard = 1000.0; - const auto arrHist = ArrayHistogram::make( + const auto ceHist = CEHistogram::make( scalarHist, typeCounts, aUniqueHist, aMinHist, aMaxHist, arrayTypeCounts, collCard); std::vector querySet{{10, 20, 13.7, 39.0, 9.7, 26.0}, @@ -1806,7 +1799,7 @@ TEST(ArrayEstimatorDataTest, Histogram1000ArraysLarge10Buckets) { for (const auto q : querySet) { // $match query, includeScalar = true. - auto estCard = estimateCardinalityRange(*arrHist, + auto estCard = estimateCardinalityRange(*ceHist, false, value::TypeTags::NumberInt32, value::bitcastFrom(q.low), @@ -1817,7 +1810,7 @@ TEST(ArrayEstimatorDataTest, Histogram1000ArraysLarge10Buckets) { ASSERT_CE_APPROX_EQUAL(estCard.card, q.estMatch, 0.1); // $elemMatch query, includeScalar = false. 
- estCard = estimateCardinalityRange(*arrHist, + estCard = estimateCardinalityRange(*ceHist, false, value::TypeTags::NumberInt32, value::bitcastFrom(q.low), diff --git a/src/mongo/db/query/ce/histogram_estimator.cpp b/src/mongo/db/query/ce/histogram_estimator.cpp index 5951eddbd273c..df27f3eceeeb8 100644 --- a/src/mongo/db/query/ce/histogram_estimator.cpp +++ b/src/mongo/db/query/ce/histogram_estimator.cpp @@ -31,7 +31,7 @@ namespace mongo::ce { -Cardinality HistogramEstimator::estimateCardinality(const stats::ArrayHistogram& hist, +Cardinality HistogramEstimator::estimateCardinality(const stats::CEHistogram& hist, const Cardinality collectionSize, const mongo::Interval& interval, bool includeScalar) { @@ -40,7 +40,7 @@ Cardinality HistogramEstimator::estimateCardinality(const stats::ArrayHistogram& collectionSize; } -bool HistogramEstimator::canEstimateInterval(const stats::ArrayHistogram& hist, +bool HistogramEstimator::canEstimateInterval(const stats::CEHistogram& hist, const mongo::Interval& interval, bool includeScalar) { diff --git a/src/mongo/db/query/ce/histogram_estimator.h b/src/mongo/db/query/ce/histogram_estimator.h index 4f7feed9381d8..5eb6cbd47b295 100644 --- a/src/mongo/db/query/ce/histogram_estimator.h +++ b/src/mongo/db/query/ce/histogram_estimator.h @@ -41,7 +41,7 @@ class HistogramEstimator { * 'inputScalar' indicates whether or not the provided interval should include non-array values. * e.g., $elemMatch should exclude the non-array values when 'includeScalar' is set to false. */ - static Cardinality estimateCardinality(const stats::ArrayHistogram& hist, + static Cardinality estimateCardinality(const stats::CEHistogram& hist, Cardinality collectionSize, const mongo::Interval& interval, bool includeScalar); @@ -49,7 +49,7 @@ class HistogramEstimator { /** * Checks if given interval can be estimated. 
*/ - static bool canEstimateInterval(const stats::ArrayHistogram& hist, + static bool canEstimateInterval(const stats::CEHistogram& hist, const mongo::Interval& interval, bool includeScalar); }; diff --git a/src/mongo/db/query/ce/histogram_estimator_test.cpp b/src/mongo/db/query/ce/histogram_estimator_test.cpp index 4e344a91a1e5e..b84fa5d2b5d53 100644 --- a/src/mongo/db/query/ce/histogram_estimator_test.cpp +++ b/src/mongo/db/query/ce/histogram_estimator_test.cpp @@ -36,7 +36,7 @@ namespace { namespace value = sbe::value; using mongo::Interval; -using stats::ArrayHistogram; +using stats::CEHistogram; using stats::ScalarHistogram; using stats::TypeCounts; @@ -51,21 +51,21 @@ TEST(HistogramPredicateEstimationTest, IntHistogramIntervalEstimation) { {50, 1.0, 10.0, 5.0}}; const Cardinality intCnt = 55; const ScalarHistogram hist = createHistogram(data); - const auto arrHist = ArrayHistogram::make(hist, TypeCounts{{NumberInt64, intCnt}}, intCnt); + const auto ceHist = CEHistogram::make(hist, TypeCounts{{NumberInt64, intCnt}}, intCnt); { // {a: 20} Interval interval(BSON("" << 20 << "" << 20), true /*startIncluded*/, true /*endIncluded*/); auto estimatedCard = - estimateCardinalityEq(*arrHist, NumberInt64, 20, true /*includeScalar*/).card; + estimateCardinalityEq(*ceHist, NumberInt64, 20, true /*includeScalar*/).card; ASSERT_EQ(3.0, estimatedCard); ASSERT_CE_APPROX_EQUAL(estimatedCard, - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } { // {a: {$gte: 20, $lte: 30}} Interval interval(BSON("" << 20 << "" << 30), true, true); - auto estimatedCard = estimateCardinalityRange(*arrHist, + auto estimatedCard = estimateCardinalityRange(*ceHist, true /*lowInclusive*/, NumberInt64, 20, @@ -76,13 +76,13 @@ TEST(HistogramPredicateEstimationTest, IntHistogramIntervalEstimation) { .card; ASSERT_EQ(14.0, estimatedCard); ASSERT_CE_APPROX_EQUAL(estimatedCard, - estimateIntervalCardinality(*arrHist, 
interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } { // {a: {$gte: 20, $lte: 25}}, bucket interpolation. Interval interval(BSON("" << 20 << "" << 25), true /*startIncluded*/, true /*endIncluded*/); - auto estimatedCard = estimateCardinalityRange(*arrHist, + auto estimatedCard = estimateCardinalityRange(*ceHist, true /*lowInclusive*/, NumberInt64, 20, @@ -93,7 +93,7 @@ TEST(HistogramPredicateEstimationTest, IntHistogramIntervalEstimation) { .card; ASSERT_EQ(8.0, estimatedCard); ASSERT_CE_APPROX_EQUAL(estimatedCard, - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } @@ -101,7 +101,7 @@ TEST(HistogramPredicateEstimationTest, IntHistogramIntervalEstimation) { Interval interval(BSON("" << 30 << "" << 40), true /*startIncluded*/, true /*endIncluded*/); ASSERT_EQ(3.0, HistogramEstimator::estimateCardinality( - *arrHist, intCnt, interval, true /*includeScalar*/)); + *ceHist, intCnt, interval, true /*includeScalar*/)); } } @@ -121,7 +121,7 @@ TEST(HistogramPredicateEstimationTest, StrHistogramIntervalEstimation) { const Cardinality strCnt = 100; const ScalarHistogram& hist = createHistogram(data); - const auto arrHist = ArrayHistogram::make( + const auto ceHist = CEHistogram::make( hist, stats::TypeCounts{{sbe::value::TypeTags::StringSmall, strCnt}}, strCnt); auto [tagLow, valLow] = value::makeNewString("TTV"_sd); @@ -135,10 +135,10 @@ TEST(HistogramPredicateEstimationTest, StrHistogramIntervalEstimation) { true, true); auto estimatedCard = - estimateCardinalityEq(*arrHist, tagLow, valLow, true /*includeScalar*/).card; + estimateCardinalityEq(*ceHist, tagLow, valLow, true /*includeScalar*/).card; ASSERT_EQ(5.0, estimatedCard); ASSERT_CE_APPROX_EQUAL(estimatedCard, - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } @@ -151,7 +151,7 @@ TEST(HistogramPredicateEstimationTest, 
StrHistogramIntervalEstimation) { << "YtzS"), true, true); - auto estimatedCard = estimateCardinalityRange(*arrHist, + auto estimatedCard = estimateCardinalityRange(*ceHist, true /*lowInclusive*/, tagLow, valLow, @@ -162,7 +162,7 @@ TEST(HistogramPredicateEstimationTest, StrHistogramIntervalEstimation) { .card; ASSERT_EQ(10.0, estimatedCard); ASSERT_CE_APPROX_EQUAL(estimatedCard, - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } @@ -175,7 +175,7 @@ TEST(HistogramPredicateEstimationTest, StrHistogramIntervalEstimation) { << "VtzSlajdkajda"), true, true); - auto estimatedCard = estimateCardinalityRange(*arrHist, + auto estimatedCard = estimateCardinalityRange(*ceHist, true /*lowInclusive*/, tagLow, valLow, @@ -186,7 +186,7 @@ TEST(HistogramPredicateEstimationTest, StrHistogramIntervalEstimation) { .card; ASSERT_CE_APPROX_EQUAL(6.244, estimatedCard, 0.001); ASSERT_CE_APPROX_EQUAL(estimatedCard, - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } @@ -199,7 +199,7 @@ TEST(HistogramPredicateEstimationTest, StrHistogramIntervalEstimation) { true); ASSERT_EQ(34.0, HistogramEstimator::estimateCardinality( - *arrHist, strCnt, interval, true /*includeScalar*/)); + *ceHist, strCnt, interval, true /*includeScalar*/)); } } @@ -218,7 +218,7 @@ TEST(HistogramPredicateEstimationTest, IntStrHistogramIntervalEstimation) { ASSERT_EQ(totalCnt, getTotals(hist).card); - const auto arrHist = ArrayHistogram::make( + const auto ceHist = CEHistogram::make( hist, stats::TypeCounts{{NumberInt64, intCnt}, {sbe::value::TypeTags::StringSmall, strCnt}}, totalCnt); @@ -226,11 +226,11 @@ TEST(HistogramPredicateEstimationTest, IntStrHistogramIntervalEstimation) { { // {a: 993} Interval interval(BSON("" << 993 << "" << 993), true, true); auto estimatedCard = - estimateCardinalityEq(*arrHist, NumberInt64, 993, true /*includeScalar*/).card; + 
estimateCardinalityEq(*ceHist, NumberInt64, 993, true /*includeScalar*/).card; ASSERT_APPROX_EQUAL(7.0, estimatedCard, 0.1); // Actual: 9 ASSERT_CE_APPROX_EQUAL(estimatedCard, - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } @@ -244,10 +244,10 @@ TEST(HistogramPredicateEstimationTest, IntStrHistogramIntervalEstimation) { true /*startIncluded*/, true /*endIncluded*/); auto estimatedCard = - estimateCardinalityEq(*arrHist, tag, value, true /*includeScalar*/).card; + estimateCardinalityEq(*ceHist, tag, value, true /*includeScalar*/).card; ASSERT_APPROX_EQUAL(2.2, estimatedCard, 0.1); // Actual: 3. ASSERT_CE_APPROX_EQUAL(estimatedCard, - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } @@ -257,11 +257,11 @@ TEST(HistogramPredicateEstimationTest, IntStrHistogramIntervalEstimation) { Interval interval( BSON("" << 100000000 << "" << 100000000), true /*startIncluded*/, true /*endIncluded*/); auto estimatedCard = - estimateCardinalityEq(*arrHist, tagLow, valLow, true /*includeScalar*/).card; + estimateCardinalityEq(*ceHist, tagLow, valLow, true /*includeScalar*/).card; ASSERT_APPROX_EQUAL(0.0, estimatedCard, 0.1); // Actual: 0 ASSERT_CE_APPROX_EQUAL(estimatedCard, - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } @@ -276,7 +276,7 @@ TEST(HistogramPredicateEstimationTest, IntStrHistogramIntervalEstimation) { << "04e"), true, false); - auto estimatedCard = estimateCardinalityRange(*arrHist, + auto estimatedCard = estimateCardinalityRange(*ceHist, false /* lowInclusive */, tagLow, valLow, @@ -287,7 +287,7 @@ TEST(HistogramPredicateEstimationTest, IntStrHistogramIntervalEstimation) { .card; ASSERT_CE_APPROX_EQUAL(13.3, estimatedCard, 0.1); // Actual: 0. 
ASSERT_CE_APPROX_EQUAL(estimatedCard, - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } } @@ -326,14 +326,14 @@ TEST(HistogramPredicateEstimationTest, IntArrayOnlyIntervalEstimate) { const ScalarHistogram uniqueHist = createHistogram(uniqueData); - const auto arrHist = ArrayHistogram::make(scalarHist, - TypeCounts{{value::TypeTags::Array, 100}}, - uniqueHist, - minHist, - maxHist, - // There are 100 non-empty int-only arrays. - TypeCounts{{value::TypeTags::NumberInt64, 100}}, - totalCnt /* sampleSize */); + const auto ceHist = CEHistogram::make(scalarHist, + TypeCounts{{value::TypeTags::Array, 100}}, + uniqueHist, + minHist, + maxHist, + // There are 100 non-empty int-only arrays. + TypeCounts{{value::TypeTags::NumberInt64, 100}}, + totalCnt /* sampleSize */); { // {$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}} value::TypeTags tagLow = NumberInt64; @@ -341,7 +341,7 @@ TEST(HistogramPredicateEstimationTest, IntArrayOnlyIntervalEstimate) { value::TypeTags tagHigh = NumberInt64; value::Value valHigh = 600; Interval interval(BSON("" << 500 << "" << 600), false, false /*endIncluded*/); - auto estimatedCard = estimateCardinalityRange(*arrHist, + auto estimatedCard = estimateCardinalityRange(*ceHist, false /*lowInclusive*/, tagLow, valLow, @@ -353,7 +353,7 @@ TEST(HistogramPredicateEstimationTest, IntArrayOnlyIntervalEstimate) { ASSERT_CE_APPROX_EQUAL(27.0, estimatedCard, 0.1); // actual 21. 
ASSERT_CE_APPROX_EQUAL( estimatedCard, - estimateIntervalCardinality(*arrHist, interval, false /*includeScalar*/), + estimateIntervalCardinality(*ceHist, interval, false /*includeScalar*/), 0.001 /* rounding error */); } @@ -362,7 +362,7 @@ TEST(HistogramPredicateEstimationTest, IntArrayOnlyIntervalEstimate) { BSON("" << 10 << "" << 110), false /*startIncluded*/, false /*endIncluded*/); ASSERT_CE_APPROX_EQUAL(24.1, HistogramEstimator::estimateCardinality( - *arrHist, totalCnt, interval, false /*includeScalar*/), + *ceHist, totalCnt, interval, false /*includeScalar*/), 0.1 /* rounding error*/); } } @@ -382,24 +382,24 @@ DEATH_TEST(HistogramPredicateEstimationTest, const Cardinality totalCnt = 100; const ScalarHistogram& hist = createHistogram(data); - const auto arrHist = ArrayHistogram::make(hist, - stats::TypeCounts{{value::TypeTags::NumberInt64, 30}, - {value::TypeTags::Timestamp, 25}, - {value::TypeTags::Boolean, 25}, - {value::TypeTags::Nothing, 5}, - {value::TypeTags::Object, 15}}, - totalCnt, - 5, - 20); + const auto ceHist = CEHistogram::make(hist, + stats::TypeCounts{{value::TypeTags::NumberInt64, 30}, + {value::TypeTags::Timestamp, 25}, + {value::TypeTags::Boolean, 25}, + {value::TypeTags::Nothing, 5}, + {value::TypeTags::Object, 15}}, + totalCnt, + 5, + 20); { // check estimation for sbe::value::TypeTags::Boolean Interval interval( BSON("" << true << "" << true), true /*startIncluded*/, true /*endIncluded*/); ASSERT_EQ( true, - HistogramEstimator::canEstimateInterval(*arrHist, interval, true /*includeScalar*/)); + HistogramEstimator::canEstimateInterval(*ceHist, interval, true /*includeScalar*/)); ASSERT_CE_APPROX_EQUAL(5, /*estimatedCard */ - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } @@ -408,9 +408,9 @@ DEATH_TEST(HistogramPredicateEstimationTest, BSON("" << BSONNULL << "" << BSONNULL), true /*startIncluded*/, true /*endIncluded*/); ASSERT_EQ( true, - 
HistogramEstimator::canEstimateInterval(*arrHist, interval, true /*includeScalar*/)); + HistogramEstimator::canEstimateInterval(*ceHist, interval, true /*includeScalar*/)); ASSERT_CE_APPROX_EQUAL(5, /*estimatedCard ,*/ - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } @@ -419,9 +419,9 @@ DEATH_TEST(HistogramPredicateEstimationTest, BSON("" << startTs << "" << endTs), true /*startIncluded*/, true /*endIncluded*/); ASSERT_EQ( true, - HistogramEstimator::canEstimateInterval(*arrHist, interval, true /*includeScalar*/)); + HistogramEstimator::canEstimateInterval(*ceHist, interval, true /*includeScalar*/)); ASSERT_CE_APPROX_EQUAL(25, /*estimatedCard ,*/ - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } @@ -435,8 +435,8 @@ DEATH_TEST(HistogramPredicateEstimationTest, true /*endIncluded*/); ASSERT_EQ( false, - HistogramEstimator::canEstimateInterval(*arrHist, interval, true /*includeScalar*/)); - ASSERT_THROWS_CODE(estimateIntervalCardinality(*arrHist, interval), DBException, 9163900); + HistogramEstimator::canEstimateInterval(*ceHist, interval, true /*includeScalar*/)); + ASSERT_THROWS_CODE(estimateIntervalCardinality(*ceHist, interval), DBException, 9163900); } { // check estimation for [Null, true] @@ -444,9 +444,9 @@ DEATH_TEST(HistogramPredicateEstimationTest, BSON("" << BSONNULL << "" << true), true /*startIncluded*/, true /*endIncluded*/); ASSERT_EQ( true, - HistogramEstimator::canEstimateInterval(*arrHist, interval, true /*includeScalar*/)); + HistogramEstimator::canEstimateInterval(*ceHist, interval, true /*includeScalar*/)); ASSERT_CE_APPROX_EQUAL(75, /*estimatedCard ,*/ - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } @@ -455,9 +455,9 @@ DEATH_TEST(HistogramPredicateEstimationTest, BSON("" << false << "" << endTs), true 
/*startIncluded*/, true /*endIncluded*/); ASSERT_EQ( true, - HistogramEstimator::canEstimateInterval(*arrHist, interval, true /*includeScalar*/)); + HistogramEstimator::canEstimateInterval(*ceHist, interval, true /*includeScalar*/)); ASSERT_CE_APPROX_EQUAL(50, /*estimatedCard ,*/ - estimateIntervalCardinality(*arrHist, interval), + estimateIntervalCardinality(*ceHist, interval), 0.001 /* rounding error */); } } diff --git a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp index 2b52dec03c137..62d8a2977bf8f 100644 --- a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp +++ b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp @@ -322,8 +322,8 @@ TEST_F(HistogramTest, MaxDiffIntArrays) { auto rawData = genFixedValueArray(nElems, 1.0, 0.0); auto arrayData = nestArrays(rawData, 0 /* No empty arrays */); - auto estimator = createArrayEstimator(arrayData, nBuckets, stats::SortArg::kArea); - auto estimatorAreaDiff = createArrayEstimator(arrayData, nBuckets); + auto estimator = createCEHistogram(arrayData, nBuckets, stats::SortArg::kArea); + auto estimatorAreaDiff = createCEHistogram(arrayData, nBuckets); auto opCtx = makeOperationContext(); @@ -417,10 +417,10 @@ TEST_F(HistogramTest, MaxDiffEmptyArrays) { "nElems"_attr = nElems, "arrayData"_attr = printValueArray(arrayData)); - const auto arrayHist = createArrayEstimator(arrayData, nBuckets, stats::SortArg::kAreaDiff); - const auto arrayHistAreaDiff = createArrayEstimator(arrayData, nBuckets); + const auto ceHist = createCEHistogram(arrayData, nBuckets, stats::SortArg::kAreaDiff); + const auto ceHistAreaDiff = createCEHistogram(arrayData, nBuckets); - const auto histograms = {arrayHist, arrayHistAreaDiff}; + const auto histograms = {ceHist, ceHistAreaDiff}; std::for_each(histograms.begin(), histograms.end(), [emptyArrayCount](auto&& histogram) { ASSERT_EQ(histogram->getEmptyArrayCount(), emptyArrayCount); diff --git a/src/mongo/db/query/stats/BUILD.bazel 
b/src/mongo/db/query/stats/BUILD.bazel index d3cf43f7ab892..b66a6b1ff38b3 100644 --- a/src/mongo/db/query/stats/BUILD.bazel +++ b/src/mongo/db/query/stats/BUILD.bazel @@ -20,13 +20,13 @@ idl_generator( mongo_cc_library( name = "stats_histograms", srcs = [ - "array_histogram.cpp", + "ce_histogram.cpp", "scalar_histogram.cpp", "value_utils.cpp", ":stats_gen_for_histograms", ], hdrs = [ - "array_histogram.h", + "ce_histogram.h", "scalar_histogram.h", "value_utils.h", ], diff --git a/src/mongo/db/query/stats/SConscript b/src/mongo/db/query/stats/SConscript index f21c7871ec2b1..1f6c085853479 100644 --- a/src/mongo/db/query/stats/SConscript +++ b/src/mongo/db/query/stats/SConscript @@ -108,9 +108,9 @@ env.CppUnitTest( ) env.CppUnitTest( - target="array_histogram_test", + target="ce_histogram_test", source=[ - "array_histogram_test.cpp", + "ce_histogram_test.cpp", ], LIBDEPS=[ "stats_test_utils", diff --git a/src/mongo/db/query/stats/array_histogram.cpp b/src/mongo/db/query/stats/ce_histogram.cpp similarity index 77% rename from src/mongo/db/query/stats/array_histogram.cpp rename to src/mongo/db/query/stats/ce_histogram.cpp index 32ede0e11f2da..0a20c7a272a90 100644 --- a/src/mongo/db/query/stats/array_histogram.cpp +++ b/src/mongo/db/query/stats/ce_histogram.cpp @@ -27,7 +27,7 @@ * it in the license file. */ -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include #include @@ -316,7 +316,7 @@ void validate(const ScalarHistogram& scalar, std::less_equal()); } else if (numArrays > 0) { - uasserted(7131000, "A scalar ArrayHistogram should not have any arrays in its counters."); + uasserted(7131000, "A scalar CEHistogram should not have any arrays in its counters."); } // Validate boolean counters. @@ -368,20 +368,20 @@ double getTotalCount(const TypeCounts& tc, boost::optional isHistogrammabl return total; } -ArrayHistogram::ArrayHistogram() - : ArrayHistogram(ScalarHistogram::make(), {} /* Type counts. 
*/, 0.0 /* Sample size. */) {} - -ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, - TypeCounts typeCounts, - ScalarHistogram arrayUnique, - ScalarHistogram arrayMin, - ScalarHistogram arrayMax, - TypeCounts arrayTypeCounts, - double sampleSize, - double emptyArrayCount, - double trueCount, - double falseCount, - double nanCount) +CEHistogram::CEHistogram() + : CEHistogram(ScalarHistogram::make(), {} /* Type counts. */, 0.0 /* Sample size. */) {} + +CEHistogram::CEHistogram(ScalarHistogram scalar, + TypeCounts typeCounts, + ScalarHistogram arrayUnique, + ScalarHistogram arrayMin, + ScalarHistogram arrayMax, + TypeCounts arrayTypeCounts, + double sampleSize, + double emptyArrayCount, + double trueCount, + double falseCount, + double nanCount) : _scalar(std::move(scalar)), _typeCounts(std::move(typeCounts)), _emptyArrayCount(emptyArrayCount), @@ -394,12 +394,12 @@ ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, _arrayMax(std::move(arrayMax)), _arrayTypeCounts(std::move(arrayTypeCounts)) {} -ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, - TypeCounts typeCounts, - double sampleSize, - double trueCount, - double falseCount, - double nanCount) +CEHistogram::CEHistogram(ScalarHistogram scalar, + TypeCounts typeCounts, + double sampleSize, + double trueCount, + double falseCount, + double nanCount) : _scalar(std::move(scalar)), _typeCounts(std::move(typeCounts)), _emptyArrayCount(0.0), @@ -412,37 +412,37 @@ ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, _arrayMax(boost::none), _arrayTypeCounts(boost::none) {} -std::shared_ptr ArrayHistogram::make() { +std::shared_ptr CEHistogram::make() { // No need to validate an empty histogram. 
- return std::shared_ptr(new ArrayHistogram()); + return std::shared_ptr(new CEHistogram()); } -std::shared_ptr ArrayHistogram::make(ScalarHistogram scalar, - TypeCounts typeCounts, - double sampleSize, - double trueCount, - double falseCount, - double nanCount, - bool doValidation) { +std::shared_ptr CEHistogram::make(ScalarHistogram scalar, + TypeCounts typeCounts, + double sampleSize, + double trueCount, + double falseCount, + double nanCount, + bool doValidation) { if (doValidation) { validate(scalar, typeCounts, boost::none, sampleSize, trueCount, falseCount, nanCount); } - return std::shared_ptr(new ArrayHistogram( + return std::shared_ptr(new CEHistogram( std::move(scalar), std::move(typeCounts), sampleSize, trueCount, falseCount, nanCount)); } -std::shared_ptr ArrayHistogram::make(ScalarHistogram scalar, - TypeCounts typeCounts, - ScalarHistogram arrayUnique, - ScalarHistogram arrayMin, - ScalarHistogram arrayMax, - TypeCounts arrayTypeCounts, - double sampleSize, - double emptyArrayCount, - double trueCount, - double falseCount, - double nanCount, - bool doValidation) { +std::shared_ptr CEHistogram::make(ScalarHistogram scalar, + TypeCounts typeCounts, + ScalarHistogram arrayUnique, + ScalarHistogram arrayMin, + ScalarHistogram arrayMax, + TypeCounts arrayTypeCounts, + double sampleSize, + double emptyArrayCount, + double trueCount, + double falseCount, + double nanCount, + bool doValidation) { if (doValidation) { validate(scalar, typeCounts, @@ -452,20 +452,20 @@ std::shared_ptr ArrayHistogram::make(ScalarHistogram scala falseCount, nanCount); } - return std::shared_ptr(new ArrayHistogram(std::move(scalar), - std::move(typeCounts), - std::move(arrayUnique), - std::move(arrayMin), - std::move(arrayMax), - std::move(arrayTypeCounts), - sampleSize, - emptyArrayCount, - trueCount, - falseCount, - nanCount)); + return std::shared_ptr(new CEHistogram(std::move(scalar), + std::move(typeCounts), + std::move(arrayUnique), + std::move(arrayMin), + 
std::move(arrayMax), + std::move(arrayTypeCounts), + sampleSize, + emptyArrayCount, + trueCount, + falseCount, + nanCount)); } -std::shared_ptr ArrayHistogram::make(Statistics stats) { +std::shared_ptr CEHistogram::make(Statistics stats) { // Note that we don't run validation when loading a histogram from the Statistics collection // because we already validated this histogram before inserting it. const auto scalar = ScalarHistogram::make(stats.getScalarHistogram()); @@ -477,27 +477,27 @@ std::shared_ptr ArrayHistogram::make(Statistics stats) { // If we have ArrayStatistics, we will need to initialize the array-only fields. if (auto maybeArrayStats = stats.getArrayStatistics(); maybeArrayStats) { - return std::shared_ptr( - new ArrayHistogram(std::move(scalar), - std::move(typeCounts), - ScalarHistogram::make(maybeArrayStats->getUniqueHistogram()), - ScalarHistogram::make(maybeArrayStats->getMinHistogram()), - ScalarHistogram::make(maybeArrayStats->getMaxHistogram()), - mapStatsTypeCountToTypeCounts(maybeArrayStats->getTypeCount()), - sampleSize, - stats.getEmptyArrayCount(), - trueCount, - falseCount, - nanCount)); + return std::shared_ptr( + new CEHistogram(std::move(scalar), + std::move(typeCounts), + ScalarHistogram::make(maybeArrayStats->getUniqueHistogram()), + ScalarHistogram::make(maybeArrayStats->getMinHistogram()), + ScalarHistogram::make(maybeArrayStats->getMaxHistogram()), + mapStatsTypeCountToTypeCounts(maybeArrayStats->getTypeCount()), + sampleSize, + stats.getEmptyArrayCount(), + trueCount, + falseCount, + nanCount)); } // If we don't have ArrayStatistics available, we should construct a histogram with only scalar // fields. 
- return std::shared_ptr(new ArrayHistogram( + return std::shared_ptr(new CEHistogram( std::move(scalar), std::move(typeCounts), sampleSize, trueCount, falseCount, nanCount)); } -bool ArrayHistogram::isArray() const { +bool CEHistogram::isArray() const { return _arrayUnique && _arrayMin && _arrayMax && _arrayTypeCounts; } @@ -515,7 +515,7 @@ std::string typeCountsToString(const TypeCounts& typeCounts) { return os.str(); } -std::string ArrayHistogram::toString() const { +std::string CEHistogram::toString() const { std::ostringstream os; os << "{\n"; os << " scalar: " << _scalar.toString(); @@ -530,61 +530,61 @@ std::string ArrayHistogram::toString() const { return os.str(); } -const ScalarHistogram& ArrayHistogram::getScalar() const { +const ScalarHistogram& CEHistogram::getScalar() const { return _scalar; } -const ScalarHistogram& ArrayHistogram::getArrayUnique() const { - tassert(7131002, "Only an array ArrayHistogram has a unique histogram.", isArray()); +const ScalarHistogram& CEHistogram::getArrayUnique() const { + tassert(7131002, "Only an array CEHistogram has a unique histogram.", isArray()); return *_arrayUnique; } -const ScalarHistogram& ArrayHistogram::getArrayMin() const { - tassert(7131003, "Only an array ArrayHistogram has a min histogram.", isArray()); +const ScalarHistogram& CEHistogram::getArrayMin() const { + tassert(7131003, "Only an array CEHistogram has a min histogram.", isArray()); return *_arrayMin; } -const ScalarHistogram& ArrayHistogram::getArrayMax() const { - tassert(7131004, "Only an array ArrayHistogram has a max histogram.", isArray()); +const ScalarHistogram& CEHistogram::getArrayMax() const { + tassert(7131004, "Only an array CEHistogram has a max histogram.", isArray()); return *_arrayMax; } -const TypeCounts& ArrayHistogram::getTypeCounts() const { +const TypeCounts& CEHistogram::getTypeCounts() const { return _typeCounts; } -const TypeCounts& ArrayHistogram::getArrayTypeCounts() const { - tassert(7131005, "Only an array 
ArrayHistogram has array type counts.", isArray()); +const TypeCounts& CEHistogram::getArrayTypeCounts() const { + tassert(7131005, "Only an array CEHistogram has array type counts.", isArray()); return *_arrayTypeCounts; } -double ArrayHistogram::getArrayCount() const { +double CEHistogram::getArrayCount() const { if (isArray()) { double arrayCount = getTypeCount(sbe::value::TypeTags::Array); uassert( - 6979503, "Histogram with array data must have at least one array.", arrayCount > 0.0); + 6979503, "CEHistogram with array data must have at least one array.", arrayCount > 0.0); return arrayCount; } return 0.0; } -double ArrayHistogram::getTypeCount(sbe::value::TypeTags tag) const { +double CEHistogram::getTypeCount(sbe::value::TypeTags tag) const { return getTagTypeCount(getTypeCounts(), tag); } -double ArrayHistogram::getArrayTypeCount(sbe::value::TypeTags tag) const { +double CEHistogram::getArrayTypeCount(sbe::value::TypeTags tag) const { return getTagTypeCount(getArrayTypeCounts(), tag); } -double ArrayHistogram::getTotalTypeCount() const { +double CEHistogram::getTotalTypeCount() const { return getTotalCount(getTypeCounts()); } -double ArrayHistogram::getTotalArrayTypeCount() const { +double CEHistogram::getTotalArrayTypeCount() const { return getTotalCount(getArrayTypeCounts()); } -BSONObj ArrayHistogram::serialize() const { +BSONObj CEHistogram::serialize() const { BSONObjBuilder histogramBuilder; // Serialize boolean type counters. 
@@ -618,11 +618,11 @@ BSONObj ArrayHistogram::serialize() const { BSONObj makeStatistics(double documents, double sampleRate, - const std::shared_ptr arrayHistogram) { + const std::shared_ptr ceHistogram) { BSONObjBuilder builder; builder.appendNumber("documents", documents); builder.appendNumber("sampleRate", sampleRate); - builder.appendElements(arrayHistogram->serialize()); + builder.appendElements(ceHistogram->serialize()); builder.doneFast(); return builder.obj(); } @@ -630,10 +630,10 @@ BSONObj makeStatistics(double documents, BSONObj makeStatsPath(StringData path, double documents, double sampleRate, - const std::shared_ptr arrayHistogram) { + const std::shared_ptr ceHistogram) { BSONObjBuilder builder; builder.append("_id", path); - builder.append("statistics", makeStatistics(documents, sampleRate, arrayHistogram)); + builder.append("statistics", makeStatistics(documents, sampleRate, ceHistogram)); builder.doneFast(); return builder.obj(); } diff --git a/src/mongo/db/query/stats/array_histogram.h b/src/mongo/db/query/stats/ce_histogram.h similarity index 67% rename from src/mongo/db/query/stats/array_histogram.h rename to src/mongo/db/query/stats/ce_histogram.h index a8fe07dcf80fa..0293a6d2c3c4d 100644 --- a/src/mongo/db/query/stats/array_histogram.h +++ b/src/mongo/db/query/stats/ce_histogram.h @@ -52,54 +52,54 @@ using TypeCounts = std::map; **/ double getTotalCount(const TypeCounts& tc, boost::optional isHistogrammable = boost::none); -class ArrayHistogram { +class CEHistogram { public: /** * Factory method for constructing an empty scalar histogram. */ - static std::shared_ptr make(); + static std::shared_ptr make(); /** - * Factory method for constructing an ArrayHistogram using StatsPath IDL as input. + * Factory method for constructing a CEHistogram using StatsPath IDL as input.
*/ - static std::shared_ptr make(Statistics stats); + static std::shared_ptr make(Statistics stats); /** * Factory method for constructing a scalar histogram (no array fields). */ - static std::shared_ptr make(ScalarHistogram scalar, - TypeCounts typeCounts, - double sampleSize, - double trueCount = 0.0, - double falseCount = 0.0, - double nanCount = 0.0, - bool validate = true); + static std::shared_ptr make(ScalarHistogram scalar, + TypeCounts typeCounts, + double sampleSize, + double trueCount = 0.0, + double falseCount = 0.0, + double nanCount = 0.0, + bool validate = true); /** * Factory method for constructing an array field histogram. All array fields must be * initialized. */ - static std::shared_ptr make(ScalarHistogram scalar, - TypeCounts typeCounts, - ScalarHistogram arrayUnique, - ScalarHistogram arrayMin, - ScalarHistogram arrayMax, - TypeCounts arrayTypeCounts, - double sampleSize, - double emptyArrayCount = 0.0, - double trueCount = 0.0, - double falseCount = 0.0, - double nanCount = 0.0, - bool validate = true); - - // ArrayHistogram is neither copy-constructible nor copy-assignable. - ArrayHistogram(const ArrayHistogram&) = delete; - ArrayHistogram& operator=(const ArrayHistogram&) = delete; + static std::shared_ptr make(ScalarHistogram scalar, + TypeCounts typeCounts, + ScalarHistogram arrayUnique, + ScalarHistogram arrayMin, + ScalarHistogram arrayMax, + TypeCounts arrayTypeCounts, + double sampleSize, + double emptyArrayCount = 0.0, + double trueCount = 0.0, + double falseCount = 0.0, + double nanCount = 0.0, + bool validate = true); + + // CEHistogram is neither copy-constructible nor copy-assignable. + CEHistogram(const CEHistogram&) = delete; + CEHistogram& operator=(const CEHistogram&) = delete; // However, it is move-constructible and move-assignable. 
- ArrayHistogram(ArrayHistogram&&) = default; - ArrayHistogram& operator=(ArrayHistogram&&) = default; - ~ArrayHistogram() = default; + CEHistogram(CEHistogram&&) = default; + CEHistogram& operator=(CEHistogram&&) = default; + ~CEHistogram() = default; std::string toString() const; @@ -156,28 +156,28 @@ class ArrayHistogram { private: // Constructs an empty scalar histogram. - ArrayHistogram(); + CEHistogram(); // Constructor for scalar field histograms. - ArrayHistogram(ScalarHistogram scalar, - TypeCounts typeCounts, - double sampleSize, - double trueCount = 0.0, - double falseCount = 0.0, - double nanCount = 0.0); + CEHistogram(ScalarHistogram scalar, + TypeCounts typeCounts, + double sampleSize, + double trueCount = 0.0, + double falseCount = 0.0, + double nanCount = 0.0); // Constructor for array field histograms. We have to initialize all array fields in this case. - ArrayHistogram(ScalarHistogram scalar, - TypeCounts typeCounts, - ScalarHistogram arrayUnique, - ScalarHistogram arrayMin, - ScalarHistogram arrayMax, - TypeCounts arrayTypeCounts, - double sampleSize, - double emptyArrayCount = 0.0, - double trueCount = 0.0, - double falseCount = 0.0, - double nanCount = 0.0); + CEHistogram(ScalarHistogram scalar, + TypeCounts typeCounts, + ScalarHistogram arrayUnique, + ScalarHistogram arrayMin, + ScalarHistogram arrayMax, + TypeCounts arrayTypeCounts, + double sampleSize, + double emptyArrayCount = 0.0, + double trueCount = 0.0, + double falseCount = 0.0, + double nanCount = 0.0); /* Fields for all paths. */ @@ -212,7 +212,7 @@ class ArrayHistogram { */ BSONObj makeStatistics(double documents, double sampleRate, - std::shared_ptr arrayHistogram); + std::shared_ptr ceHistogram); /** * Returns an owned BSON Object representing data matching mongo::StatsPath IDL. 
@@ -220,5 +220,5 @@ BSONObj makeStatistics(double documents, BSONObj makeStatsPath(StringData path, double documents, double sampleRate, - std::shared_ptr arrayHistogram); + std::shared_ptr ceHistogram); } // namespace mongo::stats diff --git a/src/mongo/db/query/stats/array_histogram_test.cpp b/src/mongo/db/query/stats/ce_histogram_test.cpp similarity index 86% rename from src/mongo/db/query/stats/array_histogram_test.cpp rename to src/mongo/db/query/stats/ce_histogram_test.cpp index 3964522a018e1..dafb5cdb56294 100644 --- a/src/mongo/db/query/stats/array_histogram_test.cpp +++ b/src/mongo/db/query/stats/ce_histogram_test.cpp @@ -43,7 +43,7 @@ #include "mongo/bson/json.h" #include "mongo/bson/timestamp.h" #include "mongo/db/exec/sbe/values/value.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/max_diff.h" #include "mongo/db/query/stats/rand_utils_new.h" #include "mongo/db/query/stats/scalar_histogram.h" @@ -57,7 +57,7 @@ namespace mongo::stats { -TEST(ArrayHistograms, BSONEdgeValues) { +TEST(CEHistograms, BSONEdgeValues) { const std::vector values{ SBEValue{sbe::value::TypeTags::Nothing, {}}, @@ -120,8 +120,8 @@ TEST(ArrayHistograms, BSONEdgeValues) { sbe::value::makeNewObject(), sbe::value::makeNewObjectId(), }; - auto ah = createArrayEstimator(values, ScalarHistogram::kMaxBuckets); - // We are relying on the fact that 'createArrayEstimator()' performs validation of the histogram + auto ceHist = createCEHistogram(values, ScalarHistogram::kMaxBuckets); + // We are relying on the fact that 'createCEHistogram()' performs validation of the histogram // upon construction. 
TypeCounts expectedTypeCounts = { @@ -142,25 +142,25 @@ TEST(ArrayHistograms, BSONEdgeValues) { {sbe::value::TypeTags::Object, 1}, {sbe::value::TypeTags::ObjectId, 1}, }; - ASSERT_EQ(expectedTypeCounts, ah->getTypeCounts()); - ASSERT_EQ(ah->getTrueCount(), 1); - ASSERT_EQ(ah->getFalseCount(), 1); - ASSERT_EQ(ah->getNanCount(), 4); - ASSERT_EQ(ah->getEmptyArrayCount(), 1); + ASSERT_EQ(expectedTypeCounts, ceHist->getTypeCounts()); + ASSERT_EQ(ceHist->getTrueCount(), 1); + ASSERT_EQ(ceHist->getFalseCount(), 1); + ASSERT_EQ(ceHist->getNanCount(), 4); + ASSERT_EQ(ceHist->getEmptyArrayCount(), 1); // Verify that we can build a histogram with the number of buckets equal to the number of // types in the value stream + 1 (numeric, date, timestamp, string, and objectId). - ah = createArrayEstimator(values, 6); + ceHist = createCEHistogram(values, 6); // Ensure we fail to build a histrogram when we have more types than buckets. - ASSERT_THROWS(createArrayEstimator(values, 5), DBException); + ASSERT_THROWS(createCEHistogram(values, 5), DBException); } -TEST(ArrayHistograms, EmptyHistogram) { - auto ah = createArrayEstimator({}, ScalarHistogram::kMaxBuckets); +TEST(CEHistograms, EmptyHistogram) { + auto ceHist = createCEHistogram({}, ScalarHistogram::kMaxBuckets); } -TEST(ArrayHistograms, SingleEntryHistogram) { +TEST(CEHistograms, SingleEntryHistogram) { const Date_t d = dateFromISOString("2015-10-21T07:28:00+0000").getValue(); const std::vector values{ makeInt64Value(42), @@ -172,13 +172,13 @@ TEST(ArrayHistograms, SingleEntryHistogram) { }; for (auto&& v : values) { std::vector singleValVec{sbe::value::copyValue(v.getTag(), v.getValue())}; - auto ah = createArrayEstimator(singleValVec, ScalarHistogram::kMaxBuckets); - ah = createArrayEstimator(singleValVec, 1); + auto ceHist = createCEHistogram(singleValVec, ScalarHistogram::kMaxBuckets); + ceHist = createCEHistogram(singleValVec, 1); } } -TEST(ArrayHistograms, DuplicateValues) { - auto ah = createArrayEstimator( 
+TEST(CEHistograms, DuplicateValues) { + auto ceHist = createCEHistogram( { makeInt64Value(1), makeInt64Value(1), @@ -190,9 +190,9 @@ TEST(ArrayHistograms, DuplicateValues) { 3); } -TEST(ArrayHistograms, SingleEntryInTypeClass) { +TEST(CEHistograms, SingleEntryInTypeClass) { // Single entry at the end - auto ah = createArrayEstimator( + auto ceHist = createCEHistogram( { makeInt64Value(1), makeInt64Value(2), @@ -200,7 +200,7 @@ TEST(ArrayHistograms, SingleEntryInTypeClass) { }, 3); // Single entry at the beginning - ah = createArrayEstimator( + ceHist = createCEHistogram( { makeInt64Value(1), sbe::value::makeNewString("marty"), @@ -208,7 +208,7 @@ TEST(ArrayHistograms, SingleEntryInTypeClass) { }, 3); // Single entry in the middle - ah = createArrayEstimator( + ceHist = createCEHistogram( { makeInt64Value(1), makeInt64Value(2), @@ -219,8 +219,8 @@ TEST(ArrayHistograms, SingleEntryInTypeClass) { 4); } -TEST(ArrayHistograms, LargeAreasWithinTypeClass) { - auto ah = createArrayEstimator( +TEST(CEHistograms, LargeAreasWithinTypeClass) { + auto ceHist = createCEHistogram( { makeInt32Value(std::numeric_limits::min()), makeInt32Value(std::numeric_limits::max()), @@ -234,8 +234,8 @@ TEST(ArrayHistograms, LargeAreasWithinTypeClass) { 2); } -TEST(ArrayHistograms, SmallAreasWithinTypeClass) { - auto ah = createArrayEstimator( +TEST(CEHistograms, SmallAreasWithinTypeClass) { + auto ceHist = createCEHistogram( { sbe::value::makeCopyDecimal(Decimal128::kSmallestNegative), sbe::value::makeCopyDecimal(Decimal128::kNormalizedZero), @@ -244,7 +244,7 @@ TEST(ArrayHistograms, SmallAreasWithinTypeClass) { 2); } -TEST(ArrayHistograms, MixedTypedHistrogram) { +TEST(CEHistograms, MixedTypedHistrogram) { std::mt19937_64 seed(42); MixedDistributionDescriptor uniform{{DistrType::kUniform, 1.0}}; TypeDistrVector td; @@ -261,11 +261,11 @@ TEST(ArrayHistograms, MixedTypedHistrogram) { DatasetDescriptorNew desc{std::move(td), seed}; const std::vector values = desc.genRandomDataset(10'000); 
ASSERT_EQ(10'000, values.size()); - auto ah = createArrayEstimator(values, ScalarHistogram::kMaxBuckets); - ASSERT_EQ(10'000, ah->getScalar().getCardinality()); + auto ceHist = createCEHistogram(values, ScalarHistogram::kMaxBuckets); + ASSERT_EQ(10'000, ceHist->getScalar().getCardinality()); } -TEST(ArrayHistograms, LargeNumberOfScalarValuesBucketRanges) { +TEST(CEHistograms, LargeNumberOfScalarValuesBucketRanges) { std::mt19937_64 seed(42); MixedDistributionDescriptor uniform{{DistrType::kUniform, 1.0}}; TypeDistrVector td; @@ -273,16 +273,16 @@ TEST(ArrayHistograms, LargeNumberOfScalarValuesBucketRanges) { DatasetDescriptorNew desc{std::move(td), seed}; const std::vector values = desc.genRandomDataset(1'000'000); ASSERT_EQ(1'000'000, values.size()); - auto ah = createArrayEstimator(values, ScalarHistogram::kMaxBuckets); + auto ceHist = createCEHistogram(values, ScalarHistogram::kMaxBuckets); - ASSERT_EQ(1'000'000, ah->getScalar().getCardinality()); + ASSERT_EQ(1'000'000, ceHist->getScalar().getCardinality()); // Assert that each bucket has at least one entry. 
- std::for_each(ah->getScalar().getBuckets().begin(), - ah->getScalar().getBuckets().end(), + std::for_each(ceHist->getScalar().getBuckets().begin(), + ceHist->getScalar().getBuckets().end(), [](auto&& bucket) { ASSERT_GTE(bucket._equalFreq + bucket._rangeFreq, 1); }); } -TEST(ArrayHistograms, LargeArraysHistogram) { +TEST(CEHistograms, LargeArraysHistogram) { std::mt19937_64 seed(42); MixedDistributionDescriptor uniform{{DistrType::kUniform, 1.0}}; @@ -299,15 +299,15 @@ TEST(ArrayHistograms, LargeArraysHistogram) { const auto values = arrayDatasetDesc.genRandomDataset(10); ASSERT_EQ(10, values.size()); - auto ah = createArrayEstimator(values, ScalarHistogram::kMaxBuckets); + auto ceHist = createCEHistogram(values, ScalarHistogram::kMaxBuckets); - ASSERT_TRUE(ah->getScalar().empty()); - ASSERT_EQ(100, ah->getArrayUnique().getBuckets().size()); - ASSERT_FALSE(ah->getArrayMin().empty()); - ASSERT_FALSE(ah->getArrayMax().empty()); + ASSERT_TRUE(ceHist->getScalar().empty()); + ASSERT_EQ(100, ceHist->getArrayUnique().getBuckets().size()); + ASSERT_FALSE(ceHist->getArrayMin().empty()); + ASSERT_FALSE(ceHist->getArrayMax().empty()); } -TEST(ArrayHistograms, LargeNumberOfArraysHistogram) { +TEST(CEHistograms, LargeNumberOfArraysHistogram) { std::mt19937_64 seed(42); MixedDistributionDescriptor uniform{{DistrType::kUniform, 1.0}}; @@ -324,12 +324,12 @@ TEST(ArrayHistograms, LargeNumberOfArraysHistogram) { const auto values = arrayDatasetDesc.genRandomDataset(100'000); ASSERT_EQ(100'000, values.size()); - auto ah = createArrayEstimator(values, ScalarHistogram::kMaxBuckets); + auto ceHist = createCEHistogram(values, ScalarHistogram::kMaxBuckets); - ASSERT_TRUE(ah->getScalar().empty()); - ASSERT_EQ(100, ah->getArrayUnique().getBuckets().size()); - ASSERT_EQ(100, ah->getArrayMin().getBuckets().size()); - ASSERT_EQ(100, ah->getArrayMax().getBuckets().size()); + ASSERT_TRUE(ceHist->getScalar().empty()); + ASSERT_EQ(100, ceHist->getArrayUnique().getBuckets().size()); + 
ASSERT_EQ(100, ceHist->getArrayMin().getBuckets().size()); + ASSERT_EQ(100, ceHist->getArrayMax().getBuckets().size()); } std::vector generateValuesVector(std::vector vals) { @@ -350,10 +350,10 @@ void assertBounds(const std::vector& expectedBounds, const ScalarHistogram& ASSERT_EQ(expectedBounds, gotBounds); } -TEST(ArrayHistograms, MaxDiffIntegerBounds) { +TEST(CEHistograms, MaxDiffIntegerBounds) { auto values = generateValuesVector({3, 6, 9}); - auto ah = createArrayEstimator(values, 3); - assertBounds({3, 6, 9}, ah->getScalar()); + auto ceHist = createCEHistogram(values, 3); + assertBounds({3, 6, 9}, ceHist->getScalar()); // Recall that area = (distance to next value - current value) * freqency of current value @@ -364,8 +364,8 @@ TEST(ArrayHistograms, MaxDiffIntegerBounds) { // 9 -> inf // We'd expect the top 3 buckets to be {3, 7, 9}. values = generateValuesVector({3, 6, 7, 9}); - ah = createArrayEstimator(values, 3); - assertBounds({3, 7, 9}, ah->getScalar()); + ceHist = createCEHistogram(values, 3); + assertBounds({3, 7, 9}, ceHist->getScalar()); // Data distribution -> Area // 1 -> inf @@ -375,11 +375,11 @@ TEST(ArrayHistograms, MaxDiffIntegerBounds) { // 10 -> (12-10) * 1 = 2 // 12 -> inf values = generateValuesVector({1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 10, 12}); - ah = createArrayEstimator(values, 3); - assertBounds({1, 3, 12}, ah->getScalar()); + ceHist = createCEHistogram(values, 3); + assertBounds({1, 3, 12}, ceHist->getScalar()); } -TEST(ArrayHistograms, Golden) { +TEST(CEHistograms, Golden) { const std::vector values = { makeInt64Value(3), makeInt64Value(4), @@ -401,7 +401,7 @@ TEST(ArrayHistograms, Golden) { return arr; }()), }; - auto ah = createArrayEstimator(values, 8, stats::SortArg::kArea); + auto ceHist = createCEHistogram(values, 8, stats::SortArg::kArea); auto expected = fromjson(R"( { trueCount: 0.0, @@ -448,9 +448,9 @@ TEST(ArrayHistograms, Golden) { typeCount: [ { typeName: "NumberInt64", count: 1.0 } ] } })"); - 
ASSERT_BSONOBJ_EQ(expected, ah->serialize()); + ASSERT_BSONOBJ_EQ(expected, ceHist->serialize()); - auto ahAreaDiff = createArrayEstimator(values, 8); + auto ceHistAreaDiff = createCEHistogram(values, 8); auto expectedAreaDiff = fromjson(R"( { trueCount: 0.0, @@ -497,7 +497,7 @@ TEST(ArrayHistograms, Golden) { typeCount: [ { typeName: "NumberInt64", count: 1.0 } ] } })"); - ASSERT_BSONOBJ_EQ(expectedAreaDiff, ahAreaDiff->serialize()); + ASSERT_BSONOBJ_EQ(expectedAreaDiff, ceHistAreaDiff->serialize()); } } // namespace mongo::stats diff --git a/src/mongo/db/query/stats/collection_statistics.h b/src/mongo/db/query/stats/collection_statistics.h index c7f4efdd1b061..abd4e7451dc39 100644 --- a/src/mongo/db/query/stats/collection_statistics.h +++ b/src/mongo/db/query/stats/collection_statistics.h @@ -30,11 +30,11 @@ #pragma once #include "mongo/db/namespace_string.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" namespace mongo::stats { -using Histograms = std::map>; +using Histograms = std::map>; class CollectionStatistics { public: @@ -46,13 +46,13 @@ class CollectionStatistics { /** * Returns the histogram for the given field path, or nullptr if none exists. */ - virtual const ArrayHistogram* getHistogram(const std::string& path) const = 0; + virtual const CEHistogram* getHistogram(const std::string& path) const = 0; /** * Adds a histogram along the given path. 
*/ virtual void addHistogram(const std::string& path, - std::shared_ptr histogram) const = 0; + std::shared_ptr histogram) const = 0; virtual ~CollectionStatistics() = default; }; diff --git a/src/mongo/db/query/stats/collection_statistics_impl.cpp b/src/mongo/db/query/stats/collection_statistics_impl.cpp index 27644242ddac7..ed53fe162d9b7 100644 --- a/src/mongo/db/query/stats/collection_statistics_impl.cpp +++ b/src/mongo/db/query/stats/collection_statistics_impl.cpp @@ -49,11 +49,11 @@ double CollectionStatisticsImpl::getCardinality() const { } void CollectionStatisticsImpl::addHistogram(const std::string& path, - std::shared_ptr histogram) const { + std::shared_ptr histogram) const { _histograms[path] = histogram; } -const ArrayHistogram* CollectionStatisticsImpl::getHistogram(const std::string& path) const { +const CEHistogram* CollectionStatisticsImpl::getHistogram(const std::string& path) const { if (auto mapIt = _histograms.find(path); mapIt != _histograms.end()) { return mapIt->second.get(); } else { diff --git a/src/mongo/db/query/stats/collection_statistics_impl.h b/src/mongo/db/query/stats/collection_statistics_impl.h index 10d722dec717b..da17540be6564 100644 --- a/src/mongo/db/query/stats/collection_statistics_impl.h +++ b/src/mongo/db/query/stats/collection_statistics_impl.h @@ -34,12 +34,12 @@ #include #include "mongo/db/namespace_string.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/collection_statistics.h" namespace mongo::stats { -using Histograms = std::map>; +using Histograms = std::map>; class CollectionStatisticsImpl : public CollectionStatistics { public: @@ -53,13 +53,13 @@ class CollectionStatisticsImpl : public CollectionStatistics { /** * Returns the histogram for the given field path, or nullptr if none exists. 
*/ - const ArrayHistogram* getHistogram(const std::string& path) const override; + const CEHistogram* getHistogram(const std::string& path) const override; /** * Adds a histogram along the given path. */ void addHistogram(const std::string& path, - std::shared_ptr histogram) const override; + std::shared_ptr histogram) const override; ~CollectionStatisticsImpl() override = default; diff --git a/src/mongo/db/query/stats/collection_statistics_mock.cpp b/src/mongo/db/query/stats/collection_statistics_mock.cpp index c3a6cd5770513..d31a5c5d2d9ae 100644 --- a/src/mongo/db/query/stats/collection_statistics_mock.cpp +++ b/src/mongo/db/query/stats/collection_statistics_mock.cpp @@ -42,11 +42,11 @@ double CollectionStatisticsMock::getCardinality() const { } void CollectionStatisticsMock::addHistogram(const std::string& path, - std::shared_ptr histogram) const { + std::shared_ptr histogram) const { _histograms[path] = histogram; } -const ArrayHistogram* CollectionStatisticsMock::getHistogram(const std::string& path) const { +const CEHistogram* CollectionStatisticsMock::getHistogram(const std::string& path) const { if (auto mapIt = _histograms.find(path); mapIt != _histograms.end()) { return mapIt->second.get(); } diff --git a/src/mongo/db/query/stats/collection_statistics_mock.h b/src/mongo/db/query/stats/collection_statistics_mock.h index e2503058baee1..e45626e48f000 100644 --- a/src/mongo/db/query/stats/collection_statistics_mock.h +++ b/src/mongo/db/query/stats/collection_statistics_mock.h @@ -33,7 +33,7 @@ #include #include "mongo/db/namespace_string.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/collection_statistics.h" namespace mongo::stats { @@ -51,12 +51,12 @@ class CollectionStatisticsMock : public CollectionStatistics { * Adds a histogram along the given path. 
*/ void addHistogram(const std::string& path, - std::shared_ptr histogram) const override; + std::shared_ptr histogram) const override; /** * Returns the histogram for the given field path, or nullptr if none exists. */ - const ArrayHistogram* getHistogram(const std::string& path) const override; + const CEHistogram* getHistogram(const std::string& path) const override; ~CollectionStatisticsMock() override = default; diff --git a/src/mongo/db/query/stats/max_diff.cpp b/src/mongo/db/query/stats/max_diff.cpp index 60f641a8861f0..d6c7da98c51cb 100644 --- a/src/mongo/db/query/stats/max_diff.cpp +++ b/src/mongo/db/query/stats/max_diff.cpp @@ -409,9 +409,9 @@ ScalarHistogram genMaxDiffHistogram(const DataDistribution& dataDistrib, return ScalarHistogram::make(std::move(bounds), std::move(buckets)); } -std::shared_ptr createArrayEstimator(const std::vector& arrayData, - size_t nBuckets, - SortArg sortArg) { +std::shared_ptr createCEHistogram(const std::vector& arrayData, + size_t nBuckets, + SortArg sortArg) { uassert(7120500, "A histogram must have at least one bucket.", nBuckets > 0); // Values that will be used as inputs to histogram generation code. @@ -529,25 +529,25 @@ std::shared_ptr createArrayEstimator(const std::vector #include "mongo/db/exec/sbe/values/value.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/scalar_histogram.h" #include "mongo/db/query/stats/value_utils.h" @@ -130,8 +130,8 @@ ScalarHistogram genMaxDiffHistogram(const DataDistribution& dataDistribution, * Given a vector containing SBEValues, generate a set of statistics to summarize the supplied * data. Histograms will use the supplied number of buckets. 
*/ -std::shared_ptr createArrayEstimator(const std::vector& arrayData, - size_t numBuckets, - SortArg sortArg = SortArg::kAreaDiff); +std::shared_ptr createCEHistogram(const std::vector& arrayData, + size_t numBuckets, + SortArg sortArg = SortArg::kAreaDiff); } // namespace mongo::stats diff --git a/src/mongo/db/query/stats/maxdiff_test_utils.cpp b/src/mongo/db/query/stats/maxdiff_test_utils.cpp index f676636a5722f..126229b59021b 100644 --- a/src/mongo/db/query/stats/maxdiff_test_utils.cpp +++ b/src/mongo/db/query/stats/maxdiff_test_utils.cpp @@ -46,7 +46,7 @@ #include "mongo/db/query/ce/histogram_common.h" #include "mongo/db/query/plan_executor.h" #include "mongo/db/query/plan_executor_factory.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/max_diff.h" #include "mongo/stdx/unordered_map.h" #include "mongo/unittest/assert.h" @@ -167,7 +167,7 @@ std::string printValueArray(const std::vector& values) { return strStream.str(); } -std::string plotArrayEstimator(const ArrayHistogram& estimator, const std::string& header) { +std::string plotArrayEstimator(const CEHistogram& estimator, const std::string& header) { std::ostringstream os; os << header << "\n"; if (!estimator.getScalar().empty()) { diff --git a/src/mongo/db/query/stats/maxdiff_test_utils.h b/src/mongo/db/query/stats/maxdiff_test_utils.h index 961aa38b75a67..c00bbce7bc9d9 100644 --- a/src/mongo/db/query/stats/maxdiff_test_utils.h +++ b/src/mongo/db/query/stats/maxdiff_test_utils.h @@ -38,7 +38,7 @@ #include "mongo/db/query/ce/histogram_common.h" #include "mongo/db/query/optimizer/node.h" #include "mongo/db/query/optimizer/syntax/syntax.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/scalar_histogram.h" #include "mongo/db/query/stats/value_utils.h" @@ -70,8 +70,8 @@ ScalarHistogram makeHistogram(std::vector& randData, size_t nBuckets); 
std::string printValueArray(const std::vector& values); /** - Plot a set of statistics as stored in ArrayHistogram. + Plot a set of statistics as stored in CEHistogram. */ -std::string plotArrayEstimator(const ArrayHistogram& estimator, const std::string& header); +std::string plotArrayEstimator(const CEHistogram& estimator, const std::string& header); } // namespace mongo::stats diff --git a/src/mongo/db/query/stats/stats_cache_loader.h b/src/mongo/db/query/stats/stats_cache_loader.h index 77f86cfca6f77..b393e7d1e111d 100644 --- a/src/mongo/db/query/stats/stats_cache_loader.h +++ b/src/mongo/db/query/stats/stats_cache_loader.h @@ -38,13 +38,13 @@ #include "mongo/base/string_data.h" #include "mongo/db/namespace_string.h" #include "mongo/db/operation_context.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/stdx/thread.h" #include "mongo/util/future.h" namespace mongo::stats { using StatsPathString = std::pair; -using StatsCacheVal = std::shared_ptr; +using StatsCacheVal = std::shared_ptr; class StatsCacheLoader { public: diff --git a/src/mongo/db/query/stats/stats_cache_loader_impl.cpp b/src/mongo/db/query/stats/stats_cache_loader_impl.cpp index a8065a09fe565..3e7ef8d9d0bf7 100644 --- a/src/mongo/db/query/stats/stats_cache_loader_impl.cpp +++ b/src/mongo/db/query/stats/stats_cache_loader_impl.cpp @@ -44,7 +44,7 @@ #include "mongo/db/dbdirectclient.h" #include "mongo/db/namespace_string.h" #include "mongo/db/query/find_command.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/stats_gen.h" #include "mongo/idl/idl_parser.h" #include "mongo/logv2/log.h" @@ -84,7 +84,7 @@ SemiFuture StatsCacheLoaderImpl::getStats(OperationContext* opCtx IDLParserContext ctx("StatsPath"); BSONObj document = cursor->nextSafe().getOwned(); auto parsedStats = StatsPath::parse(ctx, document); - StatsCacheVal 
statsPtr(ArrayHistogram::make(parsedStats.getStatistics())); + StatsCacheVal statsPtr(CEHistogram::make(parsedStats.getStatistics())); return makeReadyFutureWith([this, statsPtr] { return statsPtr; }).semi(); } diff --git a/src/mongo/db/query/stats/stats_cache_loader_test.cpp b/src/mongo/db/query/stats/stats_cache_loader_test.cpp index 296ee461d4318..63a73b9e6de09 100644 --- a/src/mongo/db/query/stats/stats_cache_loader_test.cpp +++ b/src/mongo/db/query/stats/stats_cache_loader_test.cpp @@ -43,7 +43,7 @@ #include "mongo/db/concurrency/lock_manager_defs.h" #include "mongo/db/exec/sbe/values/value.h" #include "mongo/db/namespace_string.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/max_diff.h" #include "mongo/db/query/stats/scalar_histogram.h" #include "mongo/db/query/stats/stats_cache_loader.h" @@ -100,14 +100,14 @@ TEST_F(StatsCacheLoaderTest, VerifyStatsLoadsScalar) { {sbe::value::TypeTags::NumberDouble, doubleCount}, {sbe::value::TypeTags::Boolean, trueCount + falseCount}, }; - auto ah = ArrayHistogram::make( + auto ceHist = CEHistogram::make( ScalarHistogram::make(*bounds, buckets), tc, numDocs, trueCount, falseCount); - auto expectedSerialized = ah->serialize(); + auto expectedSerialized = ceHist->serialize(); // Serialize histogram into a stats path. std::string path = "somePath"; constexpr double sampleRate = 1.0; - auto serialized = stats::makeStatsPath(path, numDocs, sampleRate, ah); + auto serialized = stats::makeStatsPath(path, numDocs, sampleRate, ceHist); // Initalize stats collection. NamespaceString nss = NamespaceString::createNamespaceString_forTest("test", "stats"); @@ -146,7 +146,7 @@ TEST_F(StatsCacheLoaderTest, VerifyStatsLoadsArray) { auto emptyArray2Val = sbe::value::makeNewArray().second; auto emptyArray3Val = sbe::value::makeNewArray().second; - // Create a small ArrayHistogram with boolean & empty array counts using maxdiff. 
+ // Create a small CEHistogram with boolean & empty array counts using maxdiff. const std::vector values{ // Scalar doubles: 1, 2, 3. SBEValue{sbe::value::TypeTags::NumberDouble, sbe::value::bitcastFrom(1.0)}, @@ -166,18 +166,18 @@ TEST_F(StatsCacheLoaderTest, VerifyStatsLoadsArray) { // A non-empty array. SBEValue{sbe::value::TypeTags::Array, nonEmptyArrayVal}, }; - auto ah = createArrayEstimator(values, numDocs); - auto expectedSerialized = ah->serialize(); + auto ceHist = createCEHistogram(values, numDocs); + auto expectedSerialized = ceHist->serialize(); // Sanity check counters. - ASSERT_EQ(ah->getTrueCount(), 2.0); - ASSERT_EQ(ah->getFalseCount(), 4.0); - ASSERT_EQ(ah->getEmptyArrayCount(), 3.0); + ASSERT_EQ(ceHist->getTrueCount(), 2.0); + ASSERT_EQ(ceHist->getFalseCount(), 4.0); + ASSERT_EQ(ceHist->getEmptyArrayCount(), 3.0); // Serialize histogram into a stats path. std::string path = "somePath"; constexpr double sampleRate = 1.0; - auto serialized = stats::makeStatsPath(path, numDocs, sampleRate, ah); + auto serialized = stats::makeStatsPath(path, numDocs, sampleRate, ceHist); // Initalize stats collection. 
NamespaceString nss = NamespaceString::createNamespaceString_forTest("test", "stats"); diff --git a/src/mongo/db/query/stats/stats_cache_test.cpp b/src/mongo/db/query/stats/stats_cache_test.cpp index 4508913ce447c..fade92412fd68 100644 --- a/src/mongo/db/query/stats/stats_cache_test.cpp +++ b/src/mongo/db/query/stats/stats_cache_test.cpp @@ -39,7 +39,7 @@ #include "mongo/base/status.h" #include "mongo/base/string_data.h" #include "mongo/db/operation_context.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/stats_cache.h" #include "mongo/db/query/stats/stats_cache_loader_mock.h" #include "mongo/db/service_context_test_fixture.h" @@ -91,7 +91,7 @@ class StatsCacheTest : public ServiceContextTest { }; TEST(StatsCacheTest, StandaloneValueHandle) { - StatsCacheVal statsPtr(ArrayHistogram::make()); + StatsCacheVal statsPtr(CEHistogram::make()); StatsCache::ValueHandle standaloneHandle(std::move(statsPtr)); ASSERT(standaloneHandle.isValid()); } diff --git a/src/mongo/db/query/stats/stats_catalog.cpp b/src/mongo/db/query/stats/stats_catalog.cpp index 149b42733d83b..7da3197237edf 100644 --- a/src/mongo/db/query/stats/stats_catalog.cpp +++ b/src/mongo/db/query/stats/stats_catalog.cpp @@ -37,7 +37,7 @@ #include #include "mongo/base/error_codes.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/stats_cache.h" #include "mongo/util/assert_util.h" #include "mongo/util/decorable.h" @@ -90,7 +90,7 @@ StatsCatalog& StatsCatalog::get(OperationContext* opCtx) { return get(opCtx->getServiceContext()); } -StatusWith> StatsCatalog::getHistogram( +StatusWith> StatsCatalog::getHistogram( OperationContext* opCtx, const NamespaceString& nss, const std::string& path) { try { auto handle = _statsCache.acquire(opCtx, std::make_pair(nss, path)); diff --git a/src/mongo/db/query/stats/stats_catalog.h 
b/src/mongo/db/query/stats/stats_catalog.h index b27fd3ae5a0d5..abb27bae2ed89 100644 --- a/src/mongo/db/query/stats/stats_catalog.h +++ b/src/mongo/db/query/stats/stats_catalog.h @@ -37,7 +37,7 @@ #include "mongo/base/string_data.h" #include "mongo/db/namespace_string.h" #include "mongo/db/operation_context.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/collection_statistics.h" #include "mongo/db/query/stats/stats_cache.h" #include "mongo/db/query/stats/stats_cache_loader.h" @@ -68,9 +68,9 @@ class StatsCatalog { ~StatsCatalog(); - StatusWith> getHistogram(OperationContext* opCtx, - const NamespaceString& nss, - const std::string& path); + StatusWith> getHistogram(OperationContext* opCtx, + const NamespaceString& nss, + const std::string& path); Status invalidatePath(const NamespaceString& nss, const std::string& path); diff --git a/src/mongo/db/query/stats/stats_path_test.cpp b/src/mongo/db/query/stats/stats_path_test.cpp index 5dbd1dde4fad1..e82feccd8bbee 100644 --- a/src/mongo/db/query/stats/stats_path_test.cpp +++ b/src/mongo/db/query/stats/stats_path_test.cpp @@ -32,7 +32,7 @@ #include "mongo/base/string_data.h" #include "mongo/db/exec/sbe/values/value.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/scalar_histogram.h" #include "mongo/db/query/stats/stats_gen.h" #include "mongo/idl/idl_parser.h" @@ -86,11 +86,11 @@ TEST(StatsPath, BasicValidStatsPath) { {sbe::value::TypeTags::Boolean, trueCount + falseCount}, }; const auto sh = ScalarHistogram::make(*bounds, buckets); - auto ah = ArrayHistogram::make(std::move(sh), tc, numDocs, trueCount, falseCount); + auto cehist = CEHistogram::make(std::move(sh), tc, numDocs, trueCount, falseCount); // Serialize to BSON. 
constexpr double sampleRate = 1.0; - auto serializedPath = stats::makeStatsPath("somePath", numDocs, sampleRate, ah); + auto serializedPath = stats::makeStatsPath("somePath", numDocs, sampleRate, cehist); // Parse StatsPath via IDL & serialize to BSON. auto parsedPath = StatsPath::parse(ctx, serializedPath); @@ -109,11 +109,11 @@ TEST(StatsPath, BasicValidEmptyStatsPath) { std::vector buckets; // Create an empty scalar histogram. - auto ah = ArrayHistogram::make(ScalarHistogram::make(), TypeCounts{}, 0.0 /* sampleSize */); + auto cehist = CEHistogram::make(ScalarHistogram::make(), TypeCounts{}, 0.0 /* sampleSize */); // Serialize to BSON. constexpr double sampleRate = 1.0; - auto serializedPath = stats::makeStatsPath("someEmptyPath", numDocs, sampleRate, ah); + auto serializedPath = stats::makeStatsPath("someEmptyPath", numDocs, sampleRate, cehist); // Parse StatsPath via IDL & serialize to BSON. auto parsedPath = StatsPath::parse(ctx, serializedPath); diff --git a/src/mongo/db/query/stats/type_collision_test.cpp b/src/mongo/db/query/stats/type_collision_test.cpp index 64e870f3032d5..f7d0d419ffd05 100644 --- a/src/mongo/db/query/stats/type_collision_test.cpp +++ b/src/mongo/db/query/stats/type_collision_test.cpp @@ -38,7 +38,7 @@ #include "mongo/db/exec/document_value/value.h" #include "mongo/db/exec/docval_to_sbeval.h" #include "mongo/db/exec/sbe/values/value.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/db/query/stats/max_diff.h" #include "mongo/db/query/stats/value_utils.h" #include "mongo/platform/decimal128.h" @@ -76,15 +76,15 @@ TEST(TypeCollisionTest, ZeroedCollidingTypesHistogram) { // We should always fail to build a histogram on 0 buckets. auto i = 0; - ASSERT_THROWS(createArrayEstimator(data, 0), DBException); + ASSERT_THROWS(createCEHistogram(data, 0), DBException); // We should always fail to build a histogram if we have fewer buckets than type classes. 
for (i = 1; i < 5; i++) { - ASSERT_THROWS(createArrayEstimator(data, i), DBException); + ASSERT_THROWS(createCEHistogram(data, i), DBException); } // With sufficient buckets, we should build a histogram with one bucket per type class. - auto ah = createArrayEstimator(data, i); + auto ceHist = createCEHistogram(data, i); auto expected = fromjson( "{ \ trueCount: 0.0, \ @@ -142,6 +142,6 @@ TEST(TypeCollisionTest, ZeroedCollidingTypesHistogram) { bounds: [0.0, \"\", ObjectId('000000000000000000000000'), new Date(0), Timestamp(0, 0)]\ } \ }"); - ASSERT_BSONOBJ_EQ(expected, ah->serialize()); + ASSERT_BSONOBJ_EQ(expected, ceHist->serialize()); } } // namespace mongo::stats diff --git a/src/mongo/db/query/stats/type_count_test.cpp b/src/mongo/db/query/stats/type_count_test.cpp index 87a34873b5650..1ca9a70f8146c 100644 --- a/src/mongo/db/query/stats/type_count_test.cpp +++ b/src/mongo/db/query/stats/type_count_test.cpp @@ -31,7 +31,7 @@ #include "mongo/base/string_data.h" #include "mongo/db/exec/sbe/values/value.h" -#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/ce_histogram.h" #include "mongo/unittest/assert.h" #include "mongo/unittest/framework.h"