Skip to content

Commit

Permalink
Fix NaN handling for histogram, map_agg and map_union aggregates (fac…
Browse files Browse the repository at this point in the history
…ebookincubator#10071)

Summary:
Pull Request resolved: facebookincubator#10071

Ensures that NaNs with different binary representations are considered
equal and deduplicated when used as keys in a map.

Summary of changes:
For primitive types, we ensure that the Map data structure used to
accumulate uses hash and equality functions that handle NaN.
For complex input it uses BaseVector::hashValueAt() and
ContainerRowSerde::compare() as a hash and equality function,
respectively which were fixed to handle NaNs properly in facebookincubator#9963

Reviewed By: kgpai

Differential Revision: D58209683

fbshipit-source-id: f8697f248a0346456597ca71105eb5bcf1b71f8b
  • Loading branch information
Bikramjeet Vig authored and facebook-github-bot committed Jun 17, 2024
1 parent e452936 commit 0b5bf0f
Show file tree
Hide file tree
Showing 5 changed files with 173 additions and 4 deletions.
6 changes: 2 additions & 4 deletions velox/functions/prestosql/aggregates/HistogramAggregate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,10 @@ namespace {

template <typename T>
struct Accumulator {
using ValueMap = folly::F14FastMap<
using ValueMap = typename util::floating_point::HashMapNaNAwareTypeTraits<
T,
int64_t,
std::hash<T>,
std::equal_to<T>,
AlignedStlAllocator<std::pair<const T, int64_t>, 16>>;
AlignedStlAllocator<std::pair<const T, int64_t>, 16>>::Type;
ValueMap values;

explicit Accumulator(HashStringAllocator* allocator)
Expand Down
16 changes: 16 additions & 0 deletions velox/functions/prestosql/aggregates/MapAccumulator.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,22 @@ struct MapAccumulatorTypeTraits {
using AccumulatorType = MapAccumulator<T>;
};

template <>
struct MapAccumulatorTypeTraits<float> {
using AccumulatorType = MapAccumulator<
float,
util::floating_point::NaNAwareHash<float>,
util::floating_point::NaNAwareEquals<float>>;
};

template <>
struct MapAccumulatorTypeTraits<double> {
using AccumulatorType = MapAccumulator<
double,
util::floating_point::NaNAwareHash<double>,
util::floating_point::NaNAwareEquals<double>>;
};

template <>
struct MapAccumulatorTypeTraits<StringView> {
using AccumulatorType = StringViewMapAccumulator;
Expand Down
14 changes: 14 additions & 0 deletions velox/functions/prestosql/aggregates/tests/HistogramTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,5 +305,19 @@ TEST_F(HistogramTest, globalString) {
testGlobalHistogramWithDuck(data);
}

TEST_F(HistogramTest, globalNaNs) {
// Verify that NaNs with different binary representations are considered equal
// and deduplicated.
static const auto kNaN = std::numeric_limits<double>::quiet_NaN();
static const auto kSNaN = std::numeric_limits<double>::signaling_NaN();
auto vector = makeFlatVector<double>({1, kNaN, kSNaN, 2, 3, kNaN, kSNaN, 3});

auto expected = makeRowVector({makeMapVectorFromJson<double, int64_t>({
"{1: 1, 2: 1, 3: 2, NaN: 4}",
})});

testHistogram("histogram(c1)", {}, vector, vector, expected);
}

} // namespace
} // namespace facebook::velox::aggregate::test
69 changes: 69 additions & 0 deletions velox/functions/prestosql/aggregates/tests/MapAggTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -482,5 +482,74 @@ TEST_F(MapAggTest, rowCheckNull) {
"ROW comparison not supported for values that contain nulls");
}

TEST_F(MapAggTest, nans) {
// Verify that NaNs with different binary representations are considered equal
// and deduplicated when used as keys in the output map.
static const auto kNaN = std::numeric_limits<double>::quiet_NaN();
static const auto kSNaN = std::numeric_limits<double>::signaling_NaN();

// Global Aggregation, Primitive type
auto data = makeRowVector(
{makeFlatVector<double>({1, kNaN, kSNaN, 2, 3, kNaN, kSNaN, 3}),
makeFlatVector<int32_t>({1, 2, 3, 4, 5, 6, 7, 8}),
makeFlatVector<int32_t>({1, 1, 1, 1, 2, 2, 2, 2})});

auto expectedResult = makeRowVector({
makeMapVectorFromJson<double, int32_t>({
"{ 1: 1, 2: 4, 3: 5, NaN: 2}",
}),
});

testAggregations({data}, {}, {"map_agg(c0, c1)"}, {expectedResult});

// Group by Aggregation, Primitive type
expectedResult = makeRowVector(
{makeMapVectorFromJson<double, int32_t>(
{"{ 1: 1, 2: 4, NaN: 2}", "{ 3: 5, NaN: 6}"}),
makeFlatVector<int32_t>({1, 2})});

testAggregations(
{data}, {"c2"}, {"map_agg(c0, c1)"}, {"a0", "c2"}, {expectedResult});

// Global Aggregation, Complex type(Row)
// The complex input values that are to be used as keys are {1, 1}, {NaN, 2},
// {NaN, 2}, {2, 4}, {3, 5}, {NaN, 2}, {NaN, 2}, {3, 5}
data = makeRowVector(
{makeRowVector(
{makeFlatVector<double>({1, kNaN, kSNaN, 2, 3, kNaN, kSNaN, 3}),
makeFlatVector<int32_t>({1, 2, 2, 4, 5, 2, 2, 5})}),
makeFlatVector<int32_t>({1, 2, 3, 4, 5, 6, 7, 8}),
makeFlatVector<int32_t>({1, 1, 1, 1, 2, 2, 2, 2})});

// The expected result is
// [{"key":[1,1],"value":1},{"key":[2,4],"value":4},{"key":[3,5],"value":5},
// {"key":["NaN",2],"value":2}]
expectedResult = makeRowVector({makeMapVector(
{0},
makeRowVector(
{makeFlatVector<double>({1, 2, 3, kNaN}),
makeFlatVector<int32_t>({1, 4, 5, 2})}),
makeFlatVector<int32_t>({1, 4, 5, 2}))});

testAggregations({data}, {}, {"map_agg(c0, c1)"}, {expectedResult});

// Group by Aggregation, Complex type(Row)
// The expected result is
// [{"key":[1,1],"value":1},{"key":[2,4],"value":4},
// {"key":["NaN",2],"value":2}] | 1
// [{"key":[3,5],"value":5},{"key":["NaN",2],"value":6}] | 2
expectedResult = makeRowVector(
{makeMapVector(
{0, 3},
makeRowVector(
{makeFlatVector<double>({1, 2, kNaN, 3, kNaN}),
makeFlatVector<int32_t>({1, 4, 2, 5, 2})}),
makeFlatVector<int32_t>({1, 4, 2, 5, 6})),
makeFlatVector<int32_t>({1, 2})});

testAggregations(
{data}, {"c2"}, {"map_agg(c0, c1)"}, {"a0", "c2"}, {expectedResult});
}

} // namespace
} // namespace facebook::velox::aggregate::test
Original file line number Diff line number Diff line change
Expand Up @@ -345,5 +345,77 @@ TEST_F(MapUnionTest, unknownKeysAndValues) {
AssertQueryBuilder(plan).copyResults(pool()), "map key cannot be null");
}

TEST_F(MapUnionTest, nans) {
// Verify that NaNs with different binary representations are considered equal
// and deduplicated when used as keys in the output map.
static const auto kNaN = std::numeric_limits<double>::quiet_NaN();
static const auto kSNaN = std::numeric_limits<double>::signaling_NaN();

// Global Aggregation, Primitive type
auto data = makeRowVector(
{makeMapVectorFromJson<double, int32_t>(
{"{1: 1}", "{2: 2}", "{NaN: 4}", "{3: 3}", "{NaN: 5}", "{NaN: 6}"}),
makeFlatVector<int32_t>({1, 1, 1, 2, 2, 2})});

auto expectedResult = makeRowVector({
makeMapVectorFromJson<double, int32_t>({
"{ 1: 1, 2: 2, 3: 3, NaN: 4}",
}),
});

testAggregations({data}, {}, {"map_union(c0)"}, {expectedResult});

// Group by Aggregation, Primitive type
expectedResult = makeRowVector(
{makeMapVectorFromJson<double, int32_t>(
{"{ 1: 1, 2: 2, NaN: 4}", "{ 3: 3, NaN: 5}"}),
makeFlatVector<int32_t>({1, 2})});

testAggregations(
{data}, {"c1"}, {"map_union(c0)"}, {"a0", "c1"}, {expectedResult});

// Global Aggregation, Complex type(Row)
// The complex input values are:
// [{"key":[1,1],"value":1},{"key":["NaN",2],"value":2},{"key":[2,4],"value":3},{"key":[3,5],"value":4},
// {"key":["NaN",2],"value":5}, {"key":["NaN",2],"value":5}]
data = makeRowVector(
{makeMapVector(
{0, 1, 2, 3, 4, 5},
makeRowVector(
{makeFlatVector<double>({1, kSNaN, 2, 3, kNaN, kSNaN}),
makeFlatVector<int32_t>({1, 2, 4, 5, 2, 2})}),
makeFlatVector<int32_t>({1, 2, 3, 4, 5, 6})),
makeFlatVector<int32_t>({1, 1, 1, 2, 2, 2})});

// The expected result is
// [{"key":[1,1],"value":1},{"key":[2,4],"value":3},{"key":[3,5],"value":4},
// {"key":["NaN",2],"value":2}]
expectedResult = makeRowVector({makeMapVector(
{0},
makeRowVector(
{makeFlatVector<double>({1, 2, 3, kNaN}),
makeFlatVector<int32_t>({1, 4, 5, 2})}),
makeFlatVector<int32_t>({1, 3, 4, 2}))});

testAggregations({data}, {}, {"map_union(c0)"}, {expectedResult});

// Group by Aggregation, Complex type(Row)
// The expected result is
// [{"key":[1,1],"value":1},{"key":[2,4],"value":3},
// {"key":["NaN",2],"value":2}] | 1
// [{"key":[3,5],"value":4},{"key":["NaN",2],"value":5}] | 2
expectedResult = makeRowVector(
{makeMapVector(
{0, 3},
makeRowVector(
{makeFlatVector<double>({1, 2, kNaN, 3, kNaN}),
makeFlatVector<int32_t>({1, 4, 2, 5, 2})}),
makeFlatVector<int32_t>({1, 3, 2, 4, 5})),
makeFlatVector<int32_t>({1, 2})});

testAggregations(
{data}, {"c1"}, {"map_union(c0)"}, {"a0", "c1"}, {expectedResult});
}

} // namespace
} // namespace facebook::velox::aggregate::test

0 comments on commit 0b5bf0f

Please sign in to comment.